In [322]:
import os
import json
import boto3
import pickle
import requests
import implicit
import numpy as np
import pandas as pd

from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

In [324]:
def precision(actual, predicted):
    """Return the share of ``predicted`` items that also occur in ``actual``.

    The numerator counts *distinct* common items (set intersection) while the
    denominator is ``len(predicted)``, so repeated predictions lower the score.

    Args:
        actual: iterable of relevant items.
        predicted: sized iterable of recommended items.

    Returns:
        A float between 0 and 1. An empty ``predicted`` yields ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    if len(predicted) == 0:
        return 0.0
    hits = set(actual) & set(predicted)
    return len(hits) / len(predicted)

def mean_precision(scores, n):
    """Average a collection of precision scores over ``n`` users.

    Args:
        scores: sequence of per-user precision values.
        n: number of users to divide by.

    Returns:
        ``sum(scores) / n``. When ``n`` is zero the result is ``0.0``; without
        this guard NumPy's division by zero produces nan/inf, which is not a
        usable score for later comparison.
    """
    if n == 0:
        return 0.0
    return np.array(scores).sum() / n

In [280]:
def download_file(bucket, filename):
    """Fetch an object from the S3-compatible storage.

    A new client is created on every call; no credentials are passed
    explicitly here.

    Args:
        bucket: name of the bucket.
        filename: key of the object inside the bucket.

    Returns:
        The ``get_object`` response. Callers in this notebook read its
        ``'Body'`` entry.
    """
    client = boto3.client(
        endpoint_url='https://s3.storage.selcloud.ru',
        service_name='s3',
    )
    response = client.get_object(Key=filename, Bucket=bucket)
    return response

def upload_file(bucket, filename, filepath):
    """Upload a local file to the S3-compatible storage.

    Note the argument roles: ``filename`` is the path of the local file to
    read, and ``filepath`` is the key the object gets inside the bucket.

    Args:
        bucket: name of the destination bucket.
        filename: local path of the file to upload.
        filepath: destination key in the bucket.
    """
    client = boto3.client(
        endpoint_url='https://s3.storage.selcloud.ru',
        service_name='s3',
    )
    client.upload_file(Filename=filename, Bucket=bucket, Key=filepath)

def get_file_last_update_date(bucket, filename):
    """Return the ``LastModified`` timestamp of an object in the storage.

    Uses ``head_object`` so only the object's metadata is requested. The
    previous ``get_object`` call also opened a streaming body that was never
    read or closed.

    Args:
        bucket: name of the bucket.
        filename: key of the object inside the bucket.

    Returns:
        The value of the response's ``'LastModified'`` field.
    """
    s3 = boto3.client(
        service_name='s3',
        endpoint_url='https://s3.storage.selcloud.ru'
    )
    return s3.head_object(Bucket=bucket, Key=filename)['LastModified']

In [334]:
def get_json(bucket, name):
    """Download the object ``name`` from ``bucket`` and parse it as JSON.

    Returns:
        The decoded JSON document.
    """
    response = download_file(bucket, name)
    body = response['Body']
    return json.load(body)

def set_json(data, bucket, name):
    """Serialize ``data`` as JSON and store it in ``bucket`` under ``name``.

    The document is first written to a local file called ``name`` (relative
    to the working directory) and then uploaded under the same key. The local
    copy is not removed afterwards.
    """
    with open(name, 'w') as handle:
        json.dump(data, fp=handle)
    upload_file(bucket, filename=name, filepath=name)

In [249]:
def get_csv(bucket, filename):
    """Download the object ``filename`` from ``bucket`` and read it as CSV.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        options.
    """
    response = download_file(bucket, filename)
    body = response['Body']
    return pd.read_csv(body)

In [250]:
def get_pickle(bucket, filename):
    """Download the object ``filename`` from ``bucket`` and unpickle it.

    Returns:
        The Python object stored in the pickle.
    """
    response = download_file(bucket, filename)
    body = response['Body']
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # while everything written to the bucket is trusted.
    return pickle.load(body)

In [359]:
def _is_missing_object(error):
    """Tell whether ``error`` looks like an S3 "object does not exist" failure.

    The check is duck-typed on the ``response['Error']['Code']`` structure so
    no additional import is needed.
    """
    response = getattr(error, 'response', None)
    if not isinstance(response, dict):
        return False
    code = str(response.get('Error', {}).get('Code', ''))
    return code in ('NoSuchKey', 'NotFound', '404')


def _ensure_json(bucket, name, default):
    """Create ``name`` in ``bucket`` with ``default`` content if it is absent."""
    try:
        get_json(bucket, name)
    except Exception as error:
        # Only a missing object may trigger the bootstrap. Any other failure
        # (network, credentials, ...) is re-raised so that existing state in
        # the bucket is never overwritten with defaults by accident.
        if not _is_missing_object(error):
            raise
        set_json(default, bucket, name)


def validate_storage(bucket):
    """Make sure ``config.json`` and ``models.json`` exist in ``bucket``.

    A missing ``config.json`` is created with empty/``False`` defaults, and a
    missing ``models.json`` with an empty model list. Existing files are left
    untouched.

    Args:
        bucket: name of the bucket holding the pipeline state.

    Raises:
        Exception: any error from reading a file other than "object not
            found" is propagated instead of being treated as a missing file.
    """
    _ensure_json(bucket, 'config.json', {
        "LastModified": None,
        "ActualModel": None,
        "ActualScore": None,
        "IsActual": False
    })
    _ensure_json(bucket, 'models.json', {
        "models": []
    })

def check_for_updates(bucket):
    """Mark the stored config as outdated when the raw data has changed.

    The newest ``LastModified`` timestamp of the two raw CSV files is
    formatted as ``'%m.%d.%Y_%H:%M:%S'`` and compared with
    ``config['LastModified']``. If they differ, the new stamp is stored,
    ``IsActual`` is reset to ``False`` and the config is uploaded again.
    """
    config = get_json(bucket, 'config.json')
    raw_files = (
        'data/raw/lastfm_artist_list.csv',
        'data/raw/lastfm_user_scrobbles.csv',
    )
    newest = max(get_file_last_update_date(bucket, key) for key in raw_files)
    stamp = newest.strftime('%m.%d.%Y_%H:%M:%S')
    if config['LastModified'] == stamp:
        return
    config['LastModified'] = stamp
    config['IsActual'] = False
    set_json(config, bucket, 'config.json')

def process_scrobbles(config, bucket):
    """Build and pickle the processed scrobble data for the current snapshot.

    Two files are written to ``data/<config['LastModified']>/`` (the
    directory must already exist):

    * ``scrobbles.pkl``: dict mapping each ``user_id`` to a list of its
      ``artist_id`` values.
    * ``scrobbles_sparse_normalized.pkl``: row-wise L2-normalized CSR matrix
      built from the first three columns of the raw CSV.

    Args:
        config: pipeline config; only ``config['LastModified']`` is read.
        bucket: bucket holding ``data/raw/lastfm_user_scrobbles.csv``.
    """
    scrobbles = get_csv(bucket, 'data/raw/lastfm_user_scrobbles.csv')
    scrobbles.drop_duplicates(['user_id', 'artist_id'], inplace=True)

    # Columns are read by position.
    # assumes the order is user_id, artist_id, <scrobble count> - TODO confirm
    values = scrobbles.values
    # return_inverse maps every row to the dense 0-based index of its id.
    _, users_positions = np.unique(values[:, 0], return_inverse=True)
    _, artists_positions = np.unique(values[:, 1], return_inverse=True)
    scrobbles_sparse = csr_matrix((values[:, 2], (users_positions, artists_positions)))
    scrobbles_sparse_normalized = normalize(scrobbles_sparse, norm='l2', axis=1)

    artists_by_user = scrobbles.groupby('user_id')['artist_id'].apply(list).to_dict()
    # Context managers make sure both files are flushed and closed.
    with open(f"data/{config['LastModified']}/scrobbles.pkl", 'wb') as file:
        pickle.dump(artists_by_user, file)
    with open(f"data/{config['LastModified']}/scrobbles_sparse_normalized.pkl", 'wb') as file:
        pickle.dump(scrobbles_sparse_normalized, file)
def process_data(bucket):
    """Regenerate the processed data when the config is flagged as outdated.

    Nothing happens unless ``config['IsActual']`` equals ``False``. Otherwise
    the local directory ``data/<LastModified>/`` is created and the scrobbles
    are processed into it.
    """
    config = get_json(bucket, 'config.json')
    # Equality (not truthiness) is kept on purpose so that non-boolean values
    # are treated exactly as before.
    is_outdated = config['IsActual'] == False
    if is_outdated:
        snapshot_dir = f"data/{config['LastModified']}/"
        os.makedirs(snapshot_dir, exist_ok=True)
        process_scrobbles(config, bucket)

def upload_scrobbles(config, bucket):
    """Upload both processed scrobble pickles of the current snapshot.

    Each file is stored in the bucket under the same relative path it has on
    local disk (``data/<LastModified>/<file>``). The local files are kept.
    """
    for artifact in ('scrobbles.pkl', 'scrobbles_sparse_normalized.pkl'):
        path = f"data/{config['LastModified']}/{artifact}"
        upload_file(bucket, path, path)
def upload_processed_data(bucket):
    """Upload the processed data when the config is flagged as outdated.

    Nothing is uploaded unless ``config['IsActual']`` equals ``False``. The
    local snapshot directory is left in place.
    """
    config = get_json(bucket, 'config.json')
    # Equality (not truthiness) is kept on purpose so that non-boolean values
    # are treated exactly as before.
    is_outdated = config['IsActual'] == False
    if is_outdated:
        upload_scrobbles(config, bucket)

def download_scrobbles(config, bucket):
    """Copy the current snapshot's scrobble pickles into ``data/actual/``.

    Each object is unpickled from the bucket and pickled again into the local
    ``data/actual/`` directory, which must already exist.

    Args:
        config: pipeline config; only ``config['LastModified']`` is read.
        bucket: bucket holding the processed snapshot.
    """
    for artifact in ('scrobbles.pkl', 'scrobbles_sparse_normalized.pkl'):
        # Download first so a failed download never truncates the local copy.
        payload = get_pickle(bucket, f"data/{config['LastModified']}/{artifact}")
        # The context manager closes the file; it was left open before.
        with open(f"data/actual/{artifact}", 'wb') as file:
            pickle.dump(payload, file)
def load_data(bucket):
    """Download the current processed snapshot into ``data/actual/``.

    The config is read first, then the local target directory is created if
    needed, and finally the scrobble files are fetched.
    """
    config = get_json(bucket, 'config.json')
    target_dir = "data/actual/"
    os.makedirs(target_dir, exist_ok=True)
    download_scrobbles(config, bucket)

def train(bucket, name):
    """Fit the model described by ``name`` and pickle it to ``models/``.

    Args:
        bucket: bucket holding ``config.json``.
        name: Python expression that constructs the model, e.g.
            ``'implicit.als.AlternatingLeastSquares(factors=64)'``. It is
            also used verbatim as the file name of the pickled model.

    Side effects:
        Writes ``models/<name>.pkl`` locally and sets ``IsActual`` to
        ``False`` in the stored config.
    """
    # NOTE(review): eval executes arbitrary code. This is acceptable only
    # while ``name`` comes from literals in this notebook, never from
    # external input.
    model = eval(name)
    with open('data/actual/scrobbles_sparse_normalized.pkl', 'rb') as file:
        interactions = pickle.load(file)
    model.fit(interactions)

    # Nothing else creates this directory, so a fresh checkout would fail
    # with FileNotFoundError without it.
    os.makedirs('models', exist_ok=True)
    with open(f"models/{name}.pkl", 'wb') as file:
        pickle.dump(model, file)

    config = get_json(bucket, 'config.json')
    config['IsActual'] = False
    set_json(config, bucket, 'config.json')

def get_k_similar(model, target, k):
    """Return the ids of the ``k`` items most similar to ``target``.

    Ids are shifted by one in both directions: ``target - 1`` is passed to
    the model and every returned index is incremented by one.
    NOTE(review): presumably artist ids are 1-based while the model's item
    indices are 0-based; verify that this matches how the training matrix
    columns were built.

    Args:
        model: object exposing ``similar_items(item, N=..., filter_items=...)``
            that returns an ``(indices, scores)`` pair.
        target: id of the query item.
        k: number of similar items to request. It used to be ignored in
            favour of a hard-coded 5.

    Returns:
        List of similar item ids; the query item itself is filtered out.
    """
    item = target - 1
    indices, _ = model.similar_items(item, N=k, filter_items=[item])
    return [x + 1 for x in indices]
def evaluate(bucket, name):
    """Score a trained model and register the result in ``models.json``.

    For every user with more than one artist, the 5 most similar artists of
    each of the user's artists are collected, and the precision of that list
    against the user's own artists is computed. The mean over those users is
    the model score.

    Args:
        bucket: bucket holding ``models.json``; the model pickle is uploaded
            to it as well.
        name: model name as used by ``train`` (file ``models/<name>.pkl``).

    Side effects:
        Appends ``{"name", "score", "date"}`` to ``models.json`` and uploads
        the model pickle under ``models/<name>.pkl``.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # NOTE(review): pickle.load can execute arbitrary code; both files are
    # produced locally by this pipeline.
    with open('data/actual/scrobbles.pkl', 'rb') as file:
        validation = pickle.load(file)
    with open(f"models/{name}.pkl", 'rb') as file:
        model = pickle.load(file)
    models = get_json(bucket, 'models.json')

    scores = []
    for artists in validation.values():
        if len(artists) <= 1:
            continue
        predictions = []
        for artist in artists:
            predictions.extend(get_k_similar(model, artist, 5))
        scores.append(precision(artists, predictions))

    # With no eligible user the mean is undefined; store 0.0 rather than nan.
    score = float(mean_precision(scores, len(scores))) if scores else 0.0

    # The stamp is later compared with config['LastModified'], which comes
    # from S3 object metadata. boto3 reports that in UTC (confirm for this
    # storage), so the training time is recorded in UTC too.
    now = datetime.now(timezone.utc).strftime('%m.%d.%Y_%H:%M:%S')
    models['models'].append({
        "name": name,
        "score": score,
        "date": now
    })
    upload_file(bucket, f"models/{name}.pkl", f"models/{name}.pkl")
    set_json(models, bucket, 'models.json')

def select_best_model(bucket):
    """Pick the best-scoring model trained after the last data update.

    Only entries of ``models.json`` whose ``date`` is later than
    ``config['LastModified']`` are considered. The winner's name and score
    are written to the config together with ``IsActual = True``.

    Dates are parsed before comparing: the ``'%m.%d.%Y_%H:%M:%S'`` format
    starts with the month, so comparing the raw strings is not chronological
    across months of different years.

    NOTE(review): when no entry qualifies, ``ActualModel`` becomes ``''`` and
    ``ActualScore`` ``0`` while ``IsActual`` is still set to ``True`` (same
    as before this change).

    Raises:
        ValueError: if a stored date does not match the expected format.
    """
    stamp_format = '%m.%d.%Y_%H:%M:%S'
    config = get_json(bucket, 'config.json')
    models = get_json(bucket, 'models.json')

    # A config that never recorded a data update imposes no lower bound.
    last_modified = config['LastModified']
    cutoff = datetime.strptime(last_modified, stamp_format) if last_modified else None

    best_score = 0
    best_model = ''
    for model in models['models']:
        trained_at = datetime.strptime(model['date'], stamp_format)
        if cutoff is not None and trained_at <= cutoff:
            continue
        if model['score'] > best_score:
            best_score = model['score']
            best_model = model['name']

    config['ActualModel'] = best_model
    config['ActualScore'] = best_score
    config['IsActual'] = True
    set_json(config, bucket, 'config.json')

def update_server(bucket=None):
    """Ask the serving host to reload by POSTing to its ``/update`` endpoint.

    Args:
        bucket: unused. Accepted (optionally) because the pipeline cell calls
            ``update_server(bucket)``, which raised ``TypeError`` against the
            former zero-argument signature.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the server does not answer within the timeout.
    """
    # The URL previously had a doubled scheme ("http://http://...").
    # A timeout keeps the notebook from hanging on an unresponsive server.
    requests.post("http://87.239.106.15:8080/update", timeout=30)

In [360]:
# End-to-end pipeline: sync state, refresh data, train and score candidates,
# publish the best model.
bucket = 'LastFM-artist-recommender'

# Bootstrap config.json / models.json and detect changes in the raw data.
validate_storage(bucket)
check_for_updates(bucket)

# Rebuild and publish the processed snapshot if needed, then fetch it locally.
process_data(bucket)
upload_processed_data(bucket)
load_data(bucket)

# Candidate models are given as constructor expressions (see `train`).
model1 = 'implicit.als.AlternatingLeastSquares(factors=64)'
model2 = 'implicit.nearest_neighbours.CosineRecommender(K=200)'
for candidate in (model1, model2):
    train(bucket, candidate)
    evaluate(bucket, candidate)

select_best_model(bucket)

# update_server is defined without parameters, so passing `bucket` raised
# TypeError.
update_server()

100%|██████████| 15/15 [00:07<00:00,  1.96it/s]
100%|██████████| 17493/17493 [00:00<00:00, 191303.86it/s]


In [361]:
bucket = 'LastFM-artist-recommender'

# Credentials must never be hard-coded in a notebook. The client relies on
# boto3's default credential lookup (e.g. the AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY environment variables), exactly like the helper
# functions above.
# NOTE(review): the key pair that used to be embedded here has been exposed
# and should be rotated.
s3 = boto3.client(
    service_name='s3',
    endpoint_url='https://s3.storage.selcloud.ru'
)
config = json.load(s3.get_object(Bucket=bucket, Key="config.json")['Body'])

In [366]:
# Display the model name stored under 'ActualModel' in the loaded config.json.
config['ActualModel']

'implicit.als.AlternatingLeastSquares(factors=64)'

In [367]:
# Fetch the pickled model named in the config straight from the bucket.
# NOTE(review): unpickling can execute arbitrary code; this is only safe
# while the bucket content is trusted.
model = pickle.load(
    s3.get_object(
        Bucket=bucket,
        Key=f"models/{config['ActualModel']}.pkl",
    )['Body']
)