In [64]:
!pip install lightfm
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [65]:
import pickle
import implicit
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from lightfm import LightFM
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

## Load data

In [66]:
# Interaction log; later cells read its `id`, `id_book`, `semestr` and `date` columns.
df = pd.read_csv('medlib_dataset.csv')
# Book catalogue; the next cell reads its `id_book` and `name_book` columns.
name_books = pd.read_csv('books.csv')

In [67]:
# Keep only catalogue rows that have both an id and a title, and lower-case
# the titles. Written as a single chain so that nothing is assigned into a
# filtered slice and the cell gives the same result when re-run.
name_books = (
    name_books
    .fillna('')
    .loc[lambda books: (books.id_book != '') & (books.name_book != '')]
    .assign(name_book=lambda books: books.name_book.str.lower())
)

## Helper functions for clustering

In [68]:
# Helper function for all
def add_categories(df):
    '''
    Build integer codes for users and books.

    Takes the `id` and `id_book` columns of `df`, converts each of them to a
    categorical dtype and adds the corresponding category codes.

    Parameters:
        df: DataFrame with `id` and `id_book` columns. It is not modified.

    Returns:
        A new DataFrame with the columns `id` and `id_book` (categorical),
        `category_id` (codes of `id`) and `category_book` (codes of `id_book`).
    '''
    # `astype` returns a new frame, so the columns can be added directly
    # without silencing pandas' chained-assignment warning.
    item_user = df[['id', 'id_book']].astype({'id': 'category',
                                              'id_book': 'category'})

    item_user['category_id'] = item_user['id'].cat.codes
    item_user['category_book'] = item_user['id_book'].cat.codes

    return item_user

In [69]:
def create_ds_for_clustering(df, semestr):
    '''
    One-Hot Encoding vector for each user

    Parameters:
        df: DataFrame with `semestr`, `category_id` and `category_book` columns.
        semestr: list-like of semester values to keep (passed to `isin`).

    Returns:
        DataFrame indexed by `category_id` (sorted) with one `book_<code>`
        column per book seen in the selected semesters; a cell is 1 when the
        user has that book in those semesters and 0 otherwise.
    '''
    # One row per distinct (user, book) pair in the selected semesters.
    user_books = (
        df.loc[df.semestr.isin(semestr), ['category_id', 'category_book']]
        .drop_duplicates()
    )

    # Encode the book column in place instead of merging the dummies back on
    # the index: an index join multiplies rows when index labels repeat,
    # which inflates the per-user sums above 1.
    one_hot = pd.get_dummies(user_books, columns=['category_book'], prefix='book')

    X_train = one_hot.groupby('category_id').sum().sort_index()

    return X_train

In [70]:
def find_best_k(X_train, k_values=range(3, 8)):
    '''
    Training K-Means with several clusters

    Parameters:
        X_train: feature matrix passed to K-Means and to `silhouette_score`.
        k_values: iterable of candidate cluster counts (default 3..7).

    Returns:
        The cluster count with the highest average silhouette coefficient.
        On ties the first candidate reaching the maximum is returned.
    '''
    metrics = {}
    for k in k_values:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)

        # `fit_predict` fits the model itself, so a separate `fit` call
        # beforehand would only train the same seeded model twice.
        cluster_labels = kmeans.fit_predict(X_train)
        metrics[k] = silhouette_score(X_train, cluster_labels)

    best_k = max(metrics, key=metrics.get)
    return best_k

In [71]:
def fit_kmeans(X_train, clusters):
    '''
    Fit a K-Means model with `clusters` clusters on `X_train`.

    Uses k-means++ initialisation and a fixed random_state of 42.
    Returns the fitted estimator (the value returned by `KMeans.fit`).
    '''
    model = KMeans(n_clusters=clusters, init='k-means++', random_state=42)
    return model.fit(X_train)

In [72]:
def create_kmean_df(y_km, X_train):
    '''
    Pair every user in `X_train` with the cluster predicted by `y_km`.

    Result layout:
        |_category_id_|_cluster_|
        |     102     |    0    |

    The user column is built from the unique values of `X_train.index`;
    the prediction column (initially named 0) is renamed to `cluster`.
    '''
    user_frame = pd.DataFrame(X_train.index.unique())
    label_frame = pd.DataFrame(y_km.predict(X_train))

    # Both frames carry a default positional index, so they line up row by row.
    paired = pd.concat((user_frame, label_frame), axis=1)
    return paired.rename(columns={0: 'cluster'})

In [73]:
def hit_rate_kmean(kmeans_df, df, semestr, topn:int=10):
    '''
    Hit Rate for 1 specific book

    For every user in `kmeans_df` the recommendation list is the `topn` most
    frequent `id_book` values among the other users of the same cluster in
    semester `semestr`. The user counts as a hit when their earliest book
    (by `date`) of that semester is in the list.

    Parameters:
        kmeans_df: DataFrame with `category_id` and `cluster` columns.
        df: DataFrame with `category_id`, `semestr`, `id_book` and `date`.
        semestr: semester value to evaluate.
        topn: length of the recommendation list.

    Returns:
        Share of users with a hit; 0.0 when `kmeans_df` has no users.

    Raises:
        IndexError: if a user of `kmeans_df` has no rows in `semestr`.
    '''
    users = kmeans_df.category_id.unique()
    if users.shape[0] == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        return 0.0

    # Loop-invariant work: the semester slice and the cluster lookups.
    semester_df = df[df.semestr == semestr]
    # First cluster listed for each user.
    cluster_of_user = (kmeans_df.drop_duplicates('category_id')
                       .set_index('category_id')['cluster'])
    users_of_cluster = kmeans_df.groupby('cluster')['category_id'].unique()

    score = 0
    for user in users:
        similar_users = users_of_cluster.loc[cluster_of_user.loc[user]]

        is_peer = semester_df.category_id.isin(similar_users) & (
            semester_df.category_id != user)
        rec_books = semester_df[is_peer]['id_book'].value_counts()[:topn].index.tolist()

        user_rows = semester_df[semester_df.category_id == user]
        true_book = user_rows.sort_values(by='date')['id_book'].values[0]
        if true_book in rec_books:
            score += 1

    hit_rate = score / users.shape[0]
    return hit_rate

In [74]:
def test_clustering(df, train_sem: list, test_sem: int):
    '''
    Evaluate the K-Means based recommender on one semester.

    Builds the one-hot user matrix from `train_sem`, picks the cluster count
    with `find_best_k`, fits K-Means, assigns clusters to the users that also
    appear in `test_sem` and prints/returns Hit Rate@1/5/10 in percent.

    Parameters:
        df: DataFrame with `semestr`, `category_id`, `category_book`,
            `id_book` and `date` columns.
        train_sem: semesters used to build the clustering features.
        test_sem: semester on which the hit rate is measured.

    Returns:
        dict with keys `semestr`, `HitRate@1`, `HitRate@5`, `HitRate@10`
        and `clusters`.
    '''
    # Clustering data
    X_train = create_ds_for_clustering(df, train_sem)

    # Optimal number of clusters
    clusters = find_best_k(X_train)

    # Train kmeans with best k
    y_km = fit_kmeans(X_train, clusters)

    # Users from n semester
    users_n_semester = df[df.semestr == test_sem]['category_id'].unique()

    # One-Hot Encoding for users from n semester
    X_test = X_train[X_train.index.isin(users_n_semester)]

    # Df with users and their clusters
    kmean_df = create_kmean_df(y_km, X_test)
    dct = {}
    dct['semestr'] = test_sem
    print(f'\t\tCluster model Metrics for {test_sem} semester with {clusters} clusters')

    for n in [1, 5, 10]:
        # Scale first, then round: rounding before multiplying by 100 leaves
        # binary floating point noise such as 55.00000000000001.
        hr = round(100 * hit_rate_kmean(kmean_df, df, test_sem, topn=n), 0)
        dct[f'HitRate@{n}'] = hr
        print(f'\tHit Rate@{n}:\t\t{hr}%')

    print('\n\n')
    dct['clusters'] = clusters
    return dct

## Helper functions for ALS

In [75]:
def csr_row_set_nz_to_val(csr, row, value=0):
    '''
    Overwrite, in place, every stored entry of row `row` of `csr` with `value`.

    Only the stored values are changed; the sparsity structure is untouched.
    Raises ValueError when `csr` is not a `csr_matrix`.
    '''
    if isinstance(csr, csr_matrix):
        # indptr[row]:indptr[row + 1] is the slice of `data` holding this row.
        row_start = csr.indptr[row]
        row_stop = csr.indptr[row + 1]
        csr.data[row_start:row_stop] = value
        return

    raise ValueError('Matrix given must be of CSR format.')

In [76]:
def create_csr_matrix(matrix, users, items):
    '''
    Build a user x item CSR matrix from parallel arrays of user and item codes.

    Every (user, item) pair contributes 1.0; repeated pairs are summed by the
    CSR constructor. The shape is (users.max() + 1, items.max() + 1).
    The `matrix` argument is accepted but not used.
    '''
    shape = (users.max() + 1, items.max() + 1)
    ones = np.ones(items.shape)
    return csr_matrix((ones, (users, items)), shape=shape)

In [77]:
def normalization_matrix(user_item_matrix):
    '''
    Normalize matrix for user

    Each row is scaled to unit L1 norm. Rows whose interaction total in the
    input is below 2 ("lazy readers") then get all their stored values set
    to 0.

    Parameters:
        user_item_matrix: sparse user x item matrix.

    Returns:
        The row-normalised matrix produced by `normalize`, with lazy-reader
        rows zeroed. The input matrix is not modified.
    '''
    reader_normalized_matrix = normalize(user_item_matrix, axis=1, norm='l1')

    # `ravel` keeps a 1-D array even for a single-row matrix, where
    # `squeeze` would collapse the totals to a 0-d scalar.
    interactions_per_reader = np.asarray(user_item_matrix.sum(axis=1)).ravel()
    lazy_readers = np.flatnonzero(interactions_per_reader < 2)

    for rId in lazy_readers:
        csr_row_set_nz_to_val(reader_normalized_matrix, rId)

    return reader_normalized_matrix

In [78]:
def normalize_matrix(train, val=None):
    '''
    Build normalised user x item matrices for the train and validation frames.

    Parameters:
        train: DataFrame with `category_id` and `category_book` columns.
        val: optional DataFrame with the same two columns.

    Returns:
        (normalization_train, normalization_val). `normalization_val` is
        None when `val` is None or empty.
    '''
    csr_train = create_csr_matrix(train, train.category_id, train.category_book)
    normalization_train = normalization_matrix(csr_train)

    if val is None or len(val) == 0:
        return normalization_train, None

    # Build the validation matrix from the validation rows themselves; using
    # the training users/items here would just reproduce the train matrix.
    # NOTE(review): its shape follows the largest codes present in `val` and
    # may differ from the train matrix. `normalization_matrix` also zeroes
    # users with fewer than 2 interactions - confirm that is wanted for a
    # hold-out set.
    csr_val = create_csr_matrix(val, val.category_id, val.category_book)
    normalization_val = normalization_matrix(csr_val)

    return normalization_train, normalization_val

In [79]:
def hit_rate_als(model, df, train, test, normalization_train, items_max, n, test_sem=0, verbose=True):
    '''
    Hit Rate@n of `model` over the users in `test`.

    For every user the top `n` item codes returned by `model.recommend` are
    mapped to `id_book` values through `df`; the user counts as a hit when
    at least one of them is among the user's `id_book` values in `test`.

    Parameters:
        model: object with a `recommend(userid, user_items, N=...,
            filter_already_liked_items=...)` method whose first return
            element is an array of item codes.
        df: DataFrame with `category_book` and `id_book` columns.
        train: not used.
        test: DataFrame with `category_id` and `id_book` columns (and
            `semestr` when `test_sem` is non-zero).
        normalization_train: matrix forwarded to `model.recommend`.
        items_max: not used.
        n: number of recommendations to evaluate.
        test_sem: when non-zero, only `test` rows of that semester count
            as ground truth.
        verbose: not used.

    Returns:
        Share of users with a hit; 0.0 when `test` has no users.
    '''
    users = test.category_id.unique()
    if users.shape[0] == 0:
        # No users to score; the rate is defined as 0 instead of failing.
        return 0.0

    score = 0
    for user in users:
        # Request exactly n items so that n above the library default is honoured.
        indexs = model.recommend(int(user), normalization_train, N=n,
                                 filter_already_liked_items=False)[0][:n].tolist()
        if test_sem != 0:
            actual = test[(test.category_id == user) & (test.semestr == test_sem)]['id_book'].values.tolist()
        else:
            actual = test[test.category_id == user]['id_book'].values.tolist()

        preds = df[df.category_book.isin(indexs)]['id_book'].unique().tolist()

        if set(actual) & set(preds):
            score += 1

    # Computed once after the loop.
    hit_rate = score / users.shape[0]
    return hit_rate

In [80]:
def train_validation(train, val_fraction=0.15, random_state=None):
    '''
    Hold out the latest interaction of a random sample of users.

    The number of sampled users is `int(n_unique_users * val_fraction)`,
    capped by the number of users with more than one `id_book`. For each
    sampled user the row with the latest `date` becomes the validation row
    and is removed from the training frame.

    Parameters:
        train: DataFrame with `id`, `id_book` and `date` columns.
        val_fraction: share of all users to sample (default 15%).
        random_state: optional seed; when None the global NumPy random
            state is used.

    Returns:
        (new_train, val). `new_train` is `train` without the held-out rows
        (original index kept). `val` has one row per sampled user, sorted by
        `id`, with `id` as the first column and a fresh positional index.
    '''
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    val_shape = int(train.id.unique().shape[0] * val_fraction)

    # Users with a single book cannot give up their only interaction.
    books_per_user = train.groupby('id')['id_book'].count()
    eligible_users = books_per_user[books_per_user > 1].index.to_numpy()

    n_val = min(val_shape, eligible_users.shape[0])
    if n_val > 0:
        # Without replacement, so the same user is never drawn twice.
        val_users = rng.choice(eligible_users, n_val, replace=False)
    else:
        val_users = eligible_users[:0]

    # Work with row positions so that neither the column order nor repeated
    # index labels can affect which rows are removed from the training set.
    candidate_pos = np.flatnonzero(train.id.isin(val_users).to_numpy())
    candidates = train.iloc[candidate_pos][['id', 'date']].reset_index(drop=True)
    last_per_user = (candidates.sort_values(by=['id', 'date'])
                     .groupby('id').tail(1).index.to_numpy())
    val_pos = candidate_pos[last_per_user]

    keep = np.ones(len(train), dtype=bool)
    keep[val_pos] = False
    new_train = train[keep]

    val = train.iloc[val_pos].reset_index(drop=True)
    val = val[['id'] + [col for col in val.columns if col != 'id']]

    return new_train, val

In [81]:
def tuning_als(df, normalization_train, train, X_val, items):
    '''
    Grid-search ALS hyperparameters and return the best fitted model.

    Every combination of the factor, iteration and regularisation grids is
    fitted on `normalization_train` and scored with `hit_rate_als` on
    `X_val` at n = 1, 5 and 10. The model with the highest score at n = 5
    is returned (the first one on ties).
    '''
    factor_grid = [40]
    iteration_grid = [40]
    regularization_grid = [0.05]

    candidates = []
    for factor, iters, regular in itertools.product(factor_grid,
                                                    iteration_grid,
                                                    regularization_grid):
        als_model = implicit.als.AlternatingLeastSquares(
            factors=factor,
            regularization=regular,
            iterations=iters,
            random_state=42,
            num_threads=4
        )
        als_model.fit(normalization_train, show_progress=False)

        score1, score5, score10 = [
            hit_rate_als(als_model, df, train, X_val, normalization_train,
                         items, n=n, verbose=False)
            for n in [1, 5, 10]
        ]

        candidates.append({
            'factors': factor,
            'regularization': regular,
            'iterations': iters,
            'score1': score1,
            'score5': score5,
            'score10': score10,
            'model': als_model,
        })

    winner = max(candidates, key=lambda candidate: candidate['score5'])
    return winner['model']

In [82]:
def test_als(df, train_sem: list, test_sem: int):
    '''
    Train an ALS model on `train_sem` and report Hit Rate@1/5/10 on `test_sem`.

    The rows of `train_sem` are split with `train_validation`, the training
    part is turned into a normalised user x item matrix, the model is chosen
    by `tuning_als` on the validation rows and finally scored on the rows of
    `test_sem`.

    Parameters:
        df: DataFrame with `semestr`, `id`, `id_book`, `date`,
            `category_id` and `category_book` columns.
        train_sem: semesters used for training/validation.
        test_sem: semester used for the reported metrics.

    Returns:
        dict with keys `semestr`, `HitRate@1`, `HitRate@5` and `HitRate@10`
        (percentages).
    '''
    train = df[df.semestr.isin(train_sem)]
    train, validation = train_validation(train)
    normalization_train, val_ = normalize_matrix(train=train, val=validation)

    X_test = df[df.semestr == test_sem]
    # Number of item columns the model is trained on, instead of a
    # dataset-specific literal.
    items = normalization_train.shape[1]

    model = tuning_als(df, normalization_train, train, validation, items)

    dct = {}
    dct['semestr'] = test_sem

    print(f'\n\t\tALS model Metrics for {test_sem} semester\n')

    for n in [1, 5, 10]:
        # Scale before rounding so the stored value carries no float noise.
        hr = round(100 * hit_rate_als(model, df, train, X_test, normalization_train, items, n, test_sem), 1)
        dct[f'HitRate@{n}'] = hr
        print(f'\tHit Rate@{n}:\t\t{hr:.2f}%')

    print('\n\n')
    return dct

## Helper functions for LightFM

In [83]:
def hit_rate_lightfm(model, train, test, items_max, test_sem=0, n:int=10, verbose=True):
    '''
    Hit Rate@n of `model` over the users in `test`.

    For every user all item codes `0..items_max-1` are scored with
    `model.predict`, the codes the user already has in `train` are removed
    and the best `n` remaining codes are mapped to `id_book` through
    `train`. The user counts as a hit when one of them is among the user's
    `id_book` values in `test`.

    Parameters:
        model: object with a `predict(user_id, item_ids)` method returning
            one score per item id.
        train: DataFrame with `category_id`, `category_book` and `id_book`.
        test: DataFrame with `category_id` and `id_book` (and `semestr`
            when `test_sem` is non-zero).
        items_max: number of item codes to score.
        test_sem: when non-zero, only `test` rows of that semester count
            as ground truth.
        n: number of recommendations to evaluate.
        verbose: not used.

    Returns:
        Share of users with a hit; 0.0 when `test` has no users.
    '''
    users = test.category_id.unique()
    if users.shape[0] == 0:
        # No users to score; avoids dividing by zero below.
        return 0.0

    all_items = np.arange(items_max)
    score = 0
    for user in users:
        indexs = model.predict(int(user), all_items).argsort()[::-1]

        # Drop everything the user already has in one vectorised step,
        # keeping the ranking order of the remaining items.
        already_read = train.loc[train.category_id == int(user), 'category_book'].to_numpy()
        indexs = indexs[~np.isin(indexs, already_read)][:n]

        if test_sem != 0:
            actual = test[(test.category_id == user) & (test.semestr == test_sem)]['id_book'].values
        else:
            actual = test[test.category_id == user]['id_book'].values

        preds = train[train.category_book.isin(indexs)]['id_book'].unique().tolist()

        if set(actual) & set(preds):
            score += 1

    hit_rate = score / users.shape[0]
    return hit_rate

In [84]:
def sample_hyperparameters():
    '''
    Endless generator of randomly drawn LightFM hyperparameter dicts.

    Each dict has the keys `no_components`, `learning_schedule`, `loss`,
    `learning_rate` (always 0.05), `max_sampled` and `num_epochs`; the other
    values are drawn with `np.random.choice` from fixed option lists.
    '''
    component_options = [20, 40, 60]
    schedule_options = ["adagrad", "adadelta"]
    loss_options = ["bpr", "warp", "warp-kos", 'logistic']
    max_sampled_options = [5, 10, 15]
    epoch_options = [20, 40, 60]

    while True:
        # Draw order matters for reproducibility under a fixed seed.
        no_components = np.random.choice(component_options, 1)[0]
        learning_schedule = np.random.choice(schedule_options)
        loss = np.random.choice(loss_options)
        max_sampled = np.random.choice(max_sampled_options, 1)[0]
        num_epochs = np.random.choice(epoch_options, 1)[0]

        sample = {}
        sample["no_components"] = no_components
        sample["learning_schedule"] = learning_schedule
        sample["loss"] = loss
        sample["learning_rate"] = 0.05
        sample["max_sampled"] = max_sampled
        sample["num_epochs"] = num_epochs
        yield sample

In [85]:
def random_search(normalization_train, train, val, items_max, num_samples, num_threads=8):
    '''
    Generator that fits `num_samples` randomly configured LightFM models.

    For every sampled configuration a model is fitted on
    `normalization_train` and scored with `hit_rate_lightfm` on `val` at
    n = 1, 5 and 10. Yields `(scores, hyperparams, model)` tuples, where
    `hyperparams` still contains `num_epochs`.
    '''
    sampled_configs = itertools.islice(sample_hyperparameters(), num_samples)

    for hyperparams in sampled_configs:
        # `num_epochs` is a fit() argument, not a constructor argument.
        num_epochs = hyperparams["num_epochs"]
        constructor_args = {name: value
                            for name, value in hyperparams.items()
                            if name != "num_epochs"}

        model = LightFM(**constructor_args)
        model.fit(normalization_train, epochs=num_epochs, num_threads=num_threads)

        scores = [hit_rate_lightfm(model, train, val, items_max, n=n)
                  for n in [1, 5, 10]]

        yield (scores, hyperparams, model)

In [86]:
def test_lightfm(df, train_sem: list, test_sem: int):
    '''
    Train LightFM on `train_sem` and report Hit Rate@1/5/10 on `test_sem`.

    The rows of `train_sem` are split with `train_validation`, ten random
    configurations are fitted by `random_search`, the one with the largest
    validation score list is kept (lists compare element by element, so
    Hit Rate@1 decides first) and it is scored on the rows of `test_sem`.

    Parameters:
        df: DataFrame with `semestr`, `id`, `id_book`, `date`,
            `category_id` and `category_book` columns.
        train_sem: semesters used for training/validation.
        test_sem: semester used for the reported metrics.

    Returns:
        dict with keys `semestr`, `HitRate@1`, `HitRate@5` and `HitRate@10`
        (percentages).
    '''
    train = df[df.semestr.isin(train_sem)]

    train, validation = train_validation(train)

    normalization_train, normalization_val = normalize_matrix(train=train,
                                                              val=validation)

    X_test = df[df.semestr == test_sem]

    # Score exactly the item columns the model is fitted on; a fixed literal
    # can disagree with the training matrix.
    items = normalization_train.shape[1]

    (scores, hyperparams, model) = max(random_search(
        normalization_train,
        train,
        validation,
        items,
        num_samples=10),
        key=lambda x: x[0])

    dct = {}
    dct['semestr'] = test_sem
    print(f'\n\t\tLightFM model Metrics for {test_sem} semester\n')

    for n in [1, 5, 10]:
        # Scale before rounding so the stored value carries no float noise.
        hr = round(100 * hit_rate_lightfm(model, train, X_test, items, test_sem, n), 1)
        dct[f'HitRate@{n}'] = hr
        print(f'\tHit Rate@{n}:\t\t{hr:.2f}%')

    print('\n\n')
    return dct

## Test the functions

In [87]:
# Attach the integer user/book codes to every interaction row.
categories = add_categories(df)

# Dropping any code columns left by an earlier run keeps this cell safe to
# re-execute; otherwise a second merge would create `_x`/`_y` duplicates.
df = pd.merge(
    df.drop(columns=['category_id', 'category_book'], errors='ignore'),
    categories.drop_duplicates(),
    on=['id', 'id_book'],
    how='inner',
).drop_duplicates()

In [None]:
%%time
# Expanding-window evaluation: semester n is tested with models trained on
# the semesters collected in `lst`, which starts at [1] and gains n after
# each round.
lst = [1]
# One metrics dict per tested semester and model family.
report_cluster = []
report_als = []
report_lfm = []

for n in range(2, 13):
    print(f'semester: {n}')
    dct2 = test_als(df, lst, n)
    report_als.append(dct2)
    
    dct1 = test_clustering(df, lst, n)
    report_cluster.append(dct1)

    dct3 = test_lightfm(df, lst, n)
    report_lfm.append(dct3)
    # Semester n becomes part of the training window for the next round.
    lst.append(n)

semester: 2

		ALS model Metrics for 2 semester

	Hit Rate@1:		6.30%
	Hit Rate@5:		23.20%
	Hit Rate@10:		28.70%



		Cluster model Metrics for 2 semester with 7 clusters
	Hit Rate@1:		10.0%
	Hit Rate@5:		37.0%
	Hit Rate@10:		55.00000000000001%




		LightFM model Metrics for 2 semester

	Hit Rate@1:		2.90%
	Hit Rate@5:		13.90%
	Hit Rate@10:		24.40%



semester: 3
