In [1]:
!pip install lightfm
import numpy as np
import pandas as pd
from lightfm import LightFM
import itertools
from lightfm.evaluation import precision_at_k
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix
from time import time
from lightfm.cross_validation import random_train_test_split


Defaulting to user installation because normal site-packages is not writeable


In [2]:
def create_interactions(df, chunk_size=5000):
    """Build a dense user x movie rating table from long-format ratings.

    The frame is pivoted in row chunks and the partial tables are merged
    afterwards (presumably to avoid one very large pivot call -- TODO confirm
    the original motivation for chunking).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating' columns.
    chunk_size : int, default 5000
        Number of (userId, movieId) rows pivoted per chunk. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per userId and one column per movieId (both sorted), holding
        the rating, or 0 where the pair has no rating. An empty frame is
        returned for empty input.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if df.shape[0] == 0:
        # pd.concat refuses an empty list, so short-circuit explicitly.
        return pd.DataFrame()

    # Collapse repeated (userId, movieId) pairs with the mean -- the same
    # aggregation pivot_table applies by default -- so that every pair is
    # unique before chunking. The groupby also sorts rows by user.
    ratings = df.groupby(['userId', 'movieId'], as_index=False)['rating'].mean()

    # Slice with exclusive upper bounds and include the trailing partial chunk;
    # the previous `chunks[i + 1] - 1` bound and `len(chunks) - 1` range lost
    # one row per chunk plus the entire last chunk.
    pieces = [
        ratings.iloc[start:start + chunk_size].pivot_table(
            index='userId', columns='movieId', values='rating', fill_value=0)
        for start in range(0, ratings.shape[0], chunk_size)
    ]

    stacked = pd.concat(pieces)

    # Movies absent from a chunk show up as NaN after the concat, and a user
    # whose rows straddle a chunk boundary appears once per chunk. Pairs are
    # unique, so zero-filling and summing per user merges those rows exactly.
    return stacked.fillna(0).groupby(level=0).sum().sort_index(axis=1)

#     return pd.pivot_table(df, index='userId', columns='movieId', values='rating', fill_value = 0)

In [3]:
def create_matrix(interaction, holdout_percentage=0.2, random_state=123):
    """Split a dense interaction table into train / validation / test matrices.

    Parameters
    ----------
    interaction : pandas.DataFrame
        Dense table whose ``.values`` are converted to a sparse CSR matrix.
    holdout_percentage : float, default 0.2
        Fraction of interactions taken out of the training matrix. The
        hold-out is then divided evenly between validation and test.
    random_state : int or None, default 123
        Seed for the random generator shared by both splits, so repeated runs
        produce the same partition. Pass None for a non-deterministic split.

    Returns
    -------
    tuple
        ``(train_matrix, val_matrix, test_matrix)``.
    """
    total_matrix = csr_matrix(interaction.values)

    # One generator drives both splits: reproducible, yet the second split does
    # not replay the exact random draw of the first.
    rng = np.random.RandomState(random_state)

    # `test_percentage` is the share moved to the SECOND returned matrix. The
    # previous value of 0.8 left only 20% of the interactions for training.
    train_matrix, holdout_matrix = random_train_test_split(
        total_matrix, test_percentage=holdout_percentage, random_state=rng)

    # Halve the hold-out into validation and test.
    val_matrix, test_matrix = random_train_test_split(
        holdout_matrix, test_percentage=0.5, random_state=rng)

    return train_matrix, val_matrix, test_matrix

In [4]:

def sample_hyperparameters():
    """
    Return the hyperparameter candidates as a dict of value lists.

    The dict maps "no_components" and "learning_rate" to the list of values
    to try for each.
    """
    # 5, 10, 20, 40, 80, 160: start at 5 and double five times.
    component_counts = [5 * 2 ** power for power in range(6)]
    step_sizes = [.01, .1, 1, 2]

    grid = {}
    grid["no_components"] = component_counts
    grid["learning_rate"] = step_sizes
    return grid

In [5]:
def random_search(train, val, m_iter):
    """Score every hyperparameter combination on the validation matrix.

    Despite the name, this walks the full grid from sample_hyperparameters():
    each "no_components" value is paired with each "learning_rate" value.
    For every pair a LightFM model (random_state=123) is fitted on `train`
    for `m_iter` epochs, and the mean of precision_at_k(..., k=500) on `val`
    is recorded.

    Returns a two-level dict: outer keys are '<n> Rank', inner keys are
    '<rate> Reg Param' (the learning rate, despite the label), and values are
    the mean precision@500.
    """

    def _validation_score(n_components, step_size):
        # Fit one candidate and reduce its per-user precision to a single mean.
        candidate = LightFM(random_state=123, learning_rate=step_size, no_components=n_components)
        candidate.fit(train, epochs=m_iter)
        return precision_at_k(candidate, val, k=500).mean()

    grid = sample_hyperparameters()

    # Outer loop over ranks, inner loop over learning rates.
    return {
        '{} Rank'.format(n_components): {
            '{} Reg Param'.format(step_size): _validation_score(n_components, step_size)
            for step_size in grid['learning_rate']
        }
        for n_components in grid['no_components']
    }


In [6]:
def get_best_params(MAP_dict):
    """Pick the hyperparameter pair with the highest recorded score.

    Parameters
    ----------
    MAP_dict : dict
        Two-level mapping ``{'<rank> Rank': {'<rate> Reg Param': score}}``.

    Returns
    -------
    tuple
        ``(rate, rank)``: the leading number of the best inner key as a float
        and the leading number of its outer key as an int. On ties the first
        entry encountered wins.

    Raises
    ------
    ValueError
        If ``MAP_dict`` holds no score that can be compared (empty input, or
        every score is NaN).
    """
    best_score = float('-inf')
    best_rank_key = None
    best_reg_key = None

    for rank_key, scores_by_reg in MAP_dict.items():
        for reg_key, score in scores_by_reg.items():
            # Track the running maximum. The earlier version never updated it,
            # so the LAST positive score won instead of the largest one.
            # NaN compares False and is therefore skipped.
            if score > best_score:
                best_score = score
                best_rank_key = rank_key
                best_reg_key = reg_key

    if best_rank_key is None:
        raise ValueError("MAP_dict contains no comparable scores")

    # Keys look like '0.1 Reg Param' / '20 Rank'; the number is the first token.
    return float(best_reg_key.split(' ')[0]), int(best_rank_key.split(' ')[0])
    

In [7]:
def train_model(train_matrix, rank, reg, m_iter, num_threads=48):
    """Fit a LightFM model on the training interactions.

    Parameters
    ----------
    train_matrix : sparse matrix
        Training interactions passed to ``LightFM.fit``.
    rank : int
        Passed to LightFM as ``no_components``.
    reg : float
        Passed to LightFM as ``learning_rate`` (the name is historical; it is
        not a regularisation weight).
    m_iter : int
        Number of training epochs.
    num_threads : int, default 48
        Worker threads used by ``fit``. Previously fixed at 48 inside the
        function; lower it on machines with fewer cores.

    Returns
    -------
    LightFM
        The fitted model.
    """
    model = LightFM(random_state=123, learning_rate=reg, no_components=rank)
    model = model.fit(train_matrix, epochs=m_iter, num_threads=num_threads)

    return model

In [8]:
def test_model(model, test_matrix, k=10):
    """Return the mean precision@k of `model` on `test_matrix`.

    Parameters
    ----------
    model : LightFM
        A fitted model.
    test_matrix : sparse matrix
        Held-out interactions to score against.
    k : int, default 10
        Cutoff forwarded to ``precision_at_k``. The default equals the
        library's own default, so existing calls behave as before. Pass the
        cutoff used during tuning (500 in random_search) to get a figure
        comparable to the validation scores.

    Returns
    -------
    float
        Mean of the per-user precision@k values.
    """
    return precision_at_k(model, test_matrix, k=k).mean()

In [9]:
def main():
    """Run the whole experiment and return the tuning scores as a DataFrame.

    Steps: read the three CSV partitions, stack them into a single ratings
    frame, pivot it into an interaction table, split that into train /
    validation / test matrices, search the hyperparameter grid, then refit
    with the selected pair and score the result on the test matrix. Progress
    and timings are printed along the way.
    """
    epochs = 4

    # Resolve the three input locations first.
    train_path = '../data/train_df_repartcsv/{}'.format("part-00000-91f527f7-fd90-45c5-914f-b322fb19f068-c000.csv")
    test_path = '../data/test_df_repartcsv/{}'.format("part-00000-c9a2728c-5747-41b5-bd7f-398e06a5b3ec-c000.csv")
    valid_path = '../data/valid_df_repartcsv/{}'.format("part-00000-b5285b31-2087-4107-a403-ec29338caec5-c000.csv")

    # Read order: train, test, validation.
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    val_df = pd.read_csv(valid_path)

    # The pre-made partitions are pooled; the split is redone by create_matrix.
    ratings_all = pd.concat([train_df, val_df, test_df])
    print(len(ratings_all.index))

    interaction_table = create_interactions(ratings_all)
    print('pivot table created')

    train_matrix, val_matrix, test_matrix = create_matrix(interaction_table)
    print('matrices created')

    # Time the hyperparameter search.
    tuning_started = time()
    MAP_dict = random_search(train_matrix, val_matrix, m_iter = epochs)
    tuning_seconds = round(time() - tuning_started, 3)
    print("Hyperparameter tuning took {} seconds".format(tuning_seconds))

    best_reg, best_rank = get_best_params(MAP_dict)
    print("Best rank: {}, best reg: {}".format(best_rank, best_reg))

    # Time the final fit together with the test-set evaluation.
    final_started = time()
    final_model = train_model(train_matrix, best_rank, best_reg, m_iter = epochs)
    test_score = test_model(final_model, test_matrix)
    final_seconds = round(time() - final_started, 3)

    print("MAP on test data: {}".format(test_score))
    print("Final model training and fitting took {}".format(final_seconds))

    return pd.DataFrame(MAP_dict)
    


In [None]:
# Run the end-to-end pipeline; as the cell's last expression, the DataFrame of
# tuning scores returned by main() is displayed as the cell output.
main()

3471442
