### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

from lightfm import LightFM, cross_validation
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k

from sklearn.model_selection import train_test_split

import pickle



### Functions for Others

In [2]:
def recommendation_evaluation(recommended_set, read_set):
    match_length = len(read_set.intersection(recommended_set))
    if match_length:
        return [True, len(read_set), match_length]
    else:
        return [False, len(read_set), match_length]

### Import Data

In [3]:
data = pd.read_csv('../data_cleaned/data_merged.csv', keep_default_na=False, na_values=['-', '', ' '])
manga_titles = pd.read_csv('../data_cleaned/manga_titles_cleaned.csv')
manga_titles.set_index('index', inplace=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
data['manga_length'] = data['manga_length'].astype(str)
data['clusters'] = data['clusters'].astype(str)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1434060 entries, 0 to 1434059
Data columns (total 100 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   index                      1434060 non-null  int64  
 1   manga_title                1434060 non-null  object 
 2   manga_link                 1434060 non-null  object 
 3   volumes                    1434060 non-null  object 
 4   chapters                   1434060 non-null  object 
 5   publication_status         1434060 non-null  object 
 6   published                  1434060 non-null  object 
 7   serialization              1434060 non-null  object 
 8   authors                    1434060 non-null  object 
 9   published_year_start       1434060 non-null  object 
 10  genres_Adventure           1434060 non-null  float64
 11  genres_Action              1434060 non-null  float64
 12  genres_Horror              1434060 non-null  float64
 13  genres_Dram

### Train Test Split Dataset

In [6]:
# Set the number of titles each member read
temp = data.groupby('member')['manga_title'].count()

# Filter away those member who read less than 5 books
X = data[data['member'].isin(temp[temp>=34].index)]
X_remain = data[data['member'].isin(temp[temp<34].index)]

# Split the dataset into train and test
data_train, data_test = train_test_split(X, stratify=X['member'], train_size=0.7, random_state=42)

# Adding X_remain back to X
data_train = pd.concat((data_train, X_remain))

In [7]:
# Shape of X_train
data_train.shape

(1362482, 100)

In [8]:
# Shape of X_test
data_test.shape

(71578, 100)

In [9]:
# % of dataset used for testing
data_test.shape[0]/data_train.shape[0]

0.05253500596705131

### Using LightFM

In [10]:
state_no = 42

In [11]:
# Create dictionary to track overall results
results={}

### Basic Model With warp as loss function

In [12]:
features_list = None
result_name = 'basic_model_warp'

# A dict in dict to store the results
results[result_name] = {}

# To create mapping
lfm_dataset = Dataset(user_identity_features=False, item_identity_features=True)

# Fit the dataset
lfm_dataset.fit(data_train['member'].unique(), data_train['index'].unique(), item_features=features_list)

# To create train_interaction and train_weights
temp = [(data_train.loc[i, 'member'], data_train.loc[i, 'index'], data_train.loc[i, 'score']) for i in data_train.index]
train_interactions, train_weights = lfm_dataset.build_interactions(temp)

# To create test_interaction and test_weights
temp = [(data_test.loc[i, 'member'], data_test.loc[i, 'index'], data_test.loc[i, 'score']) for i in data_test.index]
test_interactions, test_weights = lfm_dataset.build_interactions(temp)

# Create item features
item_features = features_list

In [13]:
no_of_threads = 6

In [14]:
# Train model using warp as loss function
model = LightFM(k=10, learning_rate=0.05, loss='warp', random_state=state_no)
model.fit(train_interactions, sample_weight=train_weights, epochs=50, num_threads=no_of_threads, verbose=True)
pickle.dump(model, open('../data_production/basic_model.pkl', 'wb'))

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [03:40<00:00,  4.41s/it]


In [15]:
# Calculate the precision of the model using the train dataset
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_precision
results[result_name]['train_precision@k'] = train_precision.mean()

# Calculate the precision of the model using the test dataset
test_precision = precision_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_precision@k'] = test_precision.mean()

# Calculate the recall of the model using the train dataset
train_recall = recall_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_recall
results[result_name]['train_recall@k'] = train_recall.mean()

# Calculate the precision of the model using the test dataset
test_recall = recall_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_recall@k'] = test_recall.mean()

# Calculate the auc score of the model using the train dataset
train_auc = auc_score(model, train_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the train_auc
results[result_name]['train_auc'] = train_auc.mean()

# Calculate the auc score of the model using the test dataset
test_auc = auc_score(model, test_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the test_auc
results[result_name]['test_auc'] = test_auc.mean()

In [16]:
results

{'basic_model_warp': {'train_precision@k': 0.13202193,
  'test_precision@k': 0.18290076,
  'train_recall@k': 0.7006574748139289,
  'test_recall@k': 0.11001033957753074,
  'train_auc': 0.9887221,
  'test_auc': 0.889645}}

In [17]:
# Evaluate model using train dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_train['member'].unique()[0:10000]
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_train['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    scoring_df = scoring_df.sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['train accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

100.0%																																

In [18]:
basic_train_lfm_result = lfm_result
basic_train_lfm_result.mean()

result_lfm        0.26660
no_titles_read    7.15720
no_match          0.45860
precision@k       0.04586
dtype: float64

In [19]:
# Apply grouping based on number of titles read
result = basic_train_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
groupings,Unnamed: 1_level_1,Unnamed: 2_level_1
heavy_readers,0.862069,87
light_reader,0.247671,9662
moderate_reader,0.788845,251


In [20]:
# Evaluate model using test dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_test['member'].unique()
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    train_read_list = data_train[data_train['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_test['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    unread_list = [i for i in scoring_df.index if i not in train_read_list]
    scoring_df = scoring_df.loc[unread_list,:].sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['test accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

100.0%																																

In [21]:
results

{'basic_model_warp': {'train_precision@k': 0.13202193,
  'test_precision@k': 0.18290076,
  'train_recall@k': 0.7006574748139289,
  'test_recall@k': 0.11001033957753074,
  'train_auc': 0.9887221,
  'test_auc': 0.889645,
  'train accuracy': 0.2666,
  'test accuracy': 0.7768447837150128}}

In [22]:
basic_test_lfm_result = lfm_result

In [23]:
basic_test_lfm_result.mean()

result_lfm         0.776845
no_titles_read    18.211705
no_match           1.828753
precision@k        0.182875
dtype: float64

In [24]:
# Apply grouping based on number of titles read
result = basic_test_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
groupings,Unnamed: 1_level_1,Unnamed: 2_level_1
heavy_readers,0.965517,87
light_reader,0.764198,3592
moderate_reader,0.89243,251


### Modeling with Feature Set 1

In [25]:
manga_details = pd.read_csv("../data_cleaned/manga_details_cleaned.csv")
genres_list = [col for col in manga_details.columns if 'genres' in col]
features_list = genres_list
result_name = 'feature1_model_warp'

# A dict in dict to store the results
results[result_name] = {}

# To create mapping
lfm_dataset = Dataset(user_identity_features=False, item_identity_features=True)

# Fit the dataset
lfm_dataset.fit(data_train['member'].unique(), data_train['index'].unique(), item_features=features_list)

# To create train_interaction and train_weights
temp = [(data_train.loc[i, 'member'], data_train.loc[i, 'index'], data_train.loc[i, 'score']) for i in data_train.index]
train_interactions, train_weights = lfm_dataset.build_interactions(temp)

# To create test_interaction and test_weights
temp = [(data_test.loc[i, 'member'], data_test.loc[i, 'index'], data_test.loc[i, 'score']) for i in data_test.index]
test_interactions, test_weights = lfm_dataset.build_interactions(temp)

# Create item features
manga_details.set_index('index', inplace=True)
manga_details = manga_details.loc[data_train['index'].unique()]
temp = (manga_details.loc[:, features_list].to_dict(orient='index'))
item_feature_scores = zip(temp.keys(), temp.values())
item_features = lfm_dataset.build_item_features(item_feature_scores)

In [26]:
no_of_threads = 6

In [27]:
# Train model using warp as loss function
model = LightFM(learning_rate=0.05, loss='warp', random_state=state_no)
model.fit(train_interactions, item_features=item_features, sample_weight=train_weights, epochs=50, num_threads=no_of_threads, verbose=True)
pickle.dump(model, open('../data_production/model1.pkl', 'wb'))

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [12:58<00:00, 15.58s/it]


In [28]:
# Calculate the precision of the model using the train dataset
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_precision
results[result_name]['train_precision@k'] = train_precision.mean()

# Calculate the precision of the model using the test dataset
test_precision = precision_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_precision@k'] = test_precision.mean()

# Calculate the recall of the model using the train dataset
train_recall = recall_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_recall
results[result_name]['train_recall@k'] = train_recall.mean()

# Calculate the precision of the model using the test dataset
test_recall = recall_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_recall@k'] = test_recall.mean()

# Calculate the auc score of the model using the train dataset
train_auc = auc_score(model, train_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the train_auc
results[result_name]['train_auc'] = train_auc.mean()

# Calculate the auc score of the model using the test dataset
test_auc = auc_score(model, test_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the test_auc
results[result_name]['test_auc'] = test_auc.mean()

In [29]:
results

{'basic_model_warp': {'train_precision@k': 0.13202193,
  'test_precision@k': 0.18290076,
  'train_recall@k': 0.7006574748139289,
  'test_recall@k': 0.11001033957753074,
  'train_auc': 0.9887221,
  'test_auc': 0.889645,
  'train accuracy': 0.2666,
  'test accuracy': 0.7768447837150128},
 'feature1_model_warp': {'train_precision@k': 0.10263066,
  'test_precision@k': 0.13417304,
  'train_recall@k': 0.5799568389921415,
  'test_recall@k': 0.0803872673839875,
  'train_auc': 0.98273766,
  'test_auc': 0.859975}}

In [30]:
# Evaluate model using train dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_train['member'].unique()[0:10000]
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_train['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    scoring_df = scoring_df.sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['train accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

100.0%																																

In [31]:
model1_train_lfm_result = lfm_result
model1_train_lfm_result.mean()

result_lfm        0.14410
no_titles_read    7.15720
no_match          0.18890
precision@k       0.01889
dtype: float64

In [32]:
# Apply grouping based on number of titles read
result = model1_train_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
groupings,Unnamed: 1_level_1,Unnamed: 2_level_1
heavy_readers,0.747126,87
light_reader,0.126268,9662
moderate_reader,0.621514,251


In [33]:
# Evaluate model using test dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_test['member'].unique()
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    train_read_list = data_train[data_train['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_test['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    unread_list = [i for i in scoring_df.index if i not in train_read_list]
    scoring_df = scoring_df.loc[unread_list,:].sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['test accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

100.0%																																

In [34]:
results

{'basic_model_warp': {'train_precision@k': 0.13202193,
  'test_precision@k': 0.18290076,
  'train_recall@k': 0.7006574748139289,
  'test_recall@k': 0.11001033957753074,
  'train_auc': 0.9887221,
  'test_auc': 0.889645,
  'train accuracy': 0.2666,
  'test accuracy': 0.7768447837150128},
 'feature1_model_warp': {'train_precision@k': 0.10263066,
  'test_precision@k': 0.13417304,
  'train_recall@k': 0.5799568389921415,
  'test_recall@k': 0.0803872673839875,
  'train_auc': 0.98273766,
  'test_auc': 0.859975,
  'train accuracy': 0.1441,
  'test accuracy': 0.39974554707379134}}

In [35]:
model1_test_lfm_result = lfm_result

In [36]:
# Apply grouping based on number of titles read
result = model1_test_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
groupings,Unnamed: 1_level_1,Unnamed: 2_level_1
heavy_readers,0.827586,87
light_reader,0.370267,3592
moderate_reader,0.673307,251


In [37]:
model1_test_lfm_result.mean()

result_lfm         0.399746
no_titles_read    18.211705
no_match           0.565903
precision@k        0.056590
dtype: float64

### Modeling with Feature Set 2

In [38]:
manga_details = pd.read_csv("../data_cleaned/manga_details_cleaned.csv")
themes_list = [col for col in manga_details.columns if 'themes' in col]
features_list = themes_list
result_name = 'feature2_model_warp'

# A dict in dict to store the results
results[result_name] = {}

# To create mapping
lfm_dataset = Dataset(user_identity_features=False, item_identity_features=True)

# Fit the dataset
lfm_dataset.fit(data_train['member'].unique(), data_train['index'].unique(), item_features=features_list)

# To create train_interaction and train_weights
temp = [(data_train.loc[i, 'member'], data_train.loc[i, 'index'], data_train.loc[i, 'score']) for i in data_train.index]
train_interactions, train_weights = lfm_dataset.build_interactions(temp)

# To create test_interaction and test_weights
temp = [(data_test.loc[i, 'member'], data_test.loc[i, 'index'], data_test.loc[i, 'score']) for i in data_test.index]
test_interactions, test_weights = lfm_dataset.build_interactions(temp)

# Create item features
manga_details.set_index('index', inplace=True)
manga_details = manga_details.loc[data_train['index'].unique()]
temp = (manga_details.loc[:, features_list].to_dict(orient='index'))
item_feature_scores = zip(temp.keys(), temp.values())
item_features = lfm_dataset.build_item_features(item_feature_scores)

In [39]:
no_of_threads = 6

In [None]:
# Train model using warp as loss function
model = LightFM(learning_rate=0.05, loss='warp', random_state=state_no)
model.fit(train_interactions, item_features=item_features, sample_weight=train_weights, epochs=50, num_threads=no_of_threads, verbose=True)
pickle.dump(model, open('../data_production/model2.pkl', 'wb'))

Epoch:  42%|███████████████████████████████▌                                           | 21/50 [18:07<22:54, 47.39s/it]

In [None]:
# Calculate the precision of the model using the train dataset
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_precision
results[result_name]['train_precision@k'] = train_precision.mean()

# Calculate the precision of the model using the test dataset
test_precision = precision_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_precision@k'] = test_precision.mean()

# Calculate the recall of the model using the train dataset
train_recall = recall_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_recall
results[result_name]['train_recall@k'] = train_recall.mean()

# Calculate the precision of the model using the test dataset
test_recall = recall_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_recall@k'] = test_recall.mean()

# Calculate the auc score of the model using the train dataset
train_auc = auc_score(model, train_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the train_auc
results[result_name]['train_auc'] = train_auc.mean()

# Calculate the auc score of the model using the test dataset
test_auc = auc_score(model, test_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the test_auc
results[result_name]['test_auc'] = test_auc.mean()

In [None]:
results

In [None]:
# Evaluate model using train dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_train['member'].unique()[0:10000]
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_train['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    scoring_df = scoring_df.sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['train accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
model2_train_lfm_result = lfm_result
model2_train_lfm_result.mean()

In [None]:
# Apply grouping based on number of titles read
result = basic_train_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

In [None]:
# Evaluate model using test dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_test['member'].unique()
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    train_read_list = data_train[data_train['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_test['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    unread_list = [i for i in scoring_df.index if i not in train_read_list]
    scoring_df = scoring_df.loc[unread_list,:].sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['test accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
results

In [None]:
model2_test_lfm_result = lfm_result

In [None]:
# Apply grouping based on number of titles read
result = model2_test_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

In [None]:
model2_test_lfm_result.mean()

### Modeling Using Feature Set 3

In [None]:
manga_details = pd.read_csv("../data_cleaned/manga_details_cleaned.csv")
demographic_list = [col for col in manga_details.columns if 'demographic' in col]
features_list = demographic_list
result_name = 'feature3_model_warp'

# A dict in dict to store the results
results[result_name] = {}

# To create mapping
lfm_dataset = Dataset(user_identity_features=False, item_identity_features=True)

# Fit the dataset
lfm_dataset.fit(data_train['member'].unique(), data_train['index'].unique(), item_features=features_list)

# To create train_interaction and train_weights
temp = [(data_train.loc[i, 'member'], data_train.loc[i, 'index'], data_train.loc[i, 'score']) for i in data_train.index]
train_interactions, train_weights = lfm_dataset.build_interactions(temp)

# To create test_interaction and test_weights
temp = [(data_test.loc[i, 'member'], data_test.loc[i, 'index'], data_test.loc[i, 'score']) for i in data_test.index]
test_interactions, test_weights = lfm_dataset.build_interactions(temp)

# Create item features
manga_details.set_index('index', inplace=True)
manga_details = manga_details.loc[data_train['index'].unique()]
temp = (manga_details.loc[:, features_list].to_dict(orient='index'))
item_feature_scores = zip(temp.keys(), temp.values())
item_features = lfm_dataset.build_item_features(item_feature_scores)

In [None]:
no_of_threads = 6

In [None]:
# Train model using warp as loss function
model = LightFM(learning_rate=0.05, loss='warp', random_state=state_no)
model.fit(train_interactions, item_features=item_features, sample_weight=train_weights, epochs=50, num_threads=no_of_threads, verbose=True)
pickle.dump(model, open('../data_production/model3.pkl', 'wb'))

In [None]:
# Calculate the precision of the model using the train dataset
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_precision
results[result_name]['train_precision@k'] = train_precision.mean()

# Calculate the precision of the model using the test dataset
test_precision = precision_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_precision@k'] = test_precision.mean()

# Calculate the recall of the model using the train dataset
train_recall = recall_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_recall
results[result_name]['train_recall@k'] = train_recall.mean()

# Calculate the precision of the model using the test dataset
test_recall = recall_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_recall@k'] = test_recall.mean()

# Calculate the auc score of the model using the train dataset
train_auc = auc_score(model, train_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the train_auc
results[result_name]['train_auc'] = train_auc.mean()

# Calculate the auc score of the model using the test dataset
test_auc = auc_score(model, test_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the test_auc
results[result_name]['test_auc'] = test_auc.mean()

In [None]:
results

In [None]:
# Evaluate model using train dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_train['member'].unique()[0:10000]
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_train['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    scoring_df = scoring_df.sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['train accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
model3_train_lfm_result = lfm_result
model3_train_lfm_result.mean()

In [None]:
# Apply grouping based on number of titles read
result = basic_train_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

In [None]:
# Evaluate model using test dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_test['member'].unique()
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    train_read_list = data_train[data_train['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_test['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    unread_list = [i for i in scoring_df.index if i not in train_read_list]
    scoring_df = scoring_df.loc[unread_list,:].sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['test accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
results

In [None]:
model3_test_lfm_result = lfm_result

In [None]:
model3_test_lfm_result.mean()

### Modeling Using Feature Set 4

In [None]:
manga_details = pd.read_csv("../data_cleaned/manga_details_cleaned.csv")
features_list = genres_list + themes_list + demographic_list
result_name = 'feature4_model_warp'

# A dict in dict to store the results
results[result_name] = {}

# To create mapping
lfm_dataset = Dataset(user_identity_features=False, item_identity_features=True)

# Fit the dataset
lfm_dataset.fit(data_train['member'].unique(), data_train['index'].unique(), item_features=features_list)

# To create train_interaction and train_weights
temp = [(data_train.loc[i, 'member'], data_train.loc[i, 'index'], data_train.loc[i, 'score']) for i in data_train.index]
train_interactions, train_weights = lfm_dataset.build_interactions(temp)

# To create test_interaction and test_weights
temp = [(data_test.loc[i, 'member'], data_test.loc[i, 'index'], data_test.loc[i, 'score']) for i in data_test.index]
test_interactions, test_weights = lfm_dataset.build_interactions(temp)

# Create item features
manga_details.set_index('index', inplace=True)
manga_details = manga_details.loc[data_train['index'].unique()]
temp = (manga_details.loc[:, features_list].to_dict(orient='index'))
item_feature_scores = zip(temp.keys(), temp.values())
item_features = lfm_dataset.build_item_features(item_feature_scores)

In [None]:
no_of_threads = 6

In [None]:
# Train model using warp as loss function
model = LightFM(learning_rate=0.05, loss='warp', random_state=state_no)
model.fit(train_interactions, item_features=item_features, sample_weight=train_weights, epochs=50, num_threads=no_of_threads, verbose=True)
pickle.dump(model, open('../data_production/model4.pkl', 'wb'))

In [None]:
# Calculate the precision of the model using the train dataset
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_precision
results[result_name]['train_precision@k'] = train_precision.mean()

# Calculate the precision of the model using the test dataset
test_precision = precision_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_precision@k'] = test_precision.mean()

# Calculate the recall of the model using the train dataset
train_recall = recall_at_k(model, train_interactions, item_features=item_features, k=10, num_threads=no_of_threads)
# Calculate the overall mean of the train_recall
results[result_name]['train_recall@k'] = train_recall.mean()

# Calculate the precision of the model using the test dataset
test_recall = recall_at_k(model, test_interactions, train_interactions=train_interactions, item_features=item_features, k=10, check_intersections=False, num_threads=no_of_threads)
# Calculate the overall mean of the test_precision
results[result_name]['test_recall@k'] = test_recall.mean()

# Calculate the auc score of the model using the train dataset
train_auc = auc_score(model, train_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the train_auc
results[result_name]['train_auc'] = train_auc.mean()

# Calculate the auc score of the model using the test dataset
test_auc = auc_score(model, test_interactions, item_features=item_features, num_threads=no_of_threads)
# Calculate the overall mean of the test_auc
results[result_name]['test_auc'] = test_auc.mean()

In [None]:
results

In [None]:
# Evaluate model using train dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_train['member'].unique()[0:10000]
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_train['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    scoring_df = scoring_df.sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['train accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
model4_train_lfm_result = lfm_result
model4_train_lfm_result.mean()

In [None]:
# Apply grouping based on number of titles read
result = basic_train_lfm_result
result['groupings'] = result['no_titles_read'].apply(lambda x: 'heavy_readers' if x > 50 else 'moderate_reader' if x > 30 else 'light_reader')

# Calculate the mean Accuracy of Each Groups
result.groupby('groupings')['result_lfm'].agg(['mean', 'count'])

In [None]:
# Evaluate model using test dataset
evaluation_result = {}
count = 0
top_k = 10

member_list = data_test['member'].unique()
for user in member_list:
    read_list = data_test[data_test['member']==user]['index'].values
    train_read_list = data_train[data_train['member']==user]['index'].values
    
    user = lfm_dataset.mapping()[0][user]
    scoring_df = pd.DataFrame(model.predict(user_ids=user, item_ids=np.arange(data_test['index'].nunique())))
    scoring_df = scoring_df.merge(pd.DataFrame(lfm_dataset.mapping()[2].keys()), left_index=True, right_index=True)
    scoring_df.columns = ['score', 'title']
    scoring_df.set_index('title', inplace=True)
    scoring_df.columns = ['score']
    unread_list = [i for i in scoring_df.index if i not in train_read_list]
    scoring_df = scoring_df.loc[unread_list,:].sort_values('score', ascending=False)
    recommendation_list = scoring_df.index[0:top_k]
    evaluation_result[user] = recommendation_evaluation(recommendation_list, set(read_list))
    count+=1
    print("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", end='\r')
    print(f"{count/member_list.shape[0]*100}%", end='\r')
    
# convert evaluation_result into dataframe and calculate precision@k
lfm_result=pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['result_lfm', 'no_titles_read', 'no_match'])
lfm_result['precision@k']=lfm_result['no_match']/top_k

# Average accuracy of basic similarity matrix
results[result_name]['test accuracy'] = lfm_result['result_lfm'].sum()/lfm_result.shape[0]

In [None]:
results

In [None]:
model4_test_lfm_result = lfm_result

In [None]:
model4_test_lfm_result.mean()

In [None]:
# Comparing the results of the models, all 5 models did not fare as well as the basic similarity matrix and the item based collaborative filtering.
# Accuracy for BSM on test data set: 0.344784
# Accuracy for item based CF similarity matrix on test data set: 0.676590
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df

### Conclusion for LightFM

When using LightFM, 5 different models were tried with various features. However, despite trying various features, the accuracy of the basic model, without any feature, performed the best out of the 5 models. This could be due to the number of embedding is too small to learn the features hence resulted in poorer accuracy. However due to time contraint to present the benefits of a recommender system to the client,  as well as the best performing model did not outperform the bsm-cfsm hybrid model, this model is not used due to the time needed to train and optimize the model.