In [304]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
from sklearn.metrics import roc_auc_score, f1_score, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from catboost import CatBoostClassifier
from pandas_profiling import ProfileReport

%matplotlib inline

### Data loading

In [305]:
train = pd.read_csv('train.csv', low_memory=False)
test = pd.read_csv('test.csv', low_memory=False)
submission = pd.read_csv('sample_submission.csv', low_memory=False)

In [306]:
train_data, test_data = train_test_split(train, test_size=0.2, shuffle=True, random_state=42)

In [307]:
# train_data = train
# test_data = test

### Simple features-free modeling with LightFM

In [308]:
ratings_coo = sparse.coo_matrix((train_data.rating.astype(int), 
                                 (train_data.userid, train_data.itemid)))

In [309]:
# lr_list = [0.11, 0.12, 0.13, 0.15, 0.17, 0.2, 0.25, 0.3]
# comp_list = [10, 15, 20, 25, 30, 50, 100, 150, 300]

lr_list = [0.17]
comp_list = [105]

score_list = []
params = []

In [310]:
for lr in tqdm(lr_list):
    for comp in comp_list:
        for ep in [11]:
            model = LightFM(learning_rate=lr, loss='logistic', no_components=comp, 
                            learning_schedule='adagrad')
            model = model.fit(ratings_coo, epochs=ep, num_threads=16)
            preds = model.predict(test_data.userid.values, test_data.itemid.values)
            preds_norm = ((preds-preds.min()) / (preds-preds.min()).max())
            roc_auc = roc_auc_score(test_data.rating.values, preds_norm)
            params.append([lr, comp])
            score_list.append(roc_auc)
            print(f'epochs = {ep}')
            print(f'lr = {lr}, comp = {comp}')
            print(f'roc_auc_score = {roc_auc}')
            print('######################################################')
        
max_idx = np.argmax(np.array(score_list))
print()
print(f'epoch = {ep}')
print(score_list[max_idx])
print(params[max_idx])

100%|██████████| 1/1 [00:24<00:00, 24.18s/it]

epochs = 11
lr = 0.17, comp = 105
roc_auc_score = 0.7586849820085277
######################################################

epoch = 11
0.7586849820085277
[0.17, 105]





### Data analysis and cleaning

In [312]:
with open('meta_Grocery_and_Gourmet_Food.json', 'rb') as file:
    meta_list = []
    for line in file.readlines():
        meta_list.append(json.loads(line))

In [320]:
meta = pd.DataFrame(meta_list)

In [321]:
train_full = pd.merge(train, meta, on='asin')

In [322]:
columns_to_drop = [
    'reviewTime',
    'reviewerName',
    'style',
    'image_x',
    'image_y',
    'date',
    'feature',
    'details',
    'tech1',
    'fit', 
    'similar_item', 
    'vote']

In [323]:
def clean_data(input_data):
    data = input_data.drop(columns_to_drop, axis=1)
    data.summary.fillna('Nothing', inplace=True)
    data.brand.fillna('No brand', inplace=True)
    data.also_view.fillna(0.0, inplace = True)
    data.also_buy.fillna(0.0, inplace = True)
    data.description.fillna('No description', inplace=True)
    data['rank'].fillna('0,0', inplace=True)
    data['reviewText'].fillna('No review', inplace=True)
    data['main_cat'].fillna('No cat', inplace=True)
    
    cat_df = pd.DataFrame(data.category.values.tolist(), 
                          index=data.index, dtype=str).add_prefix('category_')
    cat_df.fillna('No category', inplace=True)
    data = pd.concat([data, cat_df.iloc[:, :4]], axis=1)
    
    data['description'] = data.description.map(lambda x: ' '.join(x))
    data['price'] = data.price.str.findall(r'\d+\.\d+').apply(lambda x: 
                                                              x if type(x)!=list else x[0]).astype('float64')
    data.price.fillna(data.groupby('itemid').price.count())
    data['also_view'] = data.also_view.apply(lambda x: 0 if type(x)!=list else len(x))
    data['also_buy'] = data.also_buy.apply(lambda x: 0 if type(x)!=list else len(x))
    data['verified'] = data.verified.map({True:1, False:0})
    data['rank'] = data['rank'].str.findall('\d+,+\d+').map(lambda x: float(x[0].replace(',', '.')) 
                                                            if ((type(x)==list)
                                                            and (len(x)!=0)) else 0.0)
    data.drop('category', axis=1, inplace=True)
    data['price'].fillna(0.0, inplace=True)
    
    return data

In [324]:
# profile = ProfileReport(meta, title='Pandas Profiling Report', explorative=True)

In [428]:
sample = train_full.sample(frac=1)

In [326]:
# sns.heatmap(sample.isna(), cmap='plasma')

### Data splitting

In [429]:
data_sample = clean_data(sample)

In [430]:
data_sample.columns

Index(['overall', 'verified', 'asin', 'reviewText', 'summary',
       'unixReviewTime', 'userid', 'itemid', 'rating', 'description', 'title',
       'brand', 'rank', 'also_view', 'main_cat', 'price', 'also_buy',
       'category_0', 'category_1', 'category_2', 'category_3'],
      dtype='object')

In [431]:
num_features = ['unixReviewTime', 'price', 'rank', 'also_view', 'also_buy',]
cat_features = ['verified', 'asin', 'userid', 'itemid',]# 'brand', 'main_cat', 
#                'category_0', 'category_1', 'category_2', 'category_3']
text_features = ['reviewText', 'summary', 'title',]# 'description']

target_features = ['overall', 'rating']

In [432]:
X, y = data_sample.drop(target_features, axis=1), data_sample.rating

In [433]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, shuffle=True, random_state=42)

### Custom "mean" encoding

In [434]:
def make_rating_mean_count(df_mean_train, df_mean_test, col, df_train, df_test, group_cols=None):
    
    # функция добавляет в df_mean_train и df_mean_test столбцы средней цены/количества объявлений
    # для каждого категориального признака col. group_cols - лист с дополнительными признаками,
    # которые позволяют группировать рейтинг/количества по связке "признаки group_cols + признак col"
    
    join_cols = []
    
    if (group_cols!=None) and (col not in group_cols):
        join_cols.append(col) 
        join_cols += group_cols
    else:
        join_cols.append(col)
    df_mean_train[col+'_rating_mean'] = df_train[join_cols].merge(df_train.groupby(by=join_cols, as_index=False).
                                                                 rating.median(), on=join_cols, how='left').iloc[:,-1]
    df_mean_test[col+'_rating_mean'] = df_test[join_cols].merge(df_train.groupby(by=join_cols, as_index=False).
                                                               rating.median(), on=join_cols, how='left').iloc[:,-1]
    df_mean_train[col+'_count'] = df_train[join_cols].merge(df_train.groupby(by=join_cols, as_index=False).
                                                            rating.count(), on=join_cols, how='left').iloc[:,-1]
    df_mean_test[col+'_count'] = df_test[join_cols].merge(df_train.groupby(by=join_cols, as_index=False).
                                                          rating.count(), on=join_cols, how='left').iloc[:,-1]   
    
    if col == 'model': 
        df_mean_test[col+'_rating_mean'] = df_mean_test[col+'_rating_mean'].\
        fillna(df_mean_test['brand_rating_mean']) 
        
        df_mean_test[col+'_count'] = df_mean_test[col+'_count'].\
        fillna(df_mean_test['brand_count'])
    
    if col == 'model_name':
        df_mean_test[col+'_rating_mean'] = df_mean_test[col+'_rating_mean'].\
        fillna(df_mean_test['model_rating_mean']).\
        fillna(df_mean_test['brand_rating_mean'])    
        
        df_mean_test[col+'_count'] = df_mean_test[col+'_count'].\
        fillna(df_mean_test['model_count']).\
        fillna(df_mean_test['brand_count'])  
    
    if col == 'modification':
        df_mean_test[col+'_rating_mean'] = df_mean_test[col+'_rating_mean'].\
        fillna(df_mean_test['model_name_rating_mean']).fillna(df_mean_test['model_rating_mean']).\
        fillna(df_mean_test['brand_rating_mean'])    
        
        df_mean_test[col+'_count'] = df_mean_test[col+'_count'].\
        fillna(df_mean_test['model_name_count']).fillna(df_mean_test['model_count']).\
        fillna(df_mean_test['brand_count'])
        
        
    other_filler_rating_mean = df_mean_test[f'{col}_rating_mean'].median()
    df_mean_test[f'{col}_rating_mean'] = df_mean_test[f'{col}_rating_mean'].fillna(other_filler_rating_mean)

    other_filler_count = df_mean_test[f'{col}_count'].median()
    df_mean_test[f'{col}_count'] = df_mean_test[f'{col}_count'].fillna(other_filler_count)



def make_num_feat_mean(df_mean_train, df_mean_test, col, num_col, df_train, df_test, group_cols=None):
    
    # аналогичная функция, но кодирует категориальные признаки по средним значениям числовых колонок (num_features)
    
    if col in ['itemid', 'userid']:
        join_cols = []

        if (group_cols!=None) and (col not in group_cols):
            join_cols.append(col) 
            join_cols += group_cols
        else:
            join_cols.append(col)

        df_mean_train[f'{col}_{num_col}_mean'] = df_train[join_cols].merge(df_train.groupby(by=join_cols, as_index=False)\
                                                               [num_col].mean(), on=join_cols, how='left').iloc[:,-1]

        df_mean_test[f'{col}_{num_col}_mean'] = df_test[join_cols].merge(df_train.groupby(by=join_cols, as_index=False)\
                                                               [num_col].mean(), on=join_cols, how='left').iloc[:,-1]

        if col == 'model': 
            df_mean_test[f'{col}_{num_col}_mean'] = df_mean_test[f'{col}_{num_col}_mean'].\
            fillna(df_mean_test[f'brand_{num_col}_mean']) 

        if col == 'model_name':
            df_mean_test[f'{col}_{num_col}_mean'] = df_mean_test[f'{col}_{num_col}_mean'].\
            fillna(df_mean_test[f'model_{num_col}_mean']).\
            fillna(df_mean_test[f'brand_{num_col}_mean'])    

        if col == 'modification':
            df_mean_test[f'{col}_{num_col}_mean'] = df_mean_test[f'{col}_{num_col}_mean'].\
            fillna(df_mean_test[f'model_name_{num_col}_mean']).fillna(df_mean_test[f'model_{num_col}_mean']).\
            fillna(df_mean_test[f'brand_{num_col}_mean'])    

        other_filler = df_mean_test[f'{col}_{num_col}_mean'].median()
        df_mean_test[f'{col}_{num_col}_mean'] = df_mean_test[f'{col}_{num_col}_mean'].fillna(other_filler)



In [435]:
# получаем индексы train и test выборок
train_idx = y_train.index.values
test_idx = y_test.index.values

# добавляем таргет и получаем 2 выборки "признак=таргет"
df_train = data_sample.loc[train_idx]
df_test = data_sample.loc[test_idx]

# пустые фреймы для mean/count encoding features
df_mean_train = pd.DataFrame(index=train_idx)
df_mean_test = pd.DataFrame(index=test_idx)

# создаем кодированные категориальные признаки
for col in cat_features:
    make_rating_mean_count(df_mean_train, df_mean_test, col, df_train, df_test, group_cols=None)
    for num_col in num_features:
        make_num_feat_mean(df_mean_train, df_mean_test, col, num_col, df_train, df_test, group_cols=None)
        
mean_columns = df_mean_test.columns

X_train_cat = np.array(df_mean_train)
X_test_cat = np.array(df_mean_test)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

### Vectorizing text features

In [436]:
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')

In [437]:
# i = 0
# for col in text_features:
#     if i==0:
#         X_train_text = vectorizer.fit_transform(X_train[col])
#         X_test_text = vectorizer.transform(X_test[col])
#         text_fi_cols = vectorizer.get_feature_names()
#         continue
#     X_train_text = sparse.hstack([X_train_text, vectorizer.fit_transform(X_train[col])])
#     X_test_text = sparse.hstack([X_test_text, vectorizer.transform(X_test[col])])
#     text_fi_cols += vectorizer.get_feature_names()

In [438]:
# X_train_final = np.hstack([X_train[num_features], X_train_cat,])# X_train_text])
X_train_final = np.hstack([X_train_cat,])# X_train_text])

In [439]:
# X_test_final = np.hstack([X_test[num_features], X_test_cat, ])#X_test_text])
X_test_final = np.hstack([X_test_cat, ])#X_test_text])

## Моделирование

In [440]:
estimator = CatBoostClassifier(depth=7, iterations=1000, learning_rate=0.1, verbose=True, random_state=42)

In [441]:
estimator.fit(X_train_final, y_train)

0:	learn: 0.6363537	total: 78.3ms	remaining: 1m 18s
1:	learn: 0.5915996	total: 177ms	remaining: 1m 28s
2:	learn: 0.5563786	total: 266ms	remaining: 1m 28s
3:	learn: 0.5286615	total: 369ms	remaining: 1m 31s
4:	learn: 0.5068532	total: 466ms	remaining: 1m 32s
5:	learn: 0.4896524	total: 581ms	remaining: 1m 36s
6:	learn: 0.4760845	total: 698ms	remaining: 1m 39s
7:	learn: 0.4653628	total: 821ms	remaining: 1m 41s
8:	learn: 0.4568625	total: 930ms	remaining: 1m 42s
9:	learn: 0.4501376	total: 1s	remaining: 1m 39s
10:	learn: 0.4447852	total: 1.11s	remaining: 1m 39s
11:	learn: 0.4405362	total: 1.18s	remaining: 1m 37s
12:	learn: 0.4371475	total: 1.27s	remaining: 1m 36s
13:	learn: 0.4344367	total: 1.37s	remaining: 1m 36s
14:	learn: 0.4322730	total: 1.45s	remaining: 1m 35s
15:	learn: 0.4305465	total: 1.49s	remaining: 1m 31s
16:	learn: 0.4291465	total: 1.57s	remaining: 1m 30s
17:	learn: 0.4280297	total: 1.68s	remaining: 1m 31s
18:	learn: 0.4271296	total: 1.77s	remaining: 1m 31s
19:	learn: 0.4264042	tot

160:	learn: 0.4212159	total: 14.2s	remaining: 1m 13s
161:	learn: 0.4211932	total: 14.3s	remaining: 1m 13s
162:	learn: 0.4211640	total: 14.3s	remaining: 1m 13s
163:	learn: 0.4211393	total: 14.4s	remaining: 1m 13s
164:	learn: 0.4211170	total: 14.5s	remaining: 1m 13s
165:	learn: 0.4210907	total: 14.5s	remaining: 1m 13s
166:	learn: 0.4210690	total: 14.6s	remaining: 1m 12s
167:	learn: 0.4210515	total: 14.7s	remaining: 1m 12s
168:	learn: 0.4210246	total: 14.8s	remaining: 1m 12s
169:	learn: 0.4210112	total: 14.8s	remaining: 1m 12s
170:	learn: 0.4209914	total: 14.9s	remaining: 1m 12s
171:	learn: 0.4209724	total: 15s	remaining: 1m 12s
172:	learn: 0.4209552	total: 15.1s	remaining: 1m 12s
173:	learn: 0.4209349	total: 15.2s	remaining: 1m 12s
174:	learn: 0.4209143	total: 15.3s	remaining: 1m 12s
175:	learn: 0.4208901	total: 15.4s	remaining: 1m 12s
176:	learn: 0.4208697	total: 15.5s	remaining: 1m 12s
177:	learn: 0.4208477	total: 15.6s	remaining: 1m 11s
178:	learn: 0.4208298	total: 15.7s	remaining: 1m

319:	learn: 0.4181993	total: 27s	remaining: 57.3s
320:	learn: 0.4181843	total: 27.1s	remaining: 57.2s
321:	learn: 0.4181623	total: 27.2s	remaining: 57.2s
322:	learn: 0.4181424	total: 27.2s	remaining: 57.1s
323:	learn: 0.4181247	total: 27.3s	remaining: 57s
324:	learn: 0.4180983	total: 27.4s	remaining: 56.9s
325:	learn: 0.4180827	total: 27.5s	remaining: 56.8s
326:	learn: 0.4180647	total: 27.5s	remaining: 56.7s
327:	learn: 0.4180489	total: 27.6s	remaining: 56.6s
328:	learn: 0.4180239	total: 27.7s	remaining: 56.5s
329:	learn: 0.4180060	total: 27.8s	remaining: 56.4s
330:	learn: 0.4179900	total: 27.8s	remaining: 56.3s
331:	learn: 0.4179703	total: 27.9s	remaining: 56.1s
332:	learn: 0.4179554	total: 28s	remaining: 56s
333:	learn: 0.4179397	total: 28.1s	remaining: 56s
334:	learn: 0.4179152	total: 28.1s	remaining: 55.8s
335:	learn: 0.4178946	total: 28.2s	remaining: 55.8s
336:	learn: 0.4178748	total: 28.3s	remaining: 55.7s
337:	learn: 0.4178617	total: 28.4s	remaining: 55.6s
338:	learn: 0.4178429	

479:	learn: 0.4154245	total: 42.4s	remaining: 45.9s
480:	learn: 0.4154056	total: 42.4s	remaining: 45.8s
481:	learn: 0.4153904	total: 42.5s	remaining: 45.7s
482:	learn: 0.4153701	total: 42.6s	remaining: 45.6s
483:	learn: 0.4153505	total: 42.7s	remaining: 45.5s
484:	learn: 0.4153356	total: 42.8s	remaining: 45.4s
485:	learn: 0.4153125	total: 42.8s	remaining: 45.3s
486:	learn: 0.4152973	total: 42.9s	remaining: 45.2s
487:	learn: 0.4152829	total: 43s	remaining: 45.1s
488:	learn: 0.4152645	total: 43.1s	remaining: 45.1s
489:	learn: 0.4152482	total: 43.2s	remaining: 45s
490:	learn: 0.4152338	total: 43.3s	remaining: 44.9s
491:	learn: 0.4152208	total: 43.4s	remaining: 44.8s
492:	learn: 0.4152032	total: 43.5s	remaining: 44.7s
493:	learn: 0.4151824	total: 43.6s	remaining: 44.6s
494:	learn: 0.4151719	total: 43.7s	remaining: 44.6s
495:	learn: 0.4151542	total: 43.8s	remaining: 44.5s
496:	learn: 0.4151389	total: 44s	remaining: 44.5s
497:	learn: 0.4151192	total: 44.1s	remaining: 44.4s
498:	learn: 0.4151

639:	learn: 0.4128354	total: 57.4s	remaining: 32.3s
640:	learn: 0.4128236	total: 57.4s	remaining: 32.2s
641:	learn: 0.4128110	total: 57.5s	remaining: 32.1s
642:	learn: 0.4127900	total: 57.6s	remaining: 32s
643:	learn: 0.4127712	total: 57.7s	remaining: 31.9s
644:	learn: 0.4127594	total: 57.7s	remaining: 31.8s
645:	learn: 0.4127440	total: 57.8s	remaining: 31.7s
646:	learn: 0.4127320	total: 57.9s	remaining: 31.6s
647:	learn: 0.4127190	total: 58s	remaining: 31.5s
648:	learn: 0.4127051	total: 58.1s	remaining: 31.4s
649:	learn: 0.4126909	total: 58.1s	remaining: 31.3s
650:	learn: 0.4126749	total: 58.2s	remaining: 31.2s
651:	learn: 0.4126566	total: 58.3s	remaining: 31.1s
652:	learn: 0.4126438	total: 58.4s	remaining: 31s
653:	learn: 0.4126290	total: 58.4s	remaining: 30.9s
654:	learn: 0.4126208	total: 58.5s	remaining: 30.8s
655:	learn: 0.4126076	total: 58.6s	remaining: 30.7s
656:	learn: 0.4125894	total: 58.7s	remaining: 30.6s
657:	learn: 0.4125646	total: 58.8s	remaining: 30.5s
658:	learn: 0.4125

798:	learn: 0.4103850	total: 1m 10s	remaining: 17.7s
799:	learn: 0.4103743	total: 1m 10s	remaining: 17.6s
800:	learn: 0.4103607	total: 1m 10s	remaining: 17.5s
801:	learn: 0.4103473	total: 1m 10s	remaining: 17.4s
802:	learn: 0.4103343	total: 1m 10s	remaining: 17.3s
803:	learn: 0.4103245	total: 1m 10s	remaining: 17.2s
804:	learn: 0.4103107	total: 1m 10s	remaining: 17.1s
805:	learn: 0.4102910	total: 1m 10s	remaining: 17s
806:	learn: 0.4102744	total: 1m 10s	remaining: 16.9s
807:	learn: 0.4102619	total: 1m 10s	remaining: 16.8s
808:	learn: 0.4102466	total: 1m 10s	remaining: 16.8s
809:	learn: 0.4102294	total: 1m 11s	remaining: 16.7s
810:	learn: 0.4102189	total: 1m 11s	remaining: 16.6s
811:	learn: 0.4102056	total: 1m 11s	remaining: 16.5s
812:	learn: 0.4101951	total: 1m 11s	remaining: 16.4s
813:	learn: 0.4101746	total: 1m 11s	remaining: 16.3s
814:	learn: 0.4101582	total: 1m 11s	remaining: 16.2s
815:	learn: 0.4101427	total: 1m 11s	remaining: 16.1s
816:	learn: 0.4101273	total: 1m 11s	remaining: 1

954:	learn: 0.4081422	total: 1m 22s	remaining: 3.9s
955:	learn: 0.4081288	total: 1m 22s	remaining: 3.81s
956:	learn: 0.4081115	total: 1m 22s	remaining: 3.73s
957:	learn: 0.4080963	total: 1m 23s	remaining: 3.64s
958:	learn: 0.4080840	total: 1m 23s	remaining: 3.55s
959:	learn: 0.4080720	total: 1m 23s	remaining: 3.46s
960:	learn: 0.4080649	total: 1m 23s	remaining: 3.38s
961:	learn: 0.4080511	total: 1m 23s	remaining: 3.29s
962:	learn: 0.4080300	total: 1m 23s	remaining: 3.2s
963:	learn: 0.4080210	total: 1m 23s	remaining: 3.12s
964:	learn: 0.4080034	total: 1m 23s	remaining: 3.03s
965:	learn: 0.4079916	total: 1m 23s	remaining: 2.94s
966:	learn: 0.4079824	total: 1m 23s	remaining: 2.86s
967:	learn: 0.4079648	total: 1m 23s	remaining: 2.77s
968:	learn: 0.4079473	total: 1m 23s	remaining: 2.69s
969:	learn: 0.4079377	total: 1m 24s	remaining: 2.6s
970:	learn: 0.4079242	total: 1m 24s	remaining: 2.51s
971:	learn: 0.4079087	total: 1m 24s	remaining: 2.42s
972:	learn: 0.4078919	total: 1m 24s	remaining: 2.

<catboost.core.CatBoostClassifier at 0x7fecb8121190>

In [442]:
y_pred = estimator.predict_proba(X_test_final)

In [443]:
y_pred_1 = estimator.predict(X_test_final)

In [444]:
ros = roc_auc_score(y_test.values, y_pred[:,1])

In [445]:
ros

0.5008903175154342

In [446]:
f1_score(y_test.values, y_pred_1)

0.9185800237704547

In [238]:
# fi_cols = num_features + mean_columns.tolist() + text_fi_cols

In [239]:
# fi_df = pd.DataFrame(zip(estimator.feature_importances_, fi_cols), columns=['importance', 'name'])

In [240]:
# fi_df.sort_values(by='importance', ascending=False).head(50)

Unnamed: 0,importance,name
9,31.176258,userid_rating_mean
10,24.882241,userid_count
8,6.634112,asin_count
17,3.3407,itemid_count
11,3.28246,userid_unixReviewTime_mean
0,3.26382,unixReviewTime
18,3.04655,itemid_unixReviewTime_mean
15,2.552658,userid_also_buy_mean
12,2.546164,userid_price_mean
16,2.477175,itemid_rating_mean


In [88]:
# rebuilt_data['sqerror'] = (rebuilt_data.rating - rebuilt_data.pred)**2

In [103]:
# cols = num_features + cat_features + text_features

In [119]:
# rebuilt_data.columns

Index(['verified', 'asin', 'reviewText', 'summary', 'unixReviewTime', 'userid',
       'itemid', 'description', 'title', 'brand', 'rank', 'also_view',
       'main_cat', 'price', 'also_buy', 'category_0', 'category_1',
       'category_2', 'category_3', 'rating', 'pred', 'sqerror'],
      dtype='object')

In [170]:
# rebuilt_data.groupby(['main_cat']).sqerror.agg(['mean', 'count']).\
# sort_values(by=['mean'],ascending=False).head(50)

Unnamed: 0_level_0,mean,count
main_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
Pet Supplies,0.5,2
Tools & Home Improvement,0.5,2
Toys & Games,0.357143,14
Baby,0.333333,3
No cat,0.266667,30
Sports & Outdoors,0.222222,36
Grocery,0.171473,24931
Amazon Home,0.171171,222
Health & Personal Care,0.16632,962
Industrial & Scientific,0.142857,21


In [36]:
# f1_score(y_test.values, y_pred_1)

In [37]:
# fpr, tpr, thresholds = roc_curve(y_test.values, y_pred[:,1])

In [38]:
# sns.scatterplot(x=fpr, y=tpr)

### Prepare LightFM с item/user features

In [382]:
# features_user = X_train[['verified', 'userid', 'also_view', 'main_cat', 'also_buy']]

In [383]:
# features_item = X_train[['itemid', 'brand', 'rank', 
#                              'main_cat', 'price', 'category_0', 
#                              'category_1', 'category_2', 'category_3']]

In [404]:
# df = pd.concat([X_train[['userid', 'itemid',]], y_train], axis=1)
# df_test = pd.concat([X_test[['userid', 'itemid',]], y_test], axis=1)

In [447]:
features_user = pd.concat([df_mean_train[['verified_rating_mean', 'verified_count',
                                         'userid_rating_mean', 'userid_count', 
                                         'userid_unixReviewTime_mean', 'userid_price_mean', 
                                         'userid_rank_mean', 
                                         'userid_also_view_mean', 'userid_also_buy_mean']], 
                           X_train['userid']], axis=1)

features_item = pd.concat([df_mean_train[['asin_rating_mean', 'asin_count', 'itemid_rating_mean','itemid_count', 
                                          'itemid_unixReviewTime_mean', 'itemid_price_mean', 
                                          'itemid_rank_mean', 'itemid_also_view_mean',
                                          'itemid_also_buy_mean']], 
                           X_train['itemid']], axis=1)

df = pd.concat([X_train[['userid', 'itemid']], y_train], axis=1)
df_test = pd.concat([X_test[['userid', 'itemid']], y_test], axis=1)

In [448]:
item_f = []
col = []
unique_f1 = []
for column in features_item.drop(['itemid'], axis=1):
    col += [column]*len(features_item[column].unique())
    unique_f1 += list(features_item[column].unique())
for x,y in zip(col, unique_f1):
    res = str(x)+ ":" +str(y)
    item_f.append(res)
    
user_f = []
col = []
unique_f1 = []
for column in features_user.drop(['userid'], axis=1):
    col += [column]*len(features_user[column].unique())
    unique_f1 += list(features_user[column].unique())
for x,y in zip(col, unique_f1):
    res = str(x)+ ":" +str(y)
    user_f.append(res)

In [449]:
from lightfm.data import Dataset
# we call fit to supply userid, item id and user/item features
dataset = Dataset()
dataset.fit(
        data_sample['userid'].unique(), # all the users
        data_sample['itemid'].unique(), # all the items
        user_features = user_f,
        item_features = item_f
)

In [450]:
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df.values ])

In [451]:
(interactions_test, weights_test) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df_test.values ])

### item_features

In [452]:
temp_list = []
for column in features_item.drop(['itemid'], axis=1):
    temp_list.append(column + ':')

In [453]:
def transform_to_list(row):
    """
    Takes as input a list and prepends the columns names to respective values in the list.
    For example: if my_list = [1,1,0,'del'],
    resultant output = ['f1:1', 'f2:1', 'f3:0', 'loc:del']

    """
    result = []
    temp_row = row
    for x,y in zip(temp_list, temp_row):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

In [454]:
subset = features_item.drop(['itemid'], axis=1)
ad_list = [x.tolist() for x in subset.values]
item_feature_list = []
for item in ad_list:
    item_feature_list.append(transform_to_list(item))

In [455]:
item_tuple = list(zip(features_item.itemid, item_feature_list))

In [456]:
item_features = dataset.build_item_features(item_tuple, normalize= False)

### user_features

In [457]:
temp_list = []
for column in features_user.drop(['userid'], axis=1):
    temp_list.append(column + ':')

In [458]:
subset = features_user.drop(['userid'], axis=1)
ad_list = [x.tolist() for x in subset.values]
user_feature_list = []
for user in ad_list:
    user_feature_list.append(transform_to_list(user))

In [459]:
user_tuple = list(zip(features_user.userid, user_feature_list))

In [460]:
user_features = dataset.build_user_features(user_tuple, normalize= False)

In [461]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

In [None]:
model = LightFM(learning_rate=lr, loss='logistic', no_components=comp, 
                            learning_schedule='adagrad')
model.fit(interactions, # spase matrix representing whether user u and item i interacted
    user_features = user_features,
    item_features = item_features, # we have built the sparse matrix above
    sample_weight = weights, # spase matrix representing how much value to give to user u and item i inetraction: i.e ratings
    epochs=10)

In [None]:
user_ids = df.userid.apply(lambda x: user_id_map[x])
item_ids = df.itemid.apply(lambda x: item_id_map[x])
preds = model.predict(user_ids.values, item_ids.values, 
                      user_features=user_features, item_features=item_features)

In [None]:
train_auc = auc_score(model=model,
#                       train_interactions=interactions,
                      test_interactions=interactions,
                      user_features = user_features,
                      item_features=item_features, 
                      check_intersections=True
                     ).mean()

In [None]:
train_auc

In [None]:
test_auc = auc_score(model=model,
#                       train_interactions=interactions,
                      test_interactions=interactions_test,
                      user_features = user_features,
                      item_features=item_features, 
                      check_intersections=True
                     ).mean()

In [None]:
test_auc

In [None]:
preds_norm = (preds-preds.min()) / (preds-preds.min()).max()

In [None]:
roc_auc_score(df.rating, preds_norm)