In [None]:
import sys
import os
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import choix
import networkx as nx
import time
from pandas.core.frame import DataFrame
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
np.set_printoptions(precision=3, suppress=True)
SEED=2022
def seed_all(seed):
    """Seed the random number sources used by this notebook.

    Seeds NumPy's global generator and Python's ``random`` module, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed applied to every source.
    """
    np.random.seed(seed)
    # Exported for child processes; the running interpreter has already
    # fixed its hash seed at start-up.
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    random.seed(seed)
seed_all(SEED)

# 1. Data preprocess

## 1.1 View data

In [None]:
# Load the three raw datasets from the working directory.
# battle: pairwise recipe comparisons made by users (read below via
#         recipe_1, recipe_2, chosen_position, anonymised_user_id).
# recipe: recipe catalogue with descriptive attributes.
# order:  per-user order records (read below via menu_recipe_uuid, ordered).
battle = pd.read_csv('battle.csv')
recipe = pd.read_excel('recipe.xlsx')
order = pd.read_csv('order.csv')

In [None]:
# (rows, columns) of each raw dataset, in load order
for frame in (battle, recipe, order):
    print(frame.shape)

## 1.2 Missing values

In [None]:
# Missing values: for each dataset, print one boolean per column telling
# whether that column contains at least one null. The print('\n') calls only
# add blank lines between the three reports.
print(battle.isnull().any())
print('\n')
print(recipe.isnull().any())
print('\n')
print(order.isnull().any())

In [None]:
battle.isnull().sum()

In [None]:
recipe.isnull().sum()

In [None]:
# Share of missing values per column. The mean of the boolean null mask equals
# nulls / row count, so no hard-coded catalogue size is needed.
recipe.isnull().mean()

In [None]:
# Drop every battle row that contains a missing value (in this dataset that is
# the rows without an anonymised_user_id), then renumber the rows from 0.
battle = battle.dropna(axis=0, how='any').reset_index(drop=True)

1. battle.csv: `anonymised_user_id` has missing values, so I drop those rows directly.
2. recipe.xlsx: `recipe_name` has no missing values, so I can use it to identify each recipe.
3. order.csv: There are no missing values.

In [None]:
# Shapes again, after dropping incomplete battle rows
for frame in (battle, recipe, order):
    print(frame.shape)

In [None]:
# Build one row per user: count() yields, per anonymised_user_id, the number
# of non-null entries in every other column.
user_info=battle.groupby('anonymised_user_id',as_index=False).count()
# Keep only the id and the per-user count taken from the num_of_battles column.
user_info=user_info[['anonymised_user_id','num_of_battles']]
# Add a dense integer 'user_id' (0..n_users-1). groupby sorts its keys by
# default, so ids follow ascending anonymised_user_id.
user_info['user_id']=np.arange(0,user_info.shape[0])
# Attach user_id to every battle row.
# NOTE(review): both frames carry 'num_of_battles', so the merge presumably
# produces suffixed num_of_battles_x / num_of_battles_y columns - confirm.
# NOTE(review): re-running this cell merges again and adds further suffixed
# columns; run it once per kernel session.
battle=pd.merge(battle, user_info, how = 'inner', on='anonymised_user_id')

In [None]:
# How many comparisons were made, by how many users, over how many recipes?
print('The total number of comparision:', battle.shape [0])
user = list(set(battle.anonymised_user_id.unique()))
print('The total number of users: ', len(user))
item1 = sorted(set(battle.recipe_1.unique()))
item2 = sorted(set(battle.recipe_2.unique()))
if item1 == item2:
    print('The total number of recipes: ', len(item1))
else:
    # Previously this case printed nothing at all. Report the union and make
    # the mismatch visible instead of silently skipping the count.
    all_items = sorted(set(item1) | set(item2))
    print('The total number of recipes: ', len(all_items))
    print('Note: recipe_1 and recipe_2 do not contain the same set of recipes '
          f'({len(item1)} vs {len(item2)} distinct names).')

## 1.3 Feature processing

### 1.Battle dataset

In [None]:
# Cast 'anonymised_user_id' from float to a 64-bit integer
battle['anonymised_user_id'] = battle['anonymised_user_id'].astype('int64')

### 2. Recipe dataset

In [None]:
# Positional item_id (0, 1, 2, ...) following the row order of `recipe`
recipe['item_id'] = np.arange(len(recipe))

In [None]:
# Bucket preparation_time_for_2 into six right-closed bins of width 20.
# Values outside (0, 120] fall in no bin and become missing.
prep_edges = [0, 20, 40, 60, 80, 100, 120]
prep_labels = [f'({lo}-{hi}]' for lo, hi in zip(prep_edges[:-1], prep_edges[1:])]
recipe['preparation_time_for_2'] = pd.cut(
    recipe['preparation_time_for_2'], bins=prep_edges, labels=prep_labels
)

In [None]:
def String_Split(string, separators):
    """Split ``string`` on each separator in turn and return the flat pieces.

    Args:
        string: Text to split.
        separators: Iterable of separator strings, applied one after another.

    Returns:
        List of substrings. Pieces are not stripped; with no separators the
        result is ``[string]``.
    """
    pieces = [string]
    for sep in separators:
        pieces = [part for piece in pieces for part in piece.split(sep)]
    return pieces


def _append_indicator_columns(df, source_column, labels, separators):
    """Return ``df`` plus one 0/1 column per label found in ``source_column``.

    Each cell of ``source_column`` is converted with ``str`` and tokenised
    with ``String_Split``; a label is flagged when it equals one of the tokens
    exactly. The indicator frame reuses ``df.index`` so the concat stays
    row-aligned even when ``df`` does not have a default RangeIndex.
    """
    token_sets = [set(String_Split(str(value), separators)) for value in df[source_column]]
    flags = {label: [int(label in tokens) for tokens in token_sets] for label in labels}
    indicator_df = pd.DataFrame(flags, columns=list(labels), index=df.index)
    return pd.concat([df, indicator_df], axis=1)


def dish_types_to_category(df, types=None):
    """Append one 0/1 column per dish type, parsed from ``df['dish_types']``.

    Args:
        df: Recipe frame with a ``dish_types`` column whose values are joined
            by ``|``, ``&`` or ``/``.
        types: Labels to flag. Defaults to the notebook-level ``dish_types``
            list, which must be defined before the call.

    Returns:
        A new frame: ``df`` followed by the indicator columns.
    """
    if types is None:
        types = dish_types
    return _append_indicator_columns(df, 'dish_types', types, ['|', '&', '/'])


def dish_to_category(df, categories=None):
    """Append one 0/1 column per dish category, parsed from ``df['dish_categories']``.

    Args:
        df: Recipe frame with a ``dish_categories`` column whose values are
            joined by ``|``, ``&`` or ``/``.
        categories: Labels to flag. Defaults to the notebook-level
            ``dish_categories`` list.

    Returns:
        A new frame: ``df`` followed by the indicator columns.
    """
    if categories is None:
        categories = dish_categories
    return _append_indicator_columns(df, 'dish_categories', categories, ['|', '&', '/'])


def health_to_category(df, types=None):
    """Append one 0/1 column per health label, parsed from ``df['health_attributes']``.

    Args:
        df: Recipe frame with a ``health_attributes`` column whose values are
            joined by ``|``.
        types: Labels to flag. Defaults to the notebook-level ``health_types``
            list.

    Returns:
        A new frame: ``df`` followed by the indicator columns.
    """
    if types is None:
        types = health_types
    return _append_indicator_columns(df, 'health_attributes', types, ['|'])

In [None]:
# Label vocabularies used to one-hot encode the multi-valued recipe columns
dish_types = [
    'BBQ', 'Burger', 'Sandwich', 'Hotdog', 'Tacos', 'Pasta', 'Roast', 'Wraps', 'Pie',
    'Soya', 'Curry', 'Burritos', 'Pizza',
    'Dal', 'Chowder', 'Gnocchi', 'Laksa',
    'Risotto', 'Stew', 'Stir Fry', 'Pilaf', 'Tray Bake',
]
dish_categories = [
    'Finger food', 'Oven', 'Salads', 'Stove top', 'bowl food', 'Protein', 'Veg', 'Soups',
]
health_types = [
    'Health Exception', 'Healthy', 'Low Cal', 'Low Carb', 'Not Healthy', 'Wholegrain',
]

# Append the indicator columns in the same order as before:
# dish types, then dish categories, then health attributes.
recipe = dish_types_to_category(recipe)
recipe = dish_to_category(recipe)
recipe = health_to_category(recipe)

In [None]:
# Choose the necessary features from the recipe dataset.
# The indicator columns are exactly the three label lists defined above, so
# they are reused here instead of repeating 36 names by hand.
# .copy() makes `rf` an independent frame: later cells assign columns to it,
# which on a plain slice of `recipe` is a chained assignment.
rf = recipe[
    ['recipe_name', 'item_id', 'recipe_uuid', 'preparation_time_for_2', 'cuisine_name',
     'main_carb_name', 'spice_level', 'protein_name', 'protein_category_name']
    + dish_types + dish_categories + health_types
].copy()


In [None]:
# Unify carb names: any main_carb_name containing one of these keywords is
# collapsed to the keyword itself. The keywords are applied in this order,
# matching the original sequence of rules.
carb = rf['main_carb_name']
for canonical in ('Basmati', 'Brown rice', 'Pizza Base'):
    carb = carb.mask(carb.astype(str).str.contains(canonical, regex=False), canonical)
rf['main_carb_name'] = carb

# Treat the literal string 'None' (and real None values) as missing in the
# carb and protein columns, so no 'None' category or dummy column is created.
for col in ('main_carb_name', 'protein_name', 'protein_category_name'):
    rf[col] = rf[col].mask(rf[col].astype(str) == 'None', np.nan)

In [None]:
# Store the single-valued descriptive columns as pandas categoricals
categorical_cols = ['cuisine_name', 'main_carb_name', 'spice_level',
                    'protein_name', 'protein_category_name']
rf[categorical_cols] = rf[categorical_cols].astype('category')

### 3. Order dataset

In [None]:
# Attach recipe information to every order row (matched on the recipe uuid),
# then keep only the columns needed for evaluation.
order_columns = ['anonymised_user_id', 'period_id', 'menu_recipe_id', 'menu_recipe_uuid',
                 'recipe_name', 'ordered', 'item_id']
order2 = pd.merge(
    order, recipe, how='left', left_on='menu_recipe_uuid', right_on='recipe_uuid'
)[order_columns]

In [None]:
# Attach the dense user_id (and battle count) to every order row.
# Orders from users who never played the battle game get a missing user_id.
order2 = pd.merge(order2, user_info, how='left', on='anonymised_user_id')

# 2. Model

## 2.1 BT model-Obtain users' preferences for 30 recipes entered

The Bradley–Terry (BT) model is fitted per user with `choix.opt_pairwise`, which computes the maximum-likelihood (ML) estimate of the model parameters from pairwise-comparison data, using the Newton-CG algorithm.

In [None]:
# Turn every battle into a tuple of recipe positions, grouped by user_id.
# When chosen_position == 1 the tuple is (recipe_1, recipe_2); otherwise the
# two positions are swapped.
# NOTE(review): this presumably yields (winner, loser) as choix expects -
# confirm that chosen_position == 1 means recipe_1 was picked.
t = list(recipe.recipe_name)

# Position of the FIRST occurrence of each recipe name (same answer as
# list.index), built once instead of scanning the list for every battle row.
recipe_pos = {}
for pos, name in enumerate(t):
    recipe_pos.setdefault(name, pos)

data = collections.defaultdict(list)
for first, second, chosen, uid in zip(
    battle.recipe_1, battle.recipe_2, battle.chosen_position, battle.user_id
):
    pos_1, pos_2 = recipe_pos[first], recipe_pos[second]
    data[uid].append((pos_1, pos_2) if chosen == 1 else (pos_2, pos_1))

In [None]:
# Draw the comparisons of user_id 2 as a directed graph: one edge per tuple
# stored in `data`, pointing from the tuple's first recipe to its second.
battle_graph = nx.DiGraph()
battle_graph.add_edges_from(data[2])
nx.draw(battle_graph, with_labels=True)
plt.savefig('comparison result: user=2.png')
plt.show()

In [None]:
def user_preference(data, alpha, n_items=30):
    """Fit one pairwise-comparison model per user with ``choix.opt_pairwise``.

    Args:
        data: Mapping ``user_id -> list of comparison tuples``, where every
            item index is smaller than ``n_items``.
        alpha: Regularisation strength passed to ``choix.opt_pairwise``.
        n_items: Number of items taking part in the comparisons. Defaults to
            30, the value that was previously hard-coded.

    Returns:
        List of parameter vectors, one per user, ordered by ascending user_id.
        Downstream cells use the list position as the user_id, so the keys are
        iterated in sorted order rather than in dict insertion order, which is
        the order users first appear in the battle log.
    """
    return [
        choix.opt_pairwise(n_items, data[user], alpha=alpha)
        for user in sorted(data)
    ]



In [None]:
# Fit the per-user preference vectors once per regularisation strength.
# UR[i] holds the result for alphas[i]; later cells read UR[3] (alpha = 0.1).
alphas = [1e-4, 0.001, 0.01, 0.1]
UR = []
for alpha in alphas:
    UR.append(user_preference(data, alpha))


In [None]:
UR[3][2]

## 2.2 User Preferences

### 2.2.1 Recipe profile: RF

In [None]:
# Create the recipe-feature matrix R_F (one row per recipe, 0/1 per feature).
# dtype=int keeps every dummy column integer: since pandas 2.0 get_dummies
# returns booleans by default, which would mix with the integer indicator
# columns below.
rf_time = pd.get_dummies(rf['preparation_time_for_2'], dtype=int)
rf_cui = pd.get_dummies(rf['cuisine_name'], dtype=int)
rf_carb = pd.get_dummies(rf['main_carb_name'], dtype=int)  # some labels can be combined
rf_spice = pd.get_dummies(rf['spice_level'], dtype=int)
rf_protein = pd.get_dummies(rf['protein_name'], dtype=int)
rf_protein_category = pd.get_dummies(rf['protein_category_name'], dtype=int)
# The indicator columns are exactly the three label lists defined earlier.
rf_dish_health = rf[dish_types + dish_categories + health_types]
# Column names can repeat across sources; the next cell merges the repeated
# 'Gnocchi', 'Fruit & Vegetables' and 'Nuts & Seeds' columns.
R_F = pd.concat(
    [rf_cui, rf_carb, rf_spice, rf_protein, rf_protein_category, rf_dish_health, rf_time],
    axis=1,
)


In [None]:
# Some feature names occur in more than one source column group, so
# R_F[name] selects several columns. Collapse each group into a single 0/1
# flag that is 1 when any of the duplicates is set. The merged columns get a
# temporary '*_all' name; the next cell drops the duplicates and renames them.
merged_names = {
    'Gnocchi': 'Gnocchi_all',
    'Fruit & Vegetables': 'Fruit_Vegetables_all',
    'Nuts & Seeds': 'Nuts_Seeds_all',
}
for duplicated, merged in merged_names.items():
    R_F[merged] = (R_F[duplicated].sum(axis=1) >= 1).astype(int)

In [None]:
# Remove every duplicated source column, then give the merged flags the
# original feature names.
R_F = (
    R_F
    .drop(columns=['Gnocchi', 'Fruit & Vegetables', 'Nuts & Seeds'])
    .rename(columns={
        'Gnocchi_all': 'Gnocchi',
        'Fruit_Vegetables_all': 'Fruit & Vegetables',
        'Nuts_Seeds_all': 'Nuts & Seeds',
    })
)

In [None]:
# Pairwise cosine similarity between all recipe feature vectors:
# sim[i][j] is the similarity between recipe i and recipe j (rows of R_F).
# It is consumed later by diversity_score.
sim=cosine_similarity(R_F)
# (no figure is drawn in this cell)

In [None]:
print(R_F.shape)
np.array(R_F)[0]

### 2.2.2 User preferences Matrix : User-Feature = User-Item * Item-Feature

In [None]:
def create_UF(U_R, R_F):
    """Project user-item preferences onto recipe features.

    Computes ``U_F = U_R . R_F[:n_items]`` where ``n_items`` is the number of
    columns of ``U_R``. Only the first ``n_items`` rows of ``R_F`` are used,
    so they must describe the items rated in ``U_R``, in the same order.

    Args:
        U_R: User-by-item preference matrix (DataFrame or 2-D array-like).
        R_F: Recipe-by-feature DataFrame.

    Returns:
        DataFrame of shape (n_users, n_features) with ``R_F``'s column names.
    """
    user_item = np.asarray(U_R)
    # Derive the item count from U_R instead of hard-coding 30.
    n_items = user_item.shape[1]
    U_F = DataFrame(np.dot(user_item, R_F.iloc[:n_items, :]))
    U_F.columns = list(R_F.columns)
    return U_F


In [None]:
# One user-feature matrix per regularisation strength: UF[i] is built from UR[i]
UF = [create_UF(DataFrame(user_item_params), R_F) for user_item_params in UR]


In [None]:
print(UF[3].shape)
np.array(UF[3])

## 2.3 Learning to rank

### 2.3.1 MCDA

In [None]:
# Obtain each recipe's utility for each user by the additive value function
# approach: utility = user-feature weights . recipe-feature matrix.
# AVF[i] holds, for every user (row), all recipe positions sorted from highest
# to lowest utility, computed from UF[i].
# NOTE: `u_avf` keeps the value of the LAST iteration (the utilities built
# from UF[-1]) and is read again by the 'utility_between_user_recipe' cell.
AVF=[]
for uf in UF:
  u_avf=np.dot(uf,R_F.T)
  R_all_avf=DataFrame(u_avf.argsort()[:,::-1])
  AVF.append(R_all_avf)

In [None]:
print(AVF[3].shape)
np.array(AVF[3])

### 2.3.2 LightGBM

#### 2.3.2.1 Construct data (X, y)

In [None]:
# Long-format table of BT ratings: one row per (user_id, item_id) pair,
# taken from UR[3], sorted by user and then item.
ranking = (
    DataFrame(UR[3])
    .reset_index()
    .melt(id_vars='index', var_name='item_id', value_name='rating')
    .rename(columns={'index': 'user_id'})
    .sort_values(by=['user_id', 'item_id'], ascending=True)
    .reset_index(drop=True)
)
# Dense rank of each rating within its user (1 = lowest rating), used as the
# relevance label, plus the per-user means used as user-level features.
ranking['ranking'] = ranking.groupby('user_id')['rating'].rank(ascending=True, method='dense')
ranking['rating_mean'] = ranking.groupby('user_id')['rating'].transform('mean')
ranking['ranking_mean'] = ranking.groupby('user_id')['ranking'].transform('mean')

In [None]:
# Add the column match(user, recipe).
# Scale each user's feature weights into [-1, 1]: positive weights are divided
# by that user's maximum, negative weights by the magnitude of the minimum.
df = DataFrame(UF[3]).T          # features x users
user_max = df.max()
user_min = df.min()
positive = df > 0
df[positive] = df[positive] / user_max
negative = df < 0
df[negative] = -df[negative] / user_min
df_U_F = df.T                    # users x features

# Cosine similarity between every scaled user profile and every recipe
ur_Cos = cosine_similarity(df_U_F, R_F)

# Look up the similarity of each (user_id, item_id) row of `ranking`
user_positions = ranking['user_id'].astype(int).to_numpy()
item_positions = ranking['item_id'].astype(int).to_numpy()
ranking['match_between_user_recipe'] = ur_Cos[user_positions, item_positions]

In [None]:
# Add the column utility(user, recipe), read from `u_avf`, the utility matrix
# left behind by the last iteration of the MCDA cell.
ranking['utility_between_user_recipe'] = u_avf[
    ranking['user_id'].astype(int).to_numpy(),
    ranking['item_id'].astype(int).to_numpy(),
]

In [None]:
# Join the recipe attributes onto every (user, item) row of `ranking`
merged_df = rf.merge(ranking, on='item_id', how='inner')
print(merged_df.shape)
merged_df.head()

#### 2.3.2.2 Train model

In [None]:
# Random 80/20 split of the (user, item) rows, seeded with SEED.
# The split is row-wise, so one user's rows can land in both train and test.
train, test = train_test_split(merged_df, test_size=0.2, random_state=SEED)
print('train shape: ',train.shape)
print('test shape: ',test.shape)

In [None]:
# Recipe-only feature set (used by "model 1")
feature2 = [
    'preparation_time_for_2',
    'cuisine_name', 'main_carb_name', 'spice_level', 'protein_name',
    'protein_category_name',
    'BBQ', 'Burger', 'Sandwich', 'Hotdog', 'Tacos', 'Pasta', 'Roast', 'Wraps',
    'Pie', 'Soya', 'Curry', 'Burritos', 'Pizza', 'Dal', 'Chowder', 'Gnocchi',
    'Laksa', 'Risotto', 'Stew', 'Stir Fry', 'Pilaf', 'Tray Bake',
    'Finger food', 'Oven', 'Salads', 'Stove top', 'bowl food', 'Protein',
    'Veg', 'Soups',
    'Health Exception', 'Healthy', 'Low Cal', 'Low Carb', 'Not Healthy',
    'Wholegrain',
]
# Recipe features plus user-level and user-recipe features (used by "model 2")
features = feature2 + ['rating_mean', 'ranking_mean', 'match_between_user_recipe']

user_col, item_col, target_col = 'user_id', 'item_id', 'ranking'

In [None]:
# LightGBM's ranking groups must be contiguous: order both splits by user
train, test = (
    part.sort_values('user_id').reset_index(drop=True) for part in (train, test)
)

In [None]:
# 'group' parameter: number of rows per user, in ascending user_id order
# (the same order the splits were sorted in).
train_query = train.groupby(user_col).size()
test_query = test.groupby(user_col).size()

##### model 2 with user, user-recipe features

In [None]:
# Hyper-parameter search for the ranker trained on `features`
def objective(trial):
    """Train one LGBMRanker with sampled hyper-parameters and score it.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the best validation scores recorded for ``valid_0``
        (one NDCG value per cut-off in ``eval_at``).
    """
    categorical_cols = ['preparation_time_for_2', 'cuisine_name', 'main_carb_name',
                        'spice_level', 'protein_name', 'protein_category_name']

    # The L1/L2 terms are registered with Optuna under their LightGBM aliases
    # ('lambda_l1', 'lambda_l2'); those are the keys found in best_trial.params.
    search_space = {
        'reg_alpha': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    ranker = lgb.LGBMRanker(n_estimators=1000, random_state=SEED, **search_space)
    ranker.fit(
        train[features],
        train[target_col],
        categorical_feature=categorical_cols,
        group=train_query,
        eval_set=[(test[features], test[target_col])],
        eval_group=[list(test_query)],
        eval_at=[1, 3, 5, 10, 20],  # validation ndcg@1,3,5,10,20
        early_stopping_rounds=50,
        verbose=10,
    )

    # Maximise the mean NDCG over all cut-offs
    return np.mean(list(ranker.best_score_['valid_0'].values()))


study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),  # fixed seed for repeatable search
)
study.optimize(objective, n_trials=10)

In [None]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Refit the ranker on `features` with the best hyper-parameters found above
best_params = study.best_trial.params
fit_options = dict(
    categorical_feature=['preparation_time_for_2', 'cuisine_name', 'main_carb_name',
                         'spice_level', 'protein_name', 'protein_category_name'],
    group=train_query,
    eval_set=[(test[features], test[target_col])],
    eval_group=[list(test_query)],
    eval_at=[1, 3, 5, 10, 20],
    early_stopping_rounds=50,
    verbose=10,
)
model = lgb.LGBMRanker(n_estimators=1000, random_state=SEED, **best_params)
model.fit(train[features], train[target_col], **fit_options)

In [None]:
# Feature importance of the ranker trained on `features`
df_plt = pd.DataFrame(
    {'feature_name': features, 'feature_importance': model.feature_importances_}
).sort_values('feature_importance', ascending=False)

fig, ax = plt.subplots(figsize=(17, 7))
sns.barplot(x="feature_importance", y="feature_name", data=df_plt, ax=ax)
ax.set_title('feature importance')
fig.tight_layout()
fig.savefig('feature_importance_utility(ur)4.png')

##### model 1 with only recipe features

In [None]:
# Hyper-parameter search for the ranker trained on `feature2` (recipe features only).
# This redefines `objective` and `study` from the previous search.
def objective(trial):
    """Train one LGBMRanker on ``feature2`` with sampled hyper-parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the best validation scores recorded for ``valid_0``
        (one NDCG value per cut-off in ``eval_at``).
    """
    categorical_cols = ['preparation_time_for_2', 'cuisine_name', 'main_carb_name',
                        'spice_level', 'protein_name', 'protein_category_name']

    # The L1/L2 terms are registered with Optuna under their LightGBM aliases
    # ('lambda_l1', 'lambda_l2'); those are the keys found in best_trial.params.
    search_space = {
        'reg_alpha': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    ranker = lgb.LGBMRanker(n_estimators=1000, random_state=SEED, **search_space)
    ranker.fit(
        train[feature2],
        train[target_col],
        categorical_feature=categorical_cols,
        group=train_query,
        eval_set=[(test[feature2], test[target_col])],
        eval_group=[list(test_query)],
        eval_at=[1, 3, 5, 10, 20],  # validation ndcg@1,3,5,10,20
        early_stopping_rounds=50,
        verbose=10,
    )

    # Maximise the mean NDCG over all cut-offs
    return np.mean(list(ranker.best_score_['valid_0'].values()))


study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),  # fixed seed for repeatable search
)
study.optimize(objective, n_trials=10)

In [None]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Refit the recipe-only ranker with the best hyper-parameters found above
best_params2 = study.best_trial.params
fit_options2 = dict(
    categorical_feature=['preparation_time_for_2', 'cuisine_name', 'main_carb_name',
                         'spice_level', 'protein_name', 'protein_category_name'],
    group=train_query,
    eval_set=[(test[feature2], test[target_col])],
    eval_group=[list(test_query)],
    eval_at=[1, 3, 5, 10, 20],
    early_stopping_rounds=50,
    verbose=10,
)
model2 = lgb.LGBMRanker(n_estimators=1000, random_state=SEED, **best_params2)
model2.fit(train[feature2], train[target_col], **fit_options2)

In [None]:
# Feature importance of the recipe-only ranker
df_plt = pd.DataFrame(
    {'feature_name': feature2, 'feature_importance': model2.feature_importances_}
).sort_values('feature_importance', ascending=False)

fig, ax = plt.subplots(figsize=(17, 7))
sns.barplot(x="feature_importance", y="feature_name", data=df_plt, ax=ax)
ax.set_title('feature importance without user features')
fig.tight_layout()
fig.savefig('feature_importance_no_uf.png')

#### 2.3.2.3 ranking

##### model2

In [None]:
# Recommend recipes to user
def recommend_for_user(u, m, r, uf, match, feature_cols=None):
    """Rank every recipe in ``r`` for user ``u`` with the fitted ranker ``m``.

    The user-level features (``ranking_mean``, ``rating_mean``) and the
    user-recipe feature (``match_between_user_recipe``) are attached to a
    copy of ``r``; the caller's frame is left untouched.

    Args:
        u: Integer user id; also the row of ``match`` to read.
        m: Fitted model exposing ``predict``.
        r: Recipe frame with an ``item_id`` column and the recipe features.
        uf: Frame of user-level features that can be queried by ``user_id``
            and has ``ranking_mean`` and ``rating_mean`` columns.
        match: User-by-recipe similarity matrix, indexed ``[u][item_id]``.
        feature_cols: Columns passed to ``m.predict``. Defaults to the
            notebook-level ``features`` list.

    Returns:
        Array of row positions of ``r``, ordered from highest to lowest
        predicted score. These equal item ids only when ``r`` is stored in
        item_id order.
    """
    if feature_cols is None:
        feature_cols = features

    user_df = uf.query('user_id==@u')
    item_positions = r['item_id'].astype(int).to_numpy()
    candidates = r.assign(
        ranking_mean=user_df.ranking_mean.values[0],
        rating_mean=user_df.rating_mean.values[0],
        # Vectorised lookup of match[u][item_id] for every recipe row
        match_between_user_recipe=np.asarray(match[u])[item_positions],
    )

    preds = m.predict(candidates[feature_cols])
    topk_idx = np.argsort(preds)[::-1]
    return topk_idx

In [None]:
# Per-user means of the user-level features, indexed by user_id
user_feature = ranking.groupby('user_id')[['ranking_mean', 'rating_mean']].mean()

# Full recipe ranking of the user-aware ranker, one entry per user
R_all_LGBM = [
    recommend_for_user(user_id, model, rf, user_feature, ur_Cos)
    for user_id in range(df_U_F.shape[0])
]
  

In [None]:
print(DataFrame(R_all_LGBM).shape)
np.array(DataFrame(R_all_LGBM))

##### model 1

In [None]:
# Recommend recipes to user
def recommend_for_user2(u, m, r, feature_cols=None):
    """Rank every recipe in ``r`` with the recipe-only ranker ``m``.

    Args:
        u: User id. Not used: the model has no user features, so the ranking
            is the same for every user. Kept so the signature parallels
            ``recommend_for_user``.
        m: Fitted model exposing ``predict``.
        r: Recipe frame containing the feature columns.
        feature_cols: Columns passed to ``m.predict``. Defaults to the
            notebook-level ``feature2`` list.

    Returns:
        Array of row positions of ``r``, ordered from highest to lowest
        predicted score.
    """
    if feature_cols is None:
        feature_cols = feature2

    preds = m.predict(r[feature_cols])
    topk_idx = np.argsort(preds)[::-1]
    return topk_idx

In [None]:
# The recipe-only ranker has no user features, so its ranking is identical for
# every user: score the catalogue once and reuse the result for each user
# instead of re-running the same prediction per user.
shared_ranking = recommend_for_user2(None, model2, rf)
R_all_LGBM2 = [shared_ranking for _ in range(df_U_F.shape[0])]

In [None]:
R_all_LGBM2=DataFrame(R_all_LGBM2)
print(R_all_LGBM2.shape)
np.array(R_all_LGBM2)

# 3. Results

## Metrics

In [None]:
# Evaluation matrics: precision, recall, lift score, weight_score, diversity
def precision_score(R,O,k):
    """Precision at k: share of the k recommended items that were ordered.

    Args:
        R: Recommended item ids (the top-k list).
        O: Item ids the user ordered.
        k: Denominator, i.e. the length of the recommendation list.

    Returns:
        Number of items of ``R`` found in ``O``, divided by ``k``.
    """
    ordered_items = set(O)  # constant-time membership instead of a list scan
    hits = sum(1 for item in R if item in ordered_items)
    return hits/k

def recall_score(R,O):
    """Recall: share of the ordered items that appear in the recommendations.

    Repeated entries in ``O`` are each counted, as before.

    Args:
        R: Recommended item ids.
        O: Item ids the user ordered.

    Returns:
        Number of entries of ``O`` found in ``R`` divided by ``len(O)``;
        ``0.0`` when the user ordered nothing (previously a ZeroDivisionError).
    """
    if len(O) == 0:
        return 0.0
    recommended = set(R)  # constant-time membership instead of a list scan
    hits = sum(1 for item in O if item in recommended)
    return hits/len(O)

def lift_score(precision, O, n_items=3047):
    """Lift: precision relative to the share of the catalogue the user ordered.

    Args:
        precision: Precision of the recommendation list.
        O: Item ids the user ordered.
        n_items: Catalogue size. Defaults to 3047, the value that was
            previously hard-coded.

    Returns:
        ``precision / (len(O) / n_items)``; ``0.0`` when the user ordered
        nothing (previously a ZeroDivisionError).
    """
    if len(O) == 0:
        return 0.0
    lift = precision / (len(O)/n_items)
    return lift

def weighted_score(R,O):
    """Position-weighted hit score and its normalised form.

    Every recommended item that was ordered contributes ``1 / ln(rank + 2)``,
    where ``rank`` is its 0-based position in ``R``. The normaliser is the
    same sum computed as if the hits occupied the top positions of the list.

    Args:
        R: Recommended item ids, best first.
        O: Item ids the user ordered.

    Returns:
        Tuple ``(swp, nswp)``: the weighted score and ``swp`` divided by the
        normaliser. Both are 0 when there is no hit.
    """
    ordered_items = set(O)
    hits = [item for item in R if item in ordered_items]

    # First position of every item, in R and among the hits. This mirrors
    # list.index without rescanning the lists for each hit.
    rank_in_R = {}
    for pos, item in enumerate(R):
        rank_in_R.setdefault(item, pos)
    rank_in_hits = {}
    for pos, item in enumerate(hits):
        rank_in_hits.setdefault(item, pos)

    swp=0
    iswp=0
    for item in hits:
        swp += 1/ np.log(rank_in_R[item]+2)
        iswp += 1/ np.log(rank_in_hits[item]+2)

    nswp = 0 if iswp==0 else swp/iswp
    return swp, nswp

def diversity_score(R,sim,k):
    """Intra-list diversity: one minus the mean pairwise similarity.

    Args:
        R: Recommended item ids, used as row/column positions of ``sim``.
        sim: Square item-by-item similarity matrix.
        k: Nominal list length; ``k * (k - 1) / 2`` is the number of pairs
            used as the denominator.

    Returns:
        ``1 - 2 * sum_{a<b} sim[R[a], R[b]] / (k * (k - 1))``. Returns ``0.0``
        when ``k < 2``, where no pair exists (previously a division by zero).
    """
    if k < 2:
        return 0.0
    positions = np.asarray(R, dtype=int)
    pairwise = np.asarray(sim)[np.ix_(positions, positions)]
    # The strict upper triangle holds each pair (a, b) with a < b exactly
    # once: the same pairs, in the same orientation, as combinations(R, 2).
    dif = np.triu(pairwise, 1).sum()
    div=1- (dif*2)/(k*(k-1))
    return div

## 3.1.1 model 3 - MCDA

In [None]:
# Evaluate the MCDA rankings: AVF holds one ranking matrix per BT
# regularisation strength.
ordered = order2[order2.ordered==1]
# Group the ordered item ids by user once, instead of re-filtering `ordered`
# for every (ranking, k, user) combination.
orders_by_user = ordered.groupby('user_id')['item_id'].apply(list).to_dict()
n_users = user_info.shape[0]
metric_names = ['precision', 'recall', 'lift', 'swp', 'nswp', 'diversity']

for avf in AVF:
    print('---------------------')
    for k in [50, 100, 200, 300]:
        totals = dict.fromkeys(metric_names, 0)

        for user_id in range(avf.shape[0]):
            R = list(avf.iloc[user_id, :k])          # top-k recommended items
            O = orders_by_user.get(user_id, [])      # items this user ordered

            precision = precision_score(R, O, k)
            swp, nswp = weighted_score(R, O)
            totals['precision'] += precision
            totals['recall'] += recall_score(R, O)
            totals['lift'] += lift_score(precision, O)
            totals['swp'] += swp
            totals['nswp'] += nswp
            totals['diversity'] += diversity_score(R, sim, k)

        # Average over all users that took part in the battles
        for name in metric_names:
            print(f'Top {k}, {name}: {totals[name] / n_users}')

## 3.1.2 LightGBM Ranker

### model 2

In [None]:
# Evaluate the rankings of the user-aware LightGBM ranker
ordered = order2[order2.ordered==1]
R_all_LGBM=DataFrame(R_all_LGBM)
# Group the ordered item ids by user once, instead of re-filtering `ordered`
# for every (k, user) combination.
orders_by_user = ordered.groupby('user_id')['item_id'].apply(list).to_dict()
n_users = user_info.shape[0]
metric_names = ['precision', 'recall', 'lift', 'swp', 'nswp', 'diversity']

for k in [50, 100, 200, 300]:
    totals = dict.fromkeys(metric_names, 0)

    for user_id in range(R_all_LGBM.shape[0]):
        R = list(R_all_LGBM.iloc[user_id, :k])   # top-k recommended items
        O = orders_by_user.get(user_id, [])      # items this user ordered

        precision = precision_score(R, O, k)
        swp, nswp = weighted_score(R, O)
        totals['precision'] += precision
        totals['recall'] += recall_score(R, O)
        totals['lift'] += lift_score(precision, O)
        totals['swp'] += swp
        totals['nswp'] += nswp
        totals['diversity'] += diversity_score(R, sim, k)

    # Average over all users that took part in the battles
    for name in metric_names:
        print(f'Top {k}, {name}: {totals[name] / n_users}')

### model 1

In [None]:
# Evaluate the rankings of the recipe-only LightGBM ranker
ordered = order2[order2.ordered==1]
R_all_LGBM2=DataFrame(R_all_LGBM2)
# Group the ordered item ids by user once, instead of re-filtering `ordered`
# for every (k, user) combination.
orders_by_user = ordered.groupby('user_id')['item_id'].apply(list).to_dict()
n_users = user_info.shape[0]
metric_names = ['precision', 'recall', 'lift', 'swp', 'nswp', 'diversity']

for k in [50, 100, 200, 300]:
    totals = dict.fromkeys(metric_names, 0)

    for user_id in range(R_all_LGBM2.shape[0]):
        R = list(R_all_LGBM2.iloc[user_id, :k])  # top-k recommended items
        O = orders_by_user.get(user_id, [])      # items this user ordered

        precision = precision_score(R, O, k)
        swp, nswp = weighted_score(R, O)
        totals['precision'] += precision
        totals['recall'] += recall_score(R, O)
        totals['lift'] += lift_score(precision, O)
        totals['swp'] += swp
        totals['nswp'] += nswp
        totals['diversity'] += diversity_score(R, sim, k)

    # Average over all users that took part in the battles
    for name in metric_names:
        print(f'Top {k}, {name}: {totals[name] / n_users}')

## 3.2 User profile

In [None]:
# Max-min scaling of each user's feature weights into [-1, 1].
# This repeats the scaling done when 'match_between_user_recipe' was built,
# so the cell rebuilds df_U_F from UF[3].
df = DataFrame(UF[3]).T      # transpose: one column per user
user_max= df.max()           # per-user maximum weight
user_min= df.min()           # per-user minimum weight
# Positive weights are divided by the user's maximum (result in (0, 1]).
df[df>0]=df[df>0]/user_max
# Negative weights are divided by the user's minimum and negated, so they stay
# negative (result in [-1, 0)).
df[df<0]=-df[df<0]/user_min
df_U_F=df.T                  # back to one row per user

In [None]:
def user_profile(U_F, user):
    """List one user's feature weights from most to least preferred.

    Args:
        U_F: User-by-feature DataFrame; column names are the feature names.
        user: Row position of the user in ``U_F``.

    Returns:
        DataFrame with columns ``Feature`` and ``Params``, sorted by
        descending ``Params``.
    """
    weights = U_F.iloc[user, :]
    descending = weights.values.argsort()[::-1]
    return pd.DataFrame({
        'Feature': np.array(U_F.columns)[descending],
        'Params': weights.values[descending],
    })

def user_prefered_feature(df_feature, User, user_id):
    """Keep the features a user weights positively and tag them with a type.

    Args:
        df_feature: Lookup frame with ``feature`` and ``type`` columns.
        User: Frame produced by ``user_profile`` (``Feature``, ``Params``).
        user_id: Id written to the ``user_id`` column of the result.

    Returns:
        New DataFrame with columns ``Feature``, ``Params``, ``type`` and
        ``user_id``, holding only the rows with ``Params > 0``. Features that
        are absent from ``df_feature`` are dropped by the inner join.
    """
    typed = User.merge(
        df_feature[['feature', 'type']], how='inner', left_on='Feature', right_on='feature'
    ).drop(columns=['feature'])
    # assign() returns a new frame, so no column is written onto a filtered
    # slice (which is a chained assignment).
    return typed[typed['Params'] > 0].assign(user_id=user_id)
    

In [None]:
# Positively weighted features of every user, stacked into one long frame.
feature=pd.read_excel('feature3.xlsx')
# Collect the per-user frames and concatenate once: growing a DataFrame with
# concat inside the loop copies all previous rows on every iteration.
per_user_frames = [
    user_prefered_feature(feature, user_profile(df_U_F, i), i)
    for i in range(df_U_F.shape[0])
]
all_user_prefered = pd.concat(per_user_frames, axis=0) if per_user_frames else DataFrame()

In [None]:
def plot_users_prefered_feature(user_id,all_user_prefered ):
    """Plot, save and show the first ten preferred features of one user.

    Args:
        user_id: Id of the user to plot; also used in the title, the legend
            and the output file name ``user=<user_id>.png``.
        all_user_prefered: Long frame with ``user_id``, ``Feature`` and
            ``Params`` columns.
    """
    user_rows = all_user_prefered[all_user_prefered.user_id==user_id]
    top_features = user_rows.Feature[:10]
    top_params = user_rows.Params[:10]

    plt.figure(figsize=(10,4))
    plt.plot(top_features, top_params, marker='o', alpha=0.5, linewidth=1.5, label=user_id)
    plt.legend()
    plt.xlabel('Feature')
    plt.ylabel('Params')
    plt.title(f'Preferred features: user{user_id}')
    plt.savefig(f'user={user_id}.png')
    plt.show()
    return

In [None]:
plot_users_prefered_feature(2,all_user_prefered)