# **Install modules**

In [None]:
!pip install catboost



# **Import modules**

In [None]:
import numpy as np
import pandas as pd

from catboost import *
from sklearn.metrics.pairwise import paired_distances
from sklearn.model_selection import train_test_split

# **Load data**

In [None]:
# Locations of the four CSV inputs.
train_path = '/content/drive/My Drive/Colab Notebooks/made-task-2/train.csv'
item_features_path = '/content/drive/My Drive/Colab Notebooks/made-task-2/item-features.csv'
user_features_path = '/content/drive/My Drive/Colab Notebooks/made-task-2/user-features.csv'
test_path = '/content/drive/My Drive/Colab Notebooks/made-task-2/test.csv'

# Read each file into its own DataFrame.
train_data = pd.read_csv(train_path)
item_features_data = pd.read_csv(item_features_path)
user_features_data = pd.read_csv(user_features_path)
test_data = pd.read_csv(test_path)

# **Preprocess data**

In [None]:
# The interaction timestamp is not used as a feature, so remove the column.
train_data = train_data.drop(columns='timestamp')

In [None]:
# Remove the timestamp column from the test frame as well.
test_data = test_data.drop(columns='timestamp')

In [None]:
# Keep only the user id and the feature column named '0', exposed as 'user_f'.
user_features_data = (
    user_features_data
    .loc[:, ['user_id', '0']]
    .rename(columns={'0': 'user_f'})
)

In [None]:
# Discard item feature columns '9', '19', '27' and '30'.
# NOTE(review): the reason these four columns are excluded is not recorded here.
item_features_data = item_features_data.drop(columns=['9', '19', '27', '30'])

# **Create train_df**

In [None]:
# Items that get their own label (1..12, in this order); every other item keeps label 0.
labelled_item_ids = [10, 11, 13, 17, 21, 23, 26, 29, 37, 39, 71, 100]

item_features_data.loc[:, 'label'] = 0
for label_value, labelled_item_id in enumerate(labelled_item_ids, start=1):
    is_labelled_item = item_features_data['item_id'] == labelled_item_id
    item_features_data.loc[is_labelled_item, 'label'] = label_value

In [None]:
# Create new feature: size, the distance of each item's feature vector from the
# all-zero vector (i.e. the length of the vector in feature space).
item_vectors = item_features_data.drop(columns=['item_id', 'label'])
origin = pd.DataFrame(0, index=np.arange(len(item_vectors)), columns=item_vectors.columns)
item_features_data['size'] = paired_distances(item_vectors, origin)

In [None]:
# Attach item features, then user features, to every training interaction.
interactions_items = train_data.merge(item_features_data, on='item_id')
interactions_items_users = interactions_items.merge(user_features_data, on='user_id')

In [None]:
# Create new per-item features: avg_rating (mean of 'like') and
# total_likes (number of 'like' records for the item).
# NOTE(review): avg_rating is derived from the same 'like' values that later become
# the target, before any train/validation split — possible target leakage; confirm.
like_by_item = interactions_items_users.groupby('item_id')['like']
ratings_mean_count = (
    like_by_item
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'avg_rating', 'count': 'total_likes'})
)
#ratings_mean_count.head()

In [None]:
# Add the per-item aggregates to every interaction and rename the target column.
train_df = (
    interactions_items_users
    .merge(ratings_mean_count, on='item_id')
    .rename(columns={'like': 'known_like'})
)

In [None]:
# Preview the first rows of the merged training frame.
train_df.head()

Unnamed: 0,user_id,item_id,known_like,0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,28,29,31,label,size,user_f,avg_rating,total_likes
0,140,342,0,0.000348,-0.000787,-0.000735,0.001043,-0.000249,0.000343,0.000405,0.000333,-0.001516,0.001709,0.000145,0.001145,0.000652,0.000386,-0.00105,-8.9e-05,-0.000838,-0.000444,-0.000371,-0.001591,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,0.000211,-0.000584,0.000149,0,0.004599,0.0013,0.0,6
1,32,342,0,0.000348,-0.000787,-0.000735,0.001043,-0.000249,0.000343,0.000405,0.000333,-0.001516,0.001709,0.000145,0.001145,0.000652,0.000386,-0.00105,-8.9e-05,-0.000838,-0.000444,-0.000371,-0.001591,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,0.000211,-0.000584,0.000149,0,0.004599,0.000602,0.0,6
2,34,342,0,0.000348,-0.000787,-0.000735,0.001043,-0.000249,0.000343,0.000405,0.000333,-0.001516,0.001709,0.000145,0.001145,0.000652,0.000386,-0.00105,-8.9e-05,-0.000838,-0.000444,-0.000371,-0.001591,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,0.000211,-0.000584,0.000149,0,0.004599,0.000491,0.0,6
3,350,342,0,0.000348,-0.000787,-0.000735,0.001043,-0.000249,0.000343,0.000405,0.000333,-0.001516,0.001709,0.000145,0.001145,0.000652,0.000386,-0.00105,-8.9e-05,-0.000838,-0.000444,-0.000371,-0.001591,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,0.000211,-0.000584,0.000149,0,0.004599,0.000919,0.0,6
4,146,342,0,0.000348,-0.000787,-0.000735,0.001043,-0.000249,0.000343,0.000405,0.000333,-0.001516,0.001709,0.000145,0.001145,0.000652,0.000386,-0.00105,-8.9e-05,-0.000838,-0.000444,-0.000371,-0.001591,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,0.000211,-0.000584,0.000149,0,0.004599,0.000777,0.0,6


In [None]:
# Column order for train_df: ids and features first, target ('known_like') last.
cols = [
    'user_id',
    'user_f',
    'item_id',
    'avg_rating',
    'total_likes',
    'size',
    'label',
    'known_like',
]

In [None]:
# Keep only the columns listed in cols, in that order, and preview the result.
train_df = train_df.loc[:, cols]
train_df.head()

Unnamed: 0,user_id,user_f,item_id,avg_rating,total_likes,size,label,known_like
0,140,0.0013,342,0.0,6,0.004599,0,0
1,32,0.000602,342,0.0,6,0.004599,0,0
2,34,0.000491,342,0.0,6,0.004599,0,0
3,350,0.000919,342,0.0,6,0.004599,0,0
4,146,0.000777,342,0.0,6,0.004599,0,0


In [None]:
# The ids must not be part of the final training frame, so remove them.
train_df = train_df.drop(columns=['user_id', 'item_id'])
# An average built from fewer than 10 records is not trusted: blank it out.
too_few_likes = train_df['total_likes'] < 10
train_df.loc[too_few_likes, 'avg_rating'] = None
train_df.head()

Unnamed: 0,user_f,avg_rating,total_likes,size,label,known_like
0,0.0013,,6,0.004599,0,0
1,0.000602,,6,0.004599,0,0
2,0.000491,,6,0.004599,0,0
3,0.000919,,6,0.004599,0,0
4,0.000777,,6,0.004599,0,0


# **Create test_df**

In [None]:
# Follow the same logic as for train_df: one row per item with its features and
# the per-item aggregates; 'user_f' is a placeholder filled in per user later.
#
# A left merge keeps every item from item_features_data. An inner merge would drop
# items that have no row in ratings_mean_count, and aligning the shorter result
# against a full-length placeholder frame would then produce all-NaN rows.
item_stats = item_features_data.merge(ratings_mean_count, on='item_id', how='left')
# Items without any recorded interaction have zero likes; their avg_rating stays missing.
item_stats['total_likes'] = item_stats['total_likes'].fillna(0).astype('int64')
test_df = item_stats.sort_values('item_id').reset_index(drop=True)
# Float placeholder (rather than an object-dtype column) for the user feature.
test_df.insert(0, 'user_f', np.nan)

In [None]:
# Column order for test_df: same features as train_df, with item_id kept for indexing.
cols = [
    'user_f',
    'item_id',
    'avg_rating',
    'total_likes',
    'size',
    'label',
]

In [None]:
# Keep only the columns listed in cols, in that order.
test_df = test_df.loc[:, cols]

In [None]:
# Index the rows by item_id (unnamed index), ordered by item id.
test_df = test_df.set_index('item_id').rename_axis(None).sort_index()
# Same rule as for train_df: blank out averages built from fewer than 10 records.
sparse_items = test_df['total_likes'] < 10
test_df.loc[sparse_items, 'avg_rating'] = None
test_df.head()

Unnamed: 0,user_f,avg_rating,total_likes,size,label
0,,0.142857,14,0.018964,0
1,,,6,0.032847,0
2,,0.5,18,0.024769,0
3,,,7,0.018398,0
4,,0.25,28,0.022997,0


In [None]:
# Build one copy of test_df per row of user_features_data, with 'user_f' set to
# that user's feature value. If a user_id occurs more than once, the first
# matching row supplies the value.
first_user_f = user_features_data.drop_duplicates('user_id').set_index('user_id')['user_f']

test_df_lst = []
for user_id in user_features_data['user_id'].values:
    per_user_df = test_df.copy()
    per_user_df.loc[:, 'user_f'] = first_user_f[user_id]
    test_df_lst.append(per_user_df)
test_df_lst[0].head()

Unnamed: 0,user_f,avg_rating,total_likes,size,label
0,0.000695,0.142857,14,0.018964,0
1,0.000695,,6,0.032847,0
2,0.000695,0.5,18,0.024769,0
3,0.000695,,7,0.018398,0
4,0.000695,0.25,28,0.022997,0


# **Build Catboost model**

In [None]:
# Feature matrix: every column except the target; target vector: 'known_like'.
X = train_df.drop(columns='known_like').to_numpy()
y = train_df['known_like'].to_numpy().ravel()

In [None]:
# Hold out part of the rows for validation; train_size=0.8 keeps 80% for training
# and the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Hyperparameters of the CatBoost classifier, gathered in one place.
catboost_params = {
    'iterations': 1500,
    'learning_rate': 0.01,
    'l2_leaf_reg': 9.0,
    'depth': 10,
    'rsm': 0.5,
    'loss_function': 'Logloss',
    'logging_level': 'Silent',
    'use_best_model': True,
    'random_state': 123,
}
model = CatBoostClassifier(**catboost_params)
#cv_data = cv(Pool(X, y),
#             model.get_params(),
#             fold_count=5)
#print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Logloss-mean'])))
#cv_data

In [None]:
# Train on the training split, passing the validation split as the eval set.
model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

<catboost.core.CatBoostClassifier at 0x7fb8a652f550>

# **Generate output**

In [None]:
# Build the recommendation table: one row per test user holding the user id and
# the ids of the N_RECS highest-scored items the user has not interacted with.
N_RECS = 20
output = pd.DataFrame(columns=(['user_id'] + [i for i in range(N_RECS)]))

# test_df_lst holds one frame per row of user_features_data, in that order, so
# translate a user id into its position in the list (first occurrence wins).
user_position = {}
for position, known_user_id in enumerate(user_features_data['user_id'].values):
    user_position.setdefault(known_user_id, position)

# Items each user already has in train_data, computed once instead of per candidate.
seen_items = train_data.groupby('user_id')['item_id'].agg(set).to_dict()

for test_user_id in test_data['user_id'].values:
    # Score the items with THIS user's frame (not a frame picked by a variable
    # left over from an earlier loop).
    user_frame = test_df_lst[user_position[test_user_id]]
    predictions = model.predict_proba(user_frame)[:, 1]
    # Keep the frame's index (item ids) attached to the scores so that the
    # recommendations are item ids rather than row positions.
    ranked_items = (
        pd.Series(predictions, index=user_frame.index)
        .sort_values(ascending=False)
        .index
    )
    already_seen = seen_items.get(test_user_id, set())
    res = []
    for rec in ranked_items:
        if rec not in already_seen:
            res.append(rec)
            if len(res) == N_RECS:
                break
    output.loc[test_user_id] = [test_user_id] + list(res)

In [None]:
# Preview the first rows of the recommendation table.
output.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
166,166,76,37,35,22,60,65,66,80,11,21,58,7,5,36,40,72,146,87,78,33
26,26,76,37,35,22,60,65,66,80,11,21,58,7,5,36,40,72,146,87,78,67
41,41,76,37,35,22,60,65,66,80,11,21,58,5,36,40,72,146,87,78,67,33
286,286,76,37,35,22,60,65,66,80,11,21,58,7,5,36,72,146,87,78,67,33
108,108,76,37,35,22,60,66,80,11,21,58,7,5,36,40,72,146,87,78,67,33


In [None]:
#output.to_csv('/content/drive/My Drive/Colab Notebooks/made-task-2/submission.csv', index=False)