## MTS Your Second RecSys competition

This notebook contains a solution to the RecSys competition that is part of the MTS "Your Second RecSys" course.

MARS library is primarily used in this solution.

The notebook includes:
* fitting a model using hot users data and calculating recommendations for hot users
* metrics evaluation using cross validation with time split
* calculating recommendations for cold users

Download the datasets from https://ods.ai/tracks/recsys-course2021/competitions/competition-recsys-21/data

In [1]:
!wget https://storage.yandexcloud.net/datasouls-ods/materials/7952fa0d/recsys_course.zip

!unzip recsys_course.zip

--2022-08-01 15:55:56--  https://storage.yandexcloud.net/datasouls-ods/materials/7952fa0d/recsys_course.zip
Распознаётся storage.yandexcloud.net (storage.yandexcloud.net)… 213.180.193.243
Подключение к storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 79057125 (75M) [application/zip]
Сохранение в: «recsys_course.zip»


2022-08-01 15:56:03 (10,4 MB/s) - «recsys_course.zip» сохранён [79057125/79057125]

Archive:  recsys_course.zip
  inflating: items.csv               
  inflating: RecSys notebook Baseline.ipynb  
  inflating: RecSys notebook EDA.ipynb  
  inflating: sample_submission.csv   
  inflating: users.csv               
  inflating: interactions.csv        


### Importing libraries

In [17]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import datetime
from implicit.nearest_neighbours import TFIDFRecommender

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import calc_metrics, MAP
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplit


In [3]:
def fillna_cols(df):
    """Return a copy of ``df`` with missing user attributes replaced by placeholders.

    Missing ``age``, ``sex`` and ``income`` values become ``'<column>_unknown'`` strings and
    missing ``kids_flg`` values become ``False``. The frame passed in is not modified.
    """
    placeholders = {
        'age': 'age_unknown',
        'sex': 'sex_unknown',
        'income': 'income_unknown',
        'kids_flg': False,
    }
    filled = df.fillna(value=placeholders)
    return filled.copy()


### Loading data

In [4]:
# Raw competition tables that need no special parsing.
users_df, items_df, submission = [
    pd.read_csv(csv_name)
    for csv_name in ('users.csv', 'items.csv', 'sample_submission.csv')
]

# The interaction log carries a date column, so it is parsed while reading.
interactions_df = pd.read_csv('interactions.csv', parse_dates=['last_watch_dt'])

### Preprocessing

#### Users

In [5]:
# Missing demographics get an explicit "<column>_unknown" label so they form their own category.
users_df['age'] = users_df['age'].fillna('age_unknown').astype('category')
users_df['income'] = users_df['income'].fillna('income_unknown').astype('category')

# Sex codes are stored as Cyrillic letters; map them to Latin M/F and label the missing ones.
users_df['sex'] = (
    users_df['sex']
    .fillna('sex_unknown')
    .replace({'М': 'M', 'Ж': 'F'})
    .astype('category')
)

# Fill before casting: NaN cast to bool becomes True, which would silently mark users with an
# unknown flag as having kids. This mirrors how items_df['for_kids'] is handled.
users_df['kids_flg'] = users_df['kids_flg'].fillna(0).astype('bool')

In [6]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is its replacement.
users_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   user_id   840197 non-null  int64   
 1   age       840197 non-null  category
 2   income    840197 non-null  category
 3   sex       840197 non-null  category
 4   kids_flg  840197 non-null  bool    
dtypes: bool(1), category(3), int64(1)
memory usage: 9.6 MB


  users_df.info(verbose=True, null_counts=True)


#### Items 

In [7]:
def _unique_sorted_tokens(cell):
    """Split a ', '-separated string, drop duplicate tokens and re-join them in sorted order."""
    return ', '.join(sorted(set(cell.split(', '))))


# --- plain categorical / text columns ---
items_df['content_type'] = items_df['content_type'].astype('category')
items_df['title'] = items_df['title'].str.lower()
items_df['title_orig'] = items_df['title_orig'].fillna('None')

# --- release year -> decade bucket ---
# Unknown years are set to 2020, so they land in the newest ('2020_inf') bucket.
items_df['release_year'] = items_df['release_year'].fillna(2020.)
release_year = items_df['release_year']
items_df.loc[release_year < 1920, 'release_year_cat'] = 'inf_1920'
items_df.loc[release_year >= 2020, 'release_year_cat'] = '2020_inf'
for i in range(1920, 2020, 10):
    # Half-open decade [i, i + 10).
    in_decade = (release_year >= i) & (release_year < i + 10)
    items_df.loc[in_decade, 'release_year_cat'] = f'{i}-{i+10}'
items_df = items_df.drop(columns=['release_year'])
items_df['release_year_cat'] = items_df['release_year_cat'].astype('category')

items_df['genres'] = items_df['genres'].astype('category')

# --- multi-valued text columns ---
# A missing country is replaced with 'Russia'; tokens are lower-cased, de-duplicated and sorted.
items_df['countries'] = (
    items_df['countries']
    .fillna('Russia')
    .str.lower()
    .apply(_unique_sorted_tokens)
)

# --- flags and ratings: missing values are treated as 0 ---
items_df['for_kids'] = items_df['for_kids'].fillna(0).astype('bool')
items_df['age_rating'] = items_df['age_rating'].fillna(0).astype('category')

# Studios receive the same token normalisation as countries before becoming a category.
items_df['studios'] = (
    items_df['studios']
    .fillna('Unknown')
    .str.lower()
    .apply(_unique_sorted_tokens)
    .astype('category')
)

items_df['directors'] = (
    items_df['directors'].fillna('Unknown').str.lower().astype('category')
)

# Actors and keywords only get a placeholder for missing values (case is left untouched).
for text_col in ('actors', 'keywords'):
    items_df[text_col] = items_df[text_col].fillna('Unknown').astype('category')

items_df['description'] = items_df['description'].fillna('-')

In [8]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is its replacement.
items_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   item_id           15963 non-null  int64   
 1   content_type      15963 non-null  category
 2   title             15963 non-null  object  
 3   title_orig        15963 non-null  object  
 4   genres            15963 non-null  category
 5   countries         15963 non-null  object  
 6   for_kids          15963 non-null  bool    
 7   age_rating        15963 non-null  category
 8   studios           15963 non-null  category
 9   directors         15963 non-null  category
 10  actors            15963 non-null  category
 11  description       15963 non-null  object  
 12  keywords          15963 non-null  category
 13  release_year_cat  15963 non-null  category
dtypes: bool(1), category(8), int64(1), object(4)
memory usage: 2.4+ MB


  items_df.info(verbose=True, null_counts=True)


#### Interactions

In [9]:
# Cast the watched percentage to the nullable Int8 dtype first, then replace missing values with 0.
interactions_df['watched_pct'] = (
    interactions_df['watched_pct'].astype('Int8').fillna(0)
)

# Make sure the watch date is a datetime column (it is also parsed when the CSV is read).
interactions_df['last_watch_dt'] = pd.to_datetime(interactions_df['last_watch_dt'])

In [10]:
# `null_counts` is deprecated (and removed in pandas 2.0); `show_counts` is its replacement.
interactions_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   user_id        5476251 non-null  int64         
 1   item_id        5476251 non-null  int64         
 2   last_watch_dt  5476251 non-null  datetime64[ns]
 3   total_dur      5476251 non-null  int64         
 4   watched_pct    5476251 non-null  Int8          
dtypes: Int8(1), datetime64[ns](1), int64(3)
memory usage: 177.6 MB


  interactions_df.info(null_counts=True, verbose=True)


#### Save data sets

In [11]:
# Persist the cleaned tables as CSV files, without writing the index as a column.
for frame, csv_path in (
    (users_df, 'users_processed.csv'),
    (items_df, 'items_processed.csv'),
    (interactions_df, 'interactions_processed.csv'),
):
    frame.to_csv(csv_path, index=False)

### Loading preprocessed data

In [12]:
# Re-read the cleaned tables. CSV stores no dtype information, so pandas infers the column
# types again here; only the watch date is parsed explicitly.
users_df, items_df, submission = [
    pd.read_csv(csv_name)
    for csv_name in ('users_processed.csv', 'items_processed.csv', 'sample_submission.csv')
]
interactions_df = pd.read_csv('interactions_processed.csv', parse_dates=['last_watch_dt'])

### Renaming columns and keeping only the following ones: ['user_id', 'item_id', 'weight', 'datetime']

In [13]:
# Map the raw column names onto the standard names exposed by rectools.Columns;
# the watched percentage serves as the interaction weight.
standard_names = {
    "user_id": Columns.User,
    "item_id": Columns.Item,
    "last_watch_dt": Columns.Datetime,
    "watched_pct": Columns.Weight,
}

# Rename, then keep only the columns listed in Columns.Interactions (this drops total_dur).
interactions_df = interactions_df.rename(columns=standard_names)[Columns.Interactions]
interactions_df.head()

Unnamed: 0,user_id,item_id,weight,datetime
0,176549,9506,72,2021-05-11
1,699317,1659,100,2021-05-29
2,656683,7107,0,2021-05-09
3,864613,7638,100,2021-07-05
4,964868,9506,100,2021-04-30


### Obtaining table with hot users data

In [14]:
# Number of distinct items per user, counted over all interactions regardless of weight.
df_user_gr = interactions_df.groupby(Columns.User)[Columns.Item].nunique().reset_index()
users_with_history = df_user_gr.loc[df_user_gr[Columns.Item] >= 3, Columns.User]

# "Hot" interactions: weight (watched percentage) of at least 5, made by users that have
# three or more distinct items overall.
# drop=True renumbers the rows 0..n-1 (the CV cell indexes this frame with .loc[train_idx])
# without leaking the previous index into a stray "index" column.
interactions_df_hot = interactions_df[
    (interactions_df[Columns.Weight] >= 5)
    & interactions_df[Columns.User].isin(users_with_history)
].reset_index(drop=True)

# Submission rows whose user has at least one hot interaction.
test_hot = submission.merge(
    interactions_df_hot[[Columns.User]].drop_duplicates(),
    on=Columns.User,
    how='inner',
)

### Fitting TFIDF Recommender and calculating top 10 recommendations for hot users

In [18]:
top_N_fit = 400  # K passed to TFIDFRecommender
top_N = 10       # number of recommendations requested per user

dataset = Dataset.construct(interactions_df_hot)

# Item-kNN model with TF-IDF weighting, fitted on every hot interaction.
model = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=top_N_fit))
model.fit(dataset)

hot_user_ids = test_hot[Columns.User].unique()
recs_hot = model.recommend(hot_user_ids, dataset, k=top_N, filter_viewed=True)

# Collapse the long table into one list of items per user; the list keeps the row order
# in which `recommend` returned the items.
recs_hot = recs_hot.groupby(Columns.User)[Columns.Item].agg(list).reset_index()
recs_hot.head()

Unnamed: 0,user_id,item_id
0,3,"[15297, 13865, 2657, 9996, 8636, 7417, 7829, 1..."
1,11,"[10440, 4151, 142, 13865, 9728, 3734, 9996, 86..."
2,30,"[13865, 10440, 157, 10419, 3734, 10525, 15297,..."
3,46,"[15297, 6785, 13865, 9728, 4151, 6210, 3734, 9..."
4,47,"[9728, 6774, 10440, 3734, 512, 7829, 7102, 447..."


### Metrics evaluation using cross validation with time split

In [19]:
last_date = interactions_df_hot[Columns.Datetime].max().normalize()
folds = 3
start_date = last_date - pd.Timedelta(days=folds*7)
# NOTE(review): pd.date_range defaults to a daily frequency, so this produces folds*7 + 1 = 22
# daily timestamps rather than 4 weekly boundaries. Presumably TimeRangeSplit builds one fold
# per consecutive pair, i.e. one-day folds -- confirm this is intended given `folds = 3`.
date_range = pd.date_range(start_date, last_date)

cv = TimeRangeSplit(date_range)

last_n_days = 60  # each fold trains only on the most recent `last_n_days` days of its train part
metrics = {f"MAP@{top_N}": MAP(top_N)}

# Per-fold metric dicts are collected in a list and turned into a frame once at the end:
# DataFrame.append is deprecated/removed in recent pandas and copies the frame on every call.
fold_metrics = []

for train_idx, test_idx, info in cv.split(interactions_df_hot, collect_fold_stats=True):
    train = interactions_df_hot.loc[train_idx]

    # Restrict the training data to a recent window ending at its latest interaction.
    date_window = train[Columns.Datetime].max() - pd.DateOffset(days=last_n_days)
    train = train[train[Columns.Datetime] >= date_window]
    train_dataset = Dataset.construct(train)

    model = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=top_N_fit))
    model.fit(train_dataset)

    test = interactions_df_hot.loc[test_idx]
    # Only users present in the (windowed) training data can receive recommendations.
    test_users = np.intersect1d(
        train_dataset.user_id_map.external_ids,
        test[Columns.User].unique()
    )
    recs = model.recommend(
        test_users,
        train_dataset,
        k=top_N,
        filter_viewed=True,
    )
    metrics_calculated = calc_metrics(
        metrics,
        recs,
        test,
        train,
    )
    fold_metrics.append(metrics_calculated)

# One row per fold, one column per metric.
validation_results = pd.DataFrame(fold_metrics)

### Printing the cross-validated MAP@10 value

In [20]:
# Average the metric over all folds; the result is a one-element Series keyed by the metric name.
validation_results[[f"MAP@{top_N}"]].mean()

MAP@10    0.082902
dtype: float64

### Obtaining table with cold users data

In [21]:
# Anti-join: after the left merge, submission users without a match in test_hot have a null
# right-hand item column ("item_id_y"); those are the cold users.
submission_vs_hot = submission.merge(test_hot, on=Columns.User, how="left")
is_cold = submission_vs_hot["item_id_y"].isnull()

test_cold = (
    submission_vs_hot[is_cold]
    .rename(columns={"item_id_x": Columns.Item})
    [[Columns.User, Columns.Item]]
)

### Getting data for cold user recommendations
### Recommendations are the most popular items within each sociodemographic group (age and sex)

In [22]:
last_n_days = 21
train = interactions_df

# Keep only interactions from the last `last_n_days` days and attach the user attributes;
# users missing from users_df get the placeholder values from fillna_cols.
date_window = train[Columns.Datetime].max() - pd.DateOffset(days=last_n_days)
recent_interactions = train[train[Columns.Datetime] >= date_window]
train_slice = fillna_cols(
    recent_interactions.merge(users_df, on=Columns.User, how="left")
)

cols_soc_dem = ["age", "sex"]

# Number of interactions per (age, sex, item).
soc_dem_recommendations = (
    train_slice
    .groupby(cols_soc_dem + [Columns.Item])
    .size()
    .reset_index(name="count_pop")
)

# For every (age, sex) group: the item ids of its top_N most frequent items, as an array.
top_soc_dem = (
    soc_dem_recommendations
    .groupby(cols_soc_dem)
    .apply(
        lambda group: group.sort_values("count_pop", ascending=False)[Columns.Item].values[:top_N]
    )
    .reset_index()
    .rename(columns={0: Columns.Item})
)

top_soc_dem.head()

Unnamed: 0,age,sex,item_id
0,age_18_24,F,"[9728, 15297, 10440, 4151, 3734, 7793, 12192, ..."
1,age_18_24,M,"[9728, 10440, 7793, 15297, 13865, 3734, 4151, ..."
2,age_18_24,sex_unknown,"[9728, 10440, 15297, 4151, 13865, 12192, 3734,..."
3,age_25_34,F,"[15297, 10440, 9728, 4151, 3734, 12192, 13865,..."
4,age_25_34,M,"[9728, 10440, 15297, 7793, 13865, 3734, 12192,..."


### Obtaining recommendations for cold users

In [23]:
# One row per cold user, enriched with the attributes that form the lookup key;
# users missing from users_df get the placeholder values from fillna_cols.
recs_cold = pd.DataFrame({Columns.User: test_cold[Columns.User].unique()})
recs_cold = fillna_cols(recs_cold.merge(users_df, on=Columns.User, how="left"))

# Attach the popular items of the user's (age, sex) group.
# NOTE(review): with a left merge, a group absent from top_soc_dem yields NaN here, which
# would end up as a [nan] list below -- confirm every group is covered or add a fallback.
recs_cold = recs_cold.merge(top_soc_dem, on=cols_soc_dem, how="left")
recs_cold = recs_cold.drop(columns=["income", "kids_flg", "age", "sex"])

# Explode the per-group arrays and regroup them into one plain list per user. No separate rank
# column is built: it would be discarded by the aggregation, and list order is preserved anyway.
recs_cold = recs_cold.explode(Columns.Item)
recs_cold = recs_cold.groupby(Columns.User).agg({Columns.Item: list}).reset_index()

recs_cold.head()

Unnamed: 0,user_id,item_id
0,29,"[10440, 15297, 9728, 4151, 12192, 13865, 2657,..."
1,33,"[10440, 15297, 9728, 4151, 12192, 13865, 2657,..."
2,39,"[10440, 15297, 9728, 4151, 12192, 13865, 2657,..."
3,66,"[9728, 10440, 15297, 13865, 7793, 12192, 11863..."
4,70,"[9728, 15297, 10440, 4151, 3734, 7793, 12192, ..."


### Concatenating recommendations for hot and cold users

In [24]:
# Stack hot and cold recommendations, order them by user id and renumber the rows.
recs_union = (
    pd.concat([recs_hot, recs_cold], axis=0)
    .sort_values(Columns.User)
    .reset_index(drop=True)
)


In [25]:
# Preview the first rows of the combined hot + cold recommendation table.
recs_union.head()

Unnamed: 0,user_id,item_id
0,3,"[15297, 13865, 2657, 9996, 8636, 7417, 7829, 1..."
1,11,"[10440, 4151, 142, 13865, 9728, 3734, 9996, 86..."
2,29,"[10440, 15297, 9728, 4151, 12192, 13865, 2657,..."
3,30,"[13865, 10440, 157, 10419, 3734, 10525, 15297,..."
4,33,"[10440, 15297, 9728, 4151, 12192, 13865, 2657,..."
