In [1]:
from google.colab import drive
# Mount Google Drive under /content/gdrive/ so files stored there can be
# reached through ordinary filesystem paths.
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [2]:
import os
# Switch the working directory to the project folder on the mounted drive so
# that relative file paths resolve against it.
os.chdir("/content/gdrive/My Drive/made_contest")

In [3]:
import pandas as pd
import numpy as np

# Input files, resolved relative to the current working directory.
test_path = 'test.csv'
train_path = 'train.csv'
user_features_path = 'user-features.csv'
item_features_path = 'item-features.csv'

# The feature tables are ordered by their id column right after loading;
# the train/test tables keep the order they have on disk.
df_items = pd.read_csv(item_features_path).sort_values(by=['item_id'])
df_users = pd.read_csv(user_features_path).sort_values(by=['user_id'])
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# LightFM needs an interaction matrix in which a greater rating means a
# better user attitude towards the item; a (user, item) pair that is absent
# from the matrix means the user has not met the item yet. The binary `like`
# flag is therefore turned into a signed rating:
# like -> 1, dislike -> -1
#
# `assign` returns a new frame, so the raw `df_train` is not modified and this
# cell can be re-run safely.
df_mod = df_train.assign(rating=np.where(df_train['like'] == 1, 1, -1))
df_mod.head()

Unnamed: 0,user_id,item_id,like,timestamp,rating
0,140,342,0,1490936622,-1
1,378,172,1,1490936628,1
2,150,182,0,1490936650,-1
3,455,17,0,1490936704,-1
4,350,409,0,1490936735,-1


In [9]:
#!pip3 install lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sqlalchemy.util import NoneType
import json
import itertools


class Preprocessor:
    """Hold the users / items / interactions DataFrames and expose them in the
    shapes expected by the dataset-building code (unique ids, unique feature
    strings, interaction triples, per-row feature lists)."""

    def __init__(
            self,
            users_dataframe=None,
            items_dataframe=None,
            interactions_dataframe=None,
            item_id_column=None,
            items_feature_columns=None,
            user_id_column=None,
            user_features_columns=None,
            interaction_column=None,
            fix_columns_names=True,
    ):
        """
        this class is dedicated to preprocessing the Dataframes, ready to be fed into the model
        :param users_dataframe: a dataframe containing users
        :param items_dataframe: a dataframe containing items
        :param interactions_dataframe: a dataframe containing ratings of items - users
        :param item_id_column: name of the item id column
        :param items_feature_columns: names of the item feature columns
        :param user_id_column: name of the user id column
        :param user_features_columns: names of the user feature columns
        :param interaction_column: name of the column holding the interaction weight
        :param fix_columns_names: when True, "-" is replaced by "_" in the headers of
            every registered dataframe and in the column names given here
        """

        self.fix_columns_names = fix_columns_names

        self.items_dataframe = None
        self.users_dataframe = None
        self.interactions_dataframe = None

        # Column settings are stored unconditionally so that every accessor can
        # rely on the attributes existing, even when a dataframe is missing.
        # They go through the same "-" -> "_" normalisation as the headers,
        # otherwise they could never match a renamed dataframe column.
        self.user_id_column = self._normalize_column_names(user_id_column)
        self.user_features_columns = self._normalize_column_names(user_features_columns)
        self.item_id_column = self._normalize_column_names(item_id_column)
        self.items_feature_columns = self._normalize_column_names(items_feature_columns)
        self.interaction_column = self._normalize_column_names(interaction_column)

        if users_dataframe is not None:
            self.add_users_dataframe(users_dataframe)

        if items_dataframe is not None:
            self.add_items_dataframe(items_dataframe)

        if interactions_dataframe is not None:
            self.add_interactions_dataframe(interactions_dataframe)

    def _normalize_column_names(self, names):
        """Apply the header fix ("-" -> "_") to a column name or a collection of names.

        Returns the input untouched when fixing is disabled or ``names`` is None.
        A single string stays a string; any other iterable becomes a list.
        """
        if not self.fix_columns_names or names is None:
            return names
        if isinstance(names, str):
            return names.replace("-", "_")
        return [
            name.replace("-", "_") if isinstance(name, str) else name
            for name in names
        ]

    def get_data_status(self):
        """Return a dict telling, per dataframe, whether it is present and non-empty."""
        return {
            "items_dataframe": self.get_dataframe_status(self.items_dataframe),
            "users_dataframe": self.get_dataframe_status(self.users_dataframe),
            "interactions_dataframe": self.get_dataframe_status(
                self.interactions_dataframe
            ),
        }

    @staticmethod
    def get_dataframe_status(data):
        """Return True when ``data`` exposes ``.empty`` and is not empty, else False."""
        try:
            return not data.empty
        except AttributeError:
            # None (or any object without ``.empty``) counts as "missing".
            return False

    def add_items_dataframe(self, items_dataframe):
        """Register the items dataframe (headers are fixed in place if enabled)."""
        self.fix_headers(items_dataframe)
        self.items_dataframe = items_dataframe

    def add_users_dataframe(self, users_dataframe):
        """Register the users dataframe (headers are fixed in place if enabled)."""
        self.fix_headers(users_dataframe)
        self.users_dataframe = users_dataframe

    def add_interactions_dataframe(self, interactions_dataframe):
        """Register the interactions dataframe (headers are fixed in place if enabled)."""
        self.fix_headers(interactions_dataframe)
        self.interactions_dataframe = interactions_dataframe

    def fix_headers(self, data):
        """Replace "-" by "_" in the column headers of ``data`` when enabled.

        The dataframe is modified in place and also returned. Non-string
        headers are left as they are.
        """
        if self.fix_columns_names:
            data.columns = [
                x.replace("-", "_") if isinstance(x, str) else x
                for x in data.columns
            ]
        return data

    @staticmethod
    def lowercase(dataframe):
        """Return a copy of ``dataframe`` with every value cast to a lower-cased string."""
        return dataframe.apply(lambda x: x.astype(str).str.lower())

    def get_unique_users(self):
        """Unique user ids found in the users dataframe."""
        return self.get_uniques_from(self.users_dataframe, self.user_id_column)

    def get_unique_items(self):
        """Unique item ids found in the items dataframe."""
        return self.get_uniques_from(self.items_dataframe, self.item_id_column)

    def get_unique_items_from_ratings(self):
        """Unique item ids found in the interactions dataframe, as a list."""
        # ``unique()`` already yields a flat sequence; flattening it one more
        # level would try to iterate over scalar ids and fail.
        return list(
            self.get_uniques_from(self.interactions_dataframe, self.item_id_column)
        )

    def get_unique_users_from_ratings(self):
        """Unique user ids found in the interactions dataframe, as a list."""
        return list(
            self.get_uniques_from(self.interactions_dataframe, self.user_id_column)
        )

    @staticmethod
    def get_uniques_from(dataframe, column):
        """Unique values of ``dataframe[column]`` in order of first appearance."""
        return dataframe[column].unique()

    def clean_unknown_interactions_func(self):
        """
        remove every interaction whose item or user is not present in the
        items / users dataframes
        :return: None (``self.interactions_dataframe`` is replaced by the filtered frame)
        """
        self.interactions_dataframe = self.interactions_dataframe[
            self.interactions_dataframe[self.item_id_column].isin(
                self.items_dataframe[self.item_id_column]
            )
        ]

        self.interactions_dataframe = self.interactions_dataframe[
            self.interactions_dataframe[self.user_id_column].isin(
                self.users_dataframe[self.user_id_column]
            )
        ]

    @staticmethod
    def serialize_list(in_list):
        """Flatten an iterable of iterables by one level into a list."""
        return list(itertools.chain.from_iterable(in_list))

    def get_unique_items_features(self):
        """Unique feature strings over all configured item feature columns."""
        return self.get_uniques_by_columns(
            self.items_dataframe, self.items_feature_columns
        )

    def get_unique_users_features(self):
        """Unique feature strings over all configured user feature columns."""
        return self.get_uniques_by_columns(
            self.users_dataframe, self.user_features_columns
        )

    def get_uniques_by_columns(self, dataframe, columns):
        """Collect, column by column, the unique values of ``columns`` as strings.

        Values are unique within a column; the same string may appear again if
        it occurs in another column. ``columns=None`` yields an empty list.
        """
        if columns is None:
            return []
        # Only the requested columns are converted to strings, one at a time.
        uniques = [dataframe[col].map(str).unique() for col in columns]
        return self.serialize_list(uniques)

    def get_interactions_format(self):
        """
        :return: list of (user_id, item_id, weight)
            The interactions as triples, with the weight converted to float.
            A list rather than a generator is returned: per the original
            author's note the consumer needs the length of each datum.
        """
        frame = self.interactions_dataframe
        # Column-wise iteration keeps each column's own dtype, unlike
        # row-wise iteration which coerces a row to one common dtype.
        return [
            (user, item, float(weight))
            for user, item, weight in zip(
                frame[self.user_id_column],
                frame[self.item_id_column],
                frame[self.interaction_column],
            )
        ]

    @staticmethod
    def prepare_features_format(data, id, feature_columns):
        """Yield ``(row_id, [feature strings])`` for every row of ``data``.

        :param data: dataframe holding the id and feature columns
        :param id: name of the id column
        :param feature_columns: names of the feature columns (None means no features)
        """
        if feature_columns is None:
            feature_columns = []
        # Each column is stringified on its own, exactly as
        # ``get_uniques_by_columns`` does, so the strings produced here always
        # match the feature names collected there.
        feature_values = [data[feature].map(str) for feature in feature_columns]
        for row_id, *features in zip(data[id], *feature_values):
            yield (row_id, features)


class dataset_helper(Preprocessor):
    """Drive a LightFM ``Dataset`` from the dataframes held by ``Preprocessor``:
    fit the id/feature mappings, then build the interaction and feature matrices."""

    def __init__(
            self,
            users_dataframe=None,
            items_dataframe=None,
            interactions_dataframe=None,
            item_id_column="items",
            items_feature_columns: list = None,
            user_id_column="users",
            user_features_columns: list = None,
            interaction_column="interactions",
            clean_unknown_interactions=False,
            fix_columns_names=False,
    ):
        """
        :param users_dataframe: dataframe of users
        :param items_dataframe: dataframe of items
        :param interactions_dataframe: dataframe of user-item interactions
        :param item_id_column: name of the item id column
        :param items_feature_columns: names of the item feature columns
        :param user_id_column: name of the user id column
        :param user_features_columns: names of the user feature columns
        :param interaction_column: name of the interaction weight column
        :param clean_unknown_interactions:  remove unknown data ( what's in the interactions must be also in users
        and items dataframes )
        :param fix_columns_names: forwarded to ``Preprocessor``
        """
        self.data_ok = True

        super().__init__(
            users_dataframe,
            items_dataframe,
            interactions_dataframe,
            item_id_column,
            items_feature_columns,
            user_id_column,
            user_features_columns,
            interaction_column,
            fix_columns_names,
        )

        # Defined unconditionally so the instance exposes the same attributes
        # whether or not all three dataframes were supplied.
        self.dataset = Dataset()
        self.item_features_list = None
        self.user_features_list = None
        self.interactions = None
        self.weights = None
        self.done = False

        status = self.get_data_status()
        if False in status.values():
            print(
                "[!] Warning ,There is some missing Dataframe {}".format(status)
            )
            self.data_ok = False
        elif clean_unknown_interactions:
            self.clean_unknown_interactions_func()

    @staticmethod
    def fix_headers_names(data):
        """Return the names in ``data`` with "-" replaced by "_".

        Returns None when ``data`` is not iterable (for example None).
        """
        try:
            return [i.replace("-", "_") for i in data]
        except TypeError:
            return None

    def dataset_fit(self):
        """Fit the dataset mappings from the unique users, items and feature strings."""
        self.dataset.fit(
            users=self.get_unique_users(),
            items=self.get_unique_items(),
            item_features=self.get_unique_items_features(),
            user_features=self.get_unique_users_features(),
        )

    def build_interactions(self):
        """Build and store the interactions and weights matrices."""
        (self.interactions, self.weights) = self.dataset.build_interactions(
            self.get_interactions_format()
        )

    def build_item_features(self):
        """Build and store the item features matrix."""
        self.item_features_list = self.dataset.build_item_features(
            self.prepare_features_format(
                self.items_dataframe, self.item_id_column, self.items_feature_columns
            )
        )

    def build_user_features(self):
        """Build and store the user features matrix."""
        self.user_features_list = self.dataset.build_user_features(
            self.prepare_features_format(
                self.users_dataframe, self.user_id_column, self.user_features_columns
            )
        )

    def get_all_mappings(self):
        """Return (user id map, user feature map, item id map, item feature map)."""
        # Public accessor of the LightFM Dataset; same tuple, same order, as
        # reading the four private ``_*_mapping`` attributes.
        return self.dataset.mapping()

    @staticmethod
    def get_metadata(_id, dataframe, desired_column):
        """Return the rows of ``dataframe`` whose ``desired_column`` equals ``_id``
        as a list of JSON-compatible records, or None when nothing matches."""
        data = dataframe.loc[dataframe[desired_column] == _id]
        if len(data):
            return json.loads(data.to_json(orient="records"))
        return None

    def get_user_id_mapping(self):
        """External user id -> internal index mapping."""
        return self.dataset.mapping()[0]

    def get_item_id_mapping(self):
        """External item id -> internal index mapping."""
        return self.dataset.mapping()[2]

    def get_user_feature_mapping(self):
        """User feature name -> internal index mapping."""
        return self.dataset.mapping()[1]

    def get_item_feature_mapping(self):
        """Item feature name -> internal index mapping."""
        return self.dataset.mapping()[3]

    def routine(self):
        """Run the full pipeline: fit, interactions, user features, item features.

        :raises ValueError: when one of the three dataframes is missing or empty
        """
        if not self.data_ok:
            # ValueError is a subclass of Exception, so existing
            # ``except Exception`` handlers keep working.
            raise ValueError("Missing Dataframe {}".format(self.get_data_status()))
        self.dataset_fit()
        self.build_interactions()
        self.build_user_features()
        self.build_item_features()
        self.done = True



In [10]:
def create_lfm_input(df, user_col, item_col, rating_col, use_features=False, verbose=True):
    """Build the LightFM input matrices for the interactions in ``df``.

    User and item features are taken from the notebook-level ``df_users`` and
    ``df_items`` frames; every column after the first one is treated as a
    feature column.

    :param df: interactions dataframe (must hold ``user_col``, ``item_col``, ``rating_col``)
    :param user_col: name of the user id column
    :param item_col: name of the item id column
    :param rating_col: name of the interaction weight column
    :param use_features: currently unused; features are always built
    :param verbose: currently unused
    :return: tuple (interactions, weights, item_features, user_features)
    """
    helper = dataset_helper(
        users_dataframe=df_users,
        items_dataframe=df_items,
        # Use the frame that was passed in; the argument used to be ignored in
        # favour of the global `df_mod`.
        interactions_dataframe=df,
        item_id_column=item_col,
        items_feature_columns=df_items.columns[1:],
        user_id_column=user_col,
        user_features_columns=df_users.columns[1:],
        interaction_column=rating_col,
        clean_unknown_interactions=True,
        fix_columns_names=False,
    )

    # fit the mappings and build all matrices
    helper.routine()

    return (
        helper.interactions,
        helper.weights,
        helper.item_features_list,
        helper.user_features_list,
    )

In [11]:
# Build the model inputs from the signed-rating interactions.
inter, weights, items_arg, users_arg = create_lfm_input(
    df=df_mod,
    user_col='user_id',
    item_col='item_id',
    rating_col='rating',
    use_features=True,
)

In [12]:
# Train a WARP model on the weighted interaction matrix, using item features only.
model = LightFM(no_components=30, loss="warp", k=20)
model.fit(interactions=weights, item_features=items_arg, epochs=30, num_threads=4)

# NOTE(review): the AUC is computed on the same matrix the model was fitted
# on, so this is a training-set score, not a held-out estimate.
train_auc_per_user = auc_score(model, weights, item_features=items_arg, num_threads=4)
train_auc_per_user.mean()

0.62626976

In [13]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render the given DataFrames next to each other in the notebook output.

    :param args: objects exposing ``to_html()`` (e.g. pandas DataFrames)
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening <table ...> tags. Replacing every occurrence of
    # the word "table" would also rewrite the closing tags and any cell text
    # that happens to contain "table".
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

def recommend(user, k, model, item_features=None, user_features=None, verbose=False):
    """Score every item of ``df_items`` for one user and return the ``k`` best item ids.

    :param user: user identifier handed to ``model.predict``
    :param k: number of item ids to return
    :param model: object exposing ``predict(user, item_ids, item_features=..., user_features=...)``
    :param item_features: forwarded to ``model.predict``
    :param user_features: forwarded to ``model.predict``
    :param verbose: when True, print a header and display the top-k table
    :return: numpy array of the ``k`` highest-scored item ids, best first

    NOTE(review): ``user`` and the item ids are passed to ``model.predict``
    as-is; presumably they have to coincide with the model's internal
    indices -- confirm against the dataset id mappings.
    """
    candidates = df_items[['item_id']].copy()
    candidates['scores'] = model.predict(
        user,
        candidates['item_id'].values.tolist(),
        item_features=item_features,
        user_features=user_features,
    )

    top_k = candidates.sort_values(by='scores', ascending=False).iloc[:k]
    if verbose:
        print('User Id (' + str(user) + "): Recommended items: ")
        display(top_k)
    return top_k['item_id'].values
    

In [14]:
# Show the 10 best-scored items for user 166 (item features only, no user features).
recommend( 166, 10, model, item_features=items_arg, user_features=None, verbose=True)

User Id (166): Recommended items: 


Unnamed: 0,item_id,scores
0,35,1.577095
34,76,1.455033
273,67,1.266779
12,22,1.21975
6,37,1.047866
413,21,0.821522
108,65,0.776767
3,36,0.759567
149,17,0.749606
24,72,0.730247


array([35, 76, 67, 22, 37, 21, 65, 36, 17, 72])

In [15]:
# Timeline of likes per user:
# {user_id: [(liked_item_id, timestamp), ...]}
# Only rows with rating == 1 contribute; entries keep the row order of df_mod.
liked_rows = df_mod[df_mod['rating'] == 1]

user_preferences_timeline_dct = dict()
for _, liked in liked_rows.iterrows():
    user_preferences_timeline_dct.setdefault(liked['user_id'], []).append(
        (liked['item_id'], liked['timestamp'])
    )

In [16]:
def compute_test_answers(model, item_features=None, user_features=None, preferences_timeline=None,
                         n_candidates=100, top_k=20):
    """Build the submission table: one row of recommended item ids per test user.

    For every row of the notebook-level ``df_test`` the model's best
    ``n_candidates`` items are fetched through ``recommend``, items the user
    liked before the row's timestamp are dropped, and the first ``top_k``
    survivors are kept.

    :param model: trained model, forwarded to ``recommend``
    :param item_features: forwarded to ``recommend``
    :param user_features: forwarded to ``recommend``
    :param preferences_timeline: optional dict ``{user_id: [(item_id, timestamp), ...]}``
        of earlier likes; users absent from it simply have nothing filtered out
    :param n_candidates: how many recommendations to request before filtering
    :param top_k: how many item ids to keep per user
    :return: DataFrame indexed by ``user_id`` with one column per kept position;
        rows with fewer than ``top_k`` items are padded with NaN
    """
    user_predicted_lists = dict()

    # Column-wise iteration keeps user ids in their own dtype.
    for user_id, timestamp in zip(df_test['user_id'], df_test['timestamp']):

        # don't recommend items which the user has already liked (time[item_id] < timestamp)
        blacklist_set = set()
        if preferences_timeline is not None:
            blacklist_set = {
                t[0] for t in preferences_timeline.get(user_id, []) if t[1] < timestamp
            }

        # get recommendations for the user using the model
        all_recommendations = recommend(user_id, n_candidates, model, item_features, user_features)

        # erase blacklisted items, then keep the first top_k
        kept = [x for x in all_recommendations if x not in blacklist_set][:top_k]

        # NOTE(review): sorting by item id discards the model's ranking order;
        # kept as in the original -- confirm the submission format expects it.
        user_predicted_lists[user_id] = np.sort(kept).tolist()

    # Build the frame once, after the loop. `orient='index'` makes every dict
    # entry a row and tolerates rows of different length.
    df_submission = pd.DataFrame.from_dict(user_predicted_lists, orient='index')
    df_submission.index.name = 'user_id'

    return df_submission

In [17]:
# Compute the per-user recommendations for the test set (item features only),
# skipping items each user had already liked, and save them as the submission file.
df_subb = compute_test_answers(
    model,
    item_features=items_arg,
    user_features=None,
    preferences_timeline=user_preferences_timeline_dct,
)
df_subb.to_csv("submission_1608.csv")
df_subb

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
166,7,17,21,22,30,32,35,36,37,39,40,58,63,65,66,70,72,76,80,155
26,2,7,15,18,19,22,26,27,30,35,37,49,50,72,73,76,87,101,104,129
41,2,5,11,22,30,33,35,37,39,40,44,58,65,66,67,72,76,80,87,90
286,5,11,22,32,33,35,37,44,58,60,65,66,71,72,76,78,80,119,146,172
108,11,17,22,32,33,35,36,37,40,58,66,67,70,72,76,78,80,146,147,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,5,17,21,32,35,36,39,40,58,59,65,66,70,76,80,136,137,146,155,172
181,5,11,21,22,33,35,37,39,40,58,60,65,66,72,76,80,84,90,119,172
448,5,11,17,22,32,33,35,36,39,58,60,65,66,71,72,76,78,118,119,172
124,17,21,22,30,32,33,35,36,39,40,58,63,65,66,67,76,80,84,87,146
