# 🐂 Sistema de Recomendación Multi-Objetivo 🐂
vamos a tratar de crear un sistema de recomendación de items usando el modelo"Collaborative-Filtering"

## Primero importamos las librerias

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
import time

## Cargamos los datos

In [2]:
df = pd.read_csv('/kaggle/input/bullbull/chunk.csv')
df.head()

Unnamed: 0,session,aid,timestamp,even_type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks


In [3]:
#filtramos los datos duplicados
fdf=df.drop(columns='timestamp')
fdf=df.drop_duplicates(['aid','session','even_type'])
fdf

Unnamed: 0,session,aid,timestamp,even_type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
77735251,1998,402071,1661663734719,clicks
77735252,1998,162703,1661663766015,clicks
77808829,1999,877858,1659304839347,clicks
77808830,1999,1779098,1660842975043,clicks


In [4]:
#Agregamos rating 5 de forma temporal, más adelante buscaremos una forma de calificar mejor
fdf = fdf.assign(rating=5)
fdf

Unnamed: 0,session,aid,timestamp,even_type,rating
0,0,1517085,1659304800025,clicks,5
1,0,1563459,1659304904511,clicks,5
2,0,1309446,1659367439426,clicks,5
3,0,16246,1659367719997,clicks,5
4,0,1781822,1659367871344,clicks,5
...,...,...,...,...,...
77735251,1998,402071,1661663734719,clicks,5
77735252,1998,162703,1661663766015,clicks,5
77808829,1999,877858,1659304839347,clicks,5
77808830,1999,1779098,1660842975043,clicks,5


## Creamos al usuario 0
Vamos a usar cada session como usuarios independientes, ya que el sistema guarda las sesiones de forma anónima.

In [5]:
user0=fdf.loc[fdf['session']==0]
user0

Unnamed: 0,session,aid,timestamp,even_type,rating
0,0,1517085,1659304800025,clicks,5
1,0,1563459,1659304904511,clicks,5
2,0,1309446,1659367439426,clicks,5
3,0,16246,1659367719997,clicks,5
4,0,1781822,1659367871344,clicks,5
...,...,...,...,...,...
271,0,843110,1661684298768,clicks,5
272,0,938007,1661684355390,clicks,5
273,0,1228848,1661684528943,clicks,5
274,0,1740927,1661684942173,clicks,5


## Buscamos las sesiones con aid similares

In [6]:
snts=fdf[fdf['aid'].isin(user0["aid"].values.tolist())]
snts=snts.loc[snts['session']!=0]
snts

Unnamed: 0,session,aid,timestamp,even_type,rating
7728,11,1110548,1659901731590,clicks,5
7729,11,1145803,1659901833217,clicks,5
7731,11,1145803,1659901876053,carts,5
7737,11,1145803,1659902394985,orders,5
29537,24,1498443,1659741855901,clicks,5
...,...,...,...,...,...
76064661,1975,321397,1660250174592,carts,5
77223207,1991,1813509,1660111596592,clicks,5
77223208,1991,1813509,1660111604755,carts,5
77223210,1991,1840418,1660200931078,clicks,5


In [7]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
sntsGroup = snts.groupby(['session'])
#vemos uno de los sessions, por ejemplo el 11
sntsGroup.get_group(11)

Unnamed: 0,session,aid,timestamp,even_type,rating
7728,11,1110548,1659901731590,clicks,5
7729,11,1145803,1659901833217,clicks,5
7731,11,1145803,1659901876053,carts,5
7737,11,1145803,1659902394985,orders,5


In [8]:
#Sorting it so users with movie most in common with the input will have priority
sntsGroup = sorted(sntsGroup,  key=lambda x: len(x[1]), reverse=True)
sntsGroup[0:3]

[(1663,
            session      aid      timestamp even_type  rating
  57379340     1663   851778  1659371613493    clicks       5
  57379363     1663   442293  1659372636907    clicks       5
  57379364     1663   702179  1659372676234    clicks       5
  57379372     1663   856506  1659372902824    clicks       5
  57379385     1663   974651  1659377285194    clicks       5
  57379421     1663   394655  1659649812731    clicks       5
  57379460     1663  1469630  1659971909085    clicks       5),
 (813,
            session      aid      timestamp even_type  rating
  27442007      813   667924  1659304813248    clicks       5
  27442010      813  1693461  1659340669648    clicks       5
  27442043      813  1650637  1659690576981    clicks       5
  27442137      813   959208  1660923463705    clicks       5
  27442160      813   171982  1661606458900    clicks       5
  27442161      813   171982  1661606470013     carts       5),
 (1182,
            session      aid      timestamp

In [9]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in sntsGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='aid')
    user0df = user0.sort_values(by='aid')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = user0[user0['aid'].isin(group['aid'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
#Esta iteración va a botar las similitudes con cada session, lo malo es que como todos tienen rating 5, se normalizan las datas y se convierten en 1.
#al final tendríamos todos los similarity index en 0 hasta que le pongamos un valor valido en rating

In [10]:
pearsonCorrelationDict.items()

dict_items([(1663, 0), (813, 0), (1182, 0), (1186, 0), (1738, 0), (80, 0), (253, 0), (652, 0), (1048, 0), (1215, 0), (1385, 0), (1643, 0), (1669, 0), (11, 0), (546, 0), (558, 0), (959, 0), (1018, 0), (1230, 0), (1269, 0), (1535, 0), (1597, 0), (1737, 0), (1766, 0), (1822, 0), (1830, 0), (1991, 0), (24, 0), (90, 0), (94, 0), (135, 0), (137, 0), (263, 0), (491, 0), (518, 0), (643, 0), (741, 0), (764, 0), (887, 0), (894, 0), (927, 0), (939, 0), (971, 0), (1025, 0), (1173, 0), (1184, 0), (1411, 0), (1420, 0), (1484, 0), (1625, 0), (1642, 0), (1646, 0), (1735, 0), (1771, 0), (1889, 0), (1919, 0), (1949, 0), (37, 0), (65, 0), (72, 0), (128, 0), (203, 0), (305, 0), (316, 0), (336, 0), (358, 0), (384, 0), (448, 0), (452, 0), (561, 0), (614, 0), (631, 0), (642, 0), (664, 0), (785, 0), (796, 0), (807, 0), (868, 0), (886, 0), (903, 0), (917, 0), (942, 0), (955, 0), (957, 0), (1017, 0), (1021, 0), (1024, 0), (1046, 0), (1072, 0), (1077, 0), (1233, 0), (1264, 0), (1304, 0), (1312, 0), (1330, 0), (1

In [11]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['session'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

#Sale 0 por el hecho de que sólo existen 5 de rating, se explica mejor en la iteración creada arriba

Unnamed: 0,similarityIndex,session
0,0,1663
1,0,813
2,0,1182
3,0,1186
4,0,1738


In [12]:
#Conseguir el top 50 de sessions similares al input
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,session
0,0,1663
184,0,800
170,0,606
171,0,663
172,0,675


In [13]:
topUsersRating=topUsers.merge(fdf, left_on='session', right_on='session', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,session,aid,timestamp,even_type,rating
0,0,1663,1095105,1659304830654,clicks,5
1,0,1663,931369,1659304883366,clicks,5
2,0,1663,1082983,1659304931429,clicks,5
3,0,1663,1561450,1659304952343,clicks,5
4,0,1663,25082,1659305007542,clicks,5


In [14]:
#Multiplicamos el similarity index por el rating del usuario
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,session,aid,timestamp,even_type,rating,weightedRating
0,0,1663,1095105,1659304830654,clicks,5,0
1,0,1663,931369,1659304883366,clicks,5,0
2,0,1663,1082983,1659304931429,clicks,5,0
3,0,1663,1561450,1659304952343,clicks,5,0
4,0,1663,25082,1659305007542,clicks,5,0


In [15]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('aid').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

#Va a salir todo 0 porque se multiplica por 0 XD

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
1178,0,0
1415,0,0
1473,0,0
1830,0,0
3382,0,0


In [16]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['aid'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,aid
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
1178,,1178
1415,,1415
1473,,1473
1830,,1830
3382,,3382


In [17]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,aid
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
1178,,1178
1415,,1415
1473,,1473
1830,,1830
3382,,3382
3628,,3628
3663,,3663
3979,,3979
4202,,4202
4203,,4203
