# 🐂 Sistema de Recomendación Multi-Objetivo 🐂
vamos a tratar de crear un sistema de recomendación de items usando el modelo"Collaborative-Filtering"

## Primero importamos las librerias

In [47]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
import time

## Cargamos los datos

In [2]:
df = pd.read_csv('/kaggle/input/bullbull/chunk.csv')
df.head()

Unnamed: 0,session,aid,timestamp,even_type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
77808827,1998,402071,1661663734719,clicks
77808828,1998,162703,1661663766015,clicks
77808829,1999,877858,1659304839347,clicks
77808830,1999,1779098,1660842975043,clicks


In [40]:
#filtramos los datos duplicados
fdf=df.drop(columns='timestamp')
fdf=df.drop_duplicates(['aid','session','even_type'])
fdf

Unnamed: 0,session,aid,timestamp,even_type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
77735251,1998,402071,1661663734719,clicks
77735252,1998,162703,1661663766015,clicks
77808829,1999,877858,1659304839347,clicks
77808830,1999,1779098,1660842975043,clicks


In [41]:
#agregamos nuevo tipo de rating
fdf['even_type']=fdf['even_type'].replace(['clicks','carts','orders'],[1,2,3])
fdf.rename(columns={'even_type':'rating'},inplace=True)
fdf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,session,aid,timestamp,rating
0,0,1517085,1659304800025,1
1,0,1563459,1659304904511,1
2,0,1309446,1659367439426,1
3,0,16246,1659367719997,1
4,0,1781822,1659367871344,1


## Creamos al usuario 0
Vamos a usar cada session como usuarios independientes, ya que el sistema guarda las sesiones de forma anónima.

In [42]:
user0=fdf.loc[fdf['session']==0]
user0

Unnamed: 0,session,aid,timestamp,rating
0,0,1517085,1659304800025,1
1,0,1563459,1659304904511,1
2,0,1309446,1659367439426,1
3,0,16246,1659367719997,1
4,0,1781822,1659367871344,1
...,...,...,...,...
271,0,843110,1661684298768,1
272,0,938007,1661684355390,1
273,0,1228848,1661684528943,1
274,0,1740927,1661684942173,1


## Buscamos las sesiones con aid similares

In [43]:
snts=fdf[fdf['aid'].isin(user0["aid"].values.tolist())]
snts=snts.loc[snts['session']!=0]
snts

Unnamed: 0,session,aid,timestamp,rating
7728,11,1110548,1659901731590,1
7729,11,1145803,1659901833217,1
7731,11,1145803,1659901876053,2
7737,11,1145803,1659902394985,3
29537,24,1498443,1659741855901,1
...,...,...,...,...
76064661,1975,321397,1660250174592,2
77223207,1991,1813509,1660111596592,1
77223208,1991,1813509,1660111604755,2
77223210,1991,1840418,1660200931078,1


In [44]:
#Groupby Crea varios subdataframes en el que todos tienen las mismas columnas
sntsGroup = snts.groupby(['session'])
#vemos uno de los sessions, por ejemplo el 11
sntsGroup.get_group(11)

Unnamed: 0,session,aid,timestamp,rating
7728,11,1110548,1659901731590,1
7729,11,1145803,1659901833217,1
7731,11,1145803,1659901876053,2
7737,11,1145803,1659902394985,3


In [45]:
#ordenar por los usuarios con más cantidad de articles en común
sntsGroup = sorted(sntsGroup,  key=lambda x: len(x[1]), reverse=True)
sntsGroup[0:3]

[(1663,
            session      aid      timestamp  rating
  57379340     1663   851778  1659371613493       1
  57379363     1663   442293  1659372636907       1
  57379364     1663   702179  1659372676234       1
  57379372     1663   856506  1659372902824       1
  57379385     1663   974651  1659377285194       1
  57379421     1663   394655  1659649812731       1
  57379460     1663  1469630  1659971909085       1),
 (813,
            session      aid      timestamp  rating
  27442007      813   667924  1659304813248       1
  27442010      813  1693461  1659340669648       1
  27442043      813  1650637  1659690576981       1
  27442137      813   959208  1660923463705       1
  27442160      813   171982  1661606458900       1
  27442161      813   171982  1661606470013       2),
 (1182,
            session      aid      timestamp  rating
  42423110     1182  1537907  1660206443685       1
  42423121     1182  1070142  1660207098719       1
  42423122     1182  1070142  1660207

In [48]:
#Guardar el pearson correlation en un diccionario,en el que el key es el User ID y el value sería el coefficient
pearsonCorrelationDict = {}
from numpy import sqrt 

#va a iterar por cada sntsgroup en el subset

for name, group in sntsGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='aid')
    user0df = user0.sort_values(by='aid')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = user0[user0['aid'].isin(group['aid'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
#Esta iteración va a botar las similitudes con cada session, lo malo es que como todos tienen rating 5, se normalizan las datas y se convierten en 1.
#al final tendríamos todos los similarity index en 0 hasta que le pongamos un valor valido en rating

In [49]:
pearsonCorrelationDict.items()

dict_items([(1663, 0), (813, 0.20000000000000032), (1182, -0.24999999999999972), (1186, 0.7559289460184544), (1738, 0.4629100498862757), (80, -0.9185586535436918), (253, 0), (652, 0.6123724356957947), (1048, 0.2500000000000003), (1215, 0.6123724356957947), (1385, -0.16666666666666688), (1643, 0.2500000000000003), (1669, 0.2500000000000003), (11, -0.9045340337332909), (546, 0), (558, -0.9045340337332909), (959, 0), (1018, -1.0), (1230, -0.9045340337332909), (1269, 0.3333333333333333), (1535, -1.0), (1597, 0.3333333333333333), (1737, -1.0), (1766, 0.3333333333333333), (1822, 0.3333333333333333), (1830, 0.0), (1991, 0.0), (24, 0), (90, -0.8660254037844385), (94, 0), (135, -0.8660254037844385), (137, 0), (263, 0.5000000000000001), (491, 0), (518, -0.9999999999999996), (643, -0.8660254037844385), (741, 1.0), (764, 0), (887, -0.9999999999999996), (894, -0.9999999999999996), (927, -0.8660254037844385), (939, -0.9999999999999996), (971, 0), (1025, 0.5000000000000001), (1173, 0), (1184, 0), (14

In [50]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['session'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,session
0,0.0,1663
1,0.2,813
2,-0.25,1182
3,0.755929,1186
4,0.46291,1738


## Top X similar sessions al usuario 0
vamos a conseguir el top 50 de los usuarios con mayor similarity index al input

In [51]:
#Conseguir el top 50 de sessions similares al input
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,session
70,1.0,614
36,1.0,741
3,0.755929,1186
7,0.612372,652
9,0.612372,1215


Ahora vamos a empezar a recomendar aids al user 0

## Cómo elegimos el orden de las recomendaciones
Hacemos un merge de similarity index con la tabla que tenga los ratings y los vamos a multiplicar para conseguir primero los ratings más altos.
vamos a suponer que ya realizamos la normalización de los ratings antes (es mentira, no lo hicimos, pero fé que funciona igual).
y debería salir con más peso los aids que tengan mayor relación ratin/similarity index

In [52]:
topUsersRating=topUsers.merge(fdf, left_on='session', right_on='session', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,session,aid,timestamp,rating
0,1.0,614,953931,1659304810222,1
1,1.0,614,230892,1659304828908,1
2,1.0,614,1797353,1659304871127,1
3,1.0,614,607671,1659304893022,1
4,1.0,614,1258280,1659304987852,1


In [53]:
#Multiplicamos el similarity index por el rating del usuario
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,session,aid,timestamp,rating,weightedRating
0,1.0,614,953931,1659304810222,1,1.0
1,1.0,614,230892,1659304828908,1,1.0
2,1.0,614,1797353,1659304871127,1,1.0
3,1.0,614,607671,1659304893022,1,1.0
4,1.0,614,1258280,1659304987852,1,1.0


In [58]:
#Los sumamos, no entiendo muy bien porqué pero lo vi en internet 
tempTopUsersRating = topUsersRating.groupby('aid').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
114,0.612372,0.612372
1178,0.0,0.0
1559,0.0,0.0
1830,0.0,0.0
2298,0.46291,0.46291


In [55]:
#creamos un df
recommendation_df = pd.DataFrame()
#insertamos el promedio de los weight
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['aid'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,aid
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
114,1.0,114
1178,,1178
1559,,1559
1830,,1830
2298,1.0,2298


In [56]:
#Hacemos un sort y debería salir el top de las recomendaciones basados en los usuarios similares y sus ratings

recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,aid
aid,Unnamed: 1_level_1,Unnamed: 2_level_1
285270,2.5,285270
274477,2.5,274477
1627616,2.5,1627616
1136878,2.5,1136878
1153909,2.0,1153909
1282013,2.0,1282013
855399,2.0,855399
188896,2.0,188896
463679,2.0,463679
871239,2.0,871239
