In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate as cv
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import cross_validate
import random
from datetime import datetime
from sklearn.model_selection import KFold
from collections import defaultdict
#Para garantizar reproducibilidad en resultados
seed = 10
random.seed(seed)
np.random.seed(seed)

In [73]:
def get_top5_recommendations(predictions, topN = 5):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [33]:
ratings=pd.read_csv('ml-25m/ratings.csv',sep=',')
movies= pd.read_csv('ml-25m/movies.csv')
ratings['timestamp'] = ratings['timestamp'].apply(datetime.fromtimestamp)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 10:34:04
1,1,306,3.5,2006-05-17 07:26:57
2,1,307,5.0,2006-05-17 07:27:08
3,1,665,5.0,2006-05-17 10:13:40
4,1,899,3.5,2006-05-17 07:21:50
...,...,...,...,...
25000090,162541,50872,4.5,2009-04-28 16:16:12
25000091,162541,55768,2.5,2009-04-28 15:53:18
25000092,162541,56176,2.0,2009-04-28 15:31:37
25000093,162541,58559,4.0,2009-04-28 16:17:14


In [34]:
movie_data_ratings_data=movies.merge(ratings,on = 'movieId',how = 'inner')
movie_data_ratings_data.head(3)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,2006-03-03 14:57:00
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,2015-08-13 08:23:35
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,2019-11-16 17:44:12


In [35]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(62423, 191)

#Cosine similarity

In [7]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.32107747, 0.06609706, 0.05824133],
       [0.32107747, 1.        , 0.        , 0.        ],
       [0.06609706, 0.        , 1.        , 0.36875378],
       [0.05824133, 0.        , 0.36875378, 1.        ]])

In [76]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

Ver las recomendaciones top

In [9]:
genre_recommendations('Good Will Hunting (1997)').head(20)

24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
34                                   Carrington (1995)
45                How to Make an American Quilt (1995)
48                        When Night Is Falling (1995)
73                                 Bed of Roses (1996)
82     Once Upon a Time... When We Were Colored (1995)
84                           Angels and Insects (1995)
103              Bridges of Madison County, The (1995)
129                           Frankie Starlight (1995)
138                       Up Close and Personal (1996)
177                                    Mad Love (1995)
180                     Moonlight and Valentino (1995)
189                         Scarlet Letter, The (1995)
200                               Total Eclipse (1995)
205                       Walk in the Clouds, A (1995)
213                              Before Sunrise (1995)
219                           Circle of Friends (1995)
246       

In [10]:
genre_recommendations('Toy Story (1995)').head(20)

2203                                           Antz (1998)
3021                                    Toy Story 2 (1999)
3653        Adventures of Rocky and Bullwinkle, The (2000)
3912                      Emperor's New Groove, The (2000)
4780                                 Monsters, Inc. (2001)
9949     DuckTales: The Movie - Treasure of the Lost La...
10773                                     Wild, The (2006)
11604                               Shrek the Third (2007)
12969                       Tale of Despereaux, The (2008)
17431    Asterix and the Vikings (Astérix et les Viking...
20015                                         Turbo (2013)
22286                                       Aladdin (1992)
22353                                Boxtrolls, The (2014)
22633            Toy Story Toons: Hawaiian Vacation (2011)
22634                    Toy Story Toons: Small Fry (2011)
23212                             The Magic Crystal (2011)
28245                                Brother Bear 2 (200

In [11]:
genre_recommendations('Saving Private Ryan (1998)').head(20)

460                         Heaven & Earth (1993)
1177                        Apocalypse Now (1979)
1201                 Boot, Das (Boat, The) (1981)
1838        All Quiet on the Western Front (1930)
1939                   Saving Private Ryan (1998)
2336                    Thin Red Line, The (1998)
2852                      Dirty Dozen, The (1967)
2969                      Longest Day, The (1962)
2973                     Tora! Tora! Tora! (1970)
3345                              Red Dawn (1984)
3421                Force 10 from Navarone (1978)
3544                 Fighting Seabees, The (1944)
3652                          Patriot, The (2000)
4852                    Behind Enemy Lines (2001)
4895                                Sahara (1943)
4904                       Black Hawk Down (2001)
5046                      We Were Soldiers (2002)
5312                           Windtalkers (2002)
5787                                  Zulu (1964)
5803    Victory (a.k.a. Escape to Victory) (1981)


In [12]:
## Filtrado colaborativo

In [38]:
# Fill NaN values in user_id and movie_id column with 0
ratings['userId'] = ratings['userId'].fillna(0)
ratings['movieId'] = ratings['movieId'].fillna(0)

# Replace NaN values in rating column with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 10:34:04
1,1,306,3.5,2006-05-17 07:26:57
2,1,307,5.0,2006-05-17 07:27:08
3,1,665,5.0,2006-05-17 10:13:40
4,1,899,3.5,2006-05-17 07:21:50
...,...,...,...,...
25000090,162541,50872,4.5,2009-04-28 16:16:12
25000091,162541,55768,2.5,2009-04-28 15:53:18
25000092,162541,56176,2.0,2009-04-28 15:31:37
25000093,162541,58559,4.0,2009-04-28 16:17:14


In [9]:
ratings=ratings.sort_values(by='timestamp')

In [45]:
reviews=ratings[['userId','movieId','rating','timestamp']].sort_values(by='timestamp').copy()
ratings['user_id_simple'] = pd.factorize(reviews.userId)[0]
ratings['movie_id_simple'] = pd.factorize(reviews.movieId)[0]
reviews['user_id_simple'] = pd.factorize(reviews.userId)[0]
reviews['movie_id_simple'] = pd.factorize(reviews.movieId)[0]
reviews.tail(10)

Unnamed: 0,userId,movieId,rating,timestamp,user_id_simple,movie_id_simple
13207853,85523,59315,4.0,2019-11-21 04:09:53,162540,9856
13207872,85523,122912,4.5,2019-11-21 04:09:59,162540,25055
13207855,85523,60069,4.5,2019-11-21 04:10:06,162540,9920
13207885,85523,195165,3.0,2019-11-21 04:10:24,162540,51847
13207868,85523,109487,4.5,2019-11-21 04:10:45,162540,21683
13207880,85523,168252,4.0,2019-11-21 04:11:19,162540,38256
1284508,8642,202101,2.0,2019-11-21 04:11:52,103213,55366
13207893,85523,204704,4.0,2019-11-21 04:12:13,162540,56598
1284464,8642,122914,4.0,2019-11-21 04:12:29,103213,23909
13207877,85523,149406,4.5,2019-11-21 04:15:03,162540,30127


In [11]:
reviews.shape
reviews

Unnamed: 0,userId,movieId,rating,user_id_simple,movie_id_simple
326761,2262,21,3.0,0,0
326810,2262,1079,3.0,0,1
326767,2262,47,5.0,0,2
15845015,102689,1,4.0,1,3
15845023,102689,39,5.0,1,4
...,...,...,...,...,...
13207880,85523,168252,4.0,162540,38256
1284508,8642,202101,2.0,103213,55366
13207893,85523,204704,4.0,162540,56598
1284464,8642,122914,4.0,103213,23909


In [12]:
# getting the number unique users and restaurants
unique_users = reviews.userId.unique().shape[0]
unique_restaurants = reviews.movieId.unique().shape[0]

reader = Reader( rating_scale = ( 0, 5 ) )
reviews.head()

Unnamed: 0,userId,movieId,rating,user_id_simple,movie_id_simple
326761,2262,21,3.0,0,0
326810,2262,1079,3.0,0,1
326767,2262,47,5.0,0,2
15845015,102689,1,4.0,1,3
15845023,102689,39,5.0,1,4


In [22]:
# Randomly sample dataset too big
small_data = reviews.sample(frac=0.0010)
# Check the sample info
small_data= small_data[['user_id_simple', 'movie_id_simple', 'rating']]
small_data

Unnamed: 0,user_id_simple,movie_id_simple,rating
4509235,83996,4091,3.0
13020005,14975,127,3.0
22300239,68127,2220,5.0
13900712,135958,237,4.0
19848246,36169,1159,3.0
...,...,...,...
8319143,146080,34609,3.0
21612408,29212,1166,4.0
1554911,9859,291,3.0
7175707,158194,81,5.0


In [66]:
surprise_dataset = Dataset.load_from_df( small_data[ ["user_id_simple","movie_id_simple","rating"] ], reader )
train_set, test_set=  train_test_split(surprise_dataset, test_size=.3)

In [71]:

sim_options = {'name': 'cosine',
               'user_based': True  # calcule similitud user_user
               }
for i in (3,50,100):
    algo1 = KNNBasic(k=i, min_k=3, sim_options=sim_options)
    algo1.fit(train_set)
    predictions1 = algo1.test(test_set)
    print(accuracy.rmse(predictions1))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0610
1.0610382495814323
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0610
1.0610382495814323
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0610
1.0610382495814323


In [81]:
top5_recommendations = get_top5_recommendations(predictions1)
#top5_recommendations
predictions1

[Prediction(uid=93394, iid=2981, r_ui=3.0, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=124070, iid=18289, r_ui=4.0, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=129000, iid=4257, r_ui=2.5, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=151119, iid=8541, r_ui=2.5, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=105878, iid=1131, r_ui=5.0, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid=99427, iid=1708, r_ui=3.0, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid=77228, iid=374, r_ui=3.5, est=3.5343428571428572, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Predicti

In [None]:
##Precision@k = (# of recommended items @k that are relevant) / (# of recommended items @k)
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
x1=[x.est for x in predictions]
x2=[x.est for x in predictions1]

## Insertar BD

In [None]:
# getting the number unique users and restaurants
unique_users = reviews_all.user_id.unique().shape[0]
unique_restaurants = reviews_all.business_id.unique().shape[0]
current_reviews= reviews_all[ ["user_id_simple","business_id_simple","stars"] ]
#Create two user-item matrices, one for training and another for testing
matrix_all = np.zeros((unique_users, unique_restaurants))

for line in current_reviews.itertuples():
    #print(line[1])
    matrix_all[line[1], line[2]] = line[3]
matrix_all

In [None]:
!pip install mysql
!pip install mysql.connector
!pip install mysql-connector-python-rf
import mysql.connector
from mysql.connector import errorcode
#Configuración de la conexión a Mysql
try:
  cnx = mysql.connector.connect(user='user_taller3', password='taller3.', host='127.0.0.1', database='taller3')
except mysql.connector.Error as err:
  if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
    print("Something is wrong with your user name or password")
  elif err.errno == errorcode.ER_BAD_DB_ERROR:
    print("Database does not exist")
  else:
    print(err)

cursor = cnx.cursor()

In [None]:
sql = "INSERT INTO `recomendations_movies` (`user_id`, `movies_id`, `recomendation_score`) VALUES (%s, %s, %s)"


for i in range(0,unique_users):
    user_id = reviews_all.loc[reviews_all.user_id_simple==i].user_id.unique()[0]
    for j in range(0, unique_restaurants):
        if matrix_all[i][j] == 0:    
            star2=algo1.predict(i, j, r_ui=None, verbose=True)
            #print(star2.est)
            star1=algo2.predict(i, j, r_ui=None, verbose=True)
            #print(star1.est)
            #print(star.est)
            if int(star1.est) == int(star2.est):
                business_id =reviews_all.loc[reviews_all.business_id_simple==j].business_id.unique()[0]
                #print(user_id, business_id)
                print("iguallll")
                cursor.execute(sql, (user_id,business_id,int(star1.est)))
                cnx.commit()