In [1]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds
import pandas as pd
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [2]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Merging the ratings and movie data on the movieId column

In [3]:
data = pd.merge(ratings, movies, on="movieId")
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama
...,...,...,...,...,...,...
105334,668,140098,2.5,1450415424,Runoff (2015),Drama
105335,668,140816,2.5,1443288791,Tangerine (2015),Comedy|Drama
105336,668,141472,2.5,1442679119,The 50 Year Argument (2014),(no genres listed)
105337,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [4]:
data.shape

(105339, 6)

In [5]:
data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


Checking for Null Values

In [6]:
data.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [7]:
data.nunique()

userId         668
movieId      10325
rating          10
timestamp    84686
title        10323
genres         938
dtype: int64

Counting the ratings of all movies

In [8]:
data.groupby('title')['rating'].count().sort_values(ascending=False).head()

title
Pulp Fiction (1994)                 325
Forrest Gump (1994)                 311
Shawshank Redemption, The (1994)    308
Jurassic Park (1993)                294
Silence of the Lambs, The (1991)    290
Name: rating, dtype: int64

Creating a dataframe with mean 'rating' and total number of ratings for each movie

In [9]:
ratings = pd.DataFrame(data.groupby('title')['rating'].mean())
ratings['Number of Ratings'] = pd.DataFrame(data.groupby('title')['rating'].count())
ratings.head()

Unnamed: 0_level_0,rating,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),3.5,1
'Hellboy': The Seeds of Creation (2004),3.0,1
'Round Midnight (1986),2.5,1
'Til There Was You (1997),4.0,3
"'burbs, The (1989)",3.125,20


Sorting values according to the 'num of rating column'

In [10]:
pivot_movie = data.pivot_table(index ='userId',columns ='title', values ='rating')
pivot_movie.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [11]:
ratings.sort_values('Number of Ratings', ascending = False).head(5)

Unnamed: 0_level_0,rating,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Pulp Fiction (1994),4.16,325
Forrest Gump (1994),4.138264,311
"Shawshank Redemption, The (1994)",4.454545,308
Jurassic Park (1993),3.659864,294
"Silence of the Lambs, The (1991)",4.194828,290


Analyzing correlation with similar movies

In [12]:
alice_user_ratings = pivot_movie['Pulp Fiction (1994)']
ttm_user_ratings = pivot_movie['Silence of the Lambs, The (1991)']
alice_user_ratings.head()

userId
1    4.0
2    NaN
3    5.0
4    4.0
5    NaN
Name: Pulp Fiction (1994), dtype: float64

Analyzing Correlation with Similar Movies

In [13]:
similar_to_alice = pivot_movie.corrwith(alice_user_ratings)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


'corrwith' computes pairwise correlation between rows and columns of two dataframes

In [14]:
similar_to_ttm = pivot_movie.corrwith(ttm_user_ratings)
similar_to_alice.head()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


title
'71 (2014)                                      NaN
'Hellboy': The Seeds of Creation (2004)         NaN
'Round Midnight (1986)                          NaN
'Til There Was You (1997)                  1.000000
'burbs, The (1989)                        -0.475997
dtype: float64

In [15]:
corr_alice = pd.DataFrame(similar_to_alice, columns =['Correlation'])
corr_alice.dropna(inplace = True)
corr_alice.head()

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
'Til There Was You (1997),1.0
"'burbs, The (1989)",-0.475997
(500) Days of Summer (2009),0.076894
*batteries not included (1987),0.881917
...And Justice for All (1979),0.961524


In [16]:
corr_alice = corr_alice.join(ratings['Number of Ratings'])
corr_alice.head()

Unnamed: 0_level_0,Correlation,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),1.0,3
"'burbs, The (1989)",-0.475997,20
(500) Days of Summer (2009),0.076894,37
*batteries not included (1987),0.881917,11
...And Justice for All (1979),0.961524,10


Similar movies like Pulp Fiction (1994)

In [17]:
corr_alice[corr_alice['Number of Ratings']>2].sort_values('Correlation', ascending = False).head(10)

Unnamed: 0_level_0,Correlation,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Curse of the Jade Scorpion, The (2001)",1.0,5
Captain Ron (1992),1.0,5
Semi-Pro (2008),1.0,5
"Train, The (1964)",1.0,3
Let It Ride (1989),1.0,3
Crocodile Dundee in Los Angeles (2001),1.0,8
"Long Riders, The (1980)",1.0,3
Scooby-Doo 2: Monsters Unleashed (2004),1.0,4
"Counterfeiters, The (Die Fälscher) (2007)",1.0,5
Safety Not Guaranteed (2012),1.0,4


Movies similar to The Silence of the Lambs, The (1991)

In [18]:
corr_ttm = pd.DataFrame(similar_to_ttm, columns =['Correlation'])
corr_ttm.dropna(inplace = True)
corr_ttm = corr_ttm.join(ratings['Number of Ratings'])
corr_ttm.head()

Unnamed: 0_level_0,Correlation,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),1.0,3
"'burbs, The (1989)",-0.22809,20
(500) Days of Summer (2009),0.199371,37
*batteries not included (1987),0.803913,11
...And Justice for All (1979),-0.30429,10


In [19]:
corr_ttm[corr_ttm['Number of Ratings']>2].sort_values('Correlation', ascending = False).head(10)

Unnamed: 0_level_0,Correlation,Number of Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Passion of Joan of Arc, The (Passion de Jeanne d'Arc, La) (1928)",1.0,3
Communion (1989),1.0,3
Raiders of the Lost Ark: The Adaptation (1989),1.0,4
Psycho Beach Party (2000),1.0,3
Prince of the City (1981),1.0,3
Priest (2011),1.0,3
"Addiction, The (1995)",1.0,3
Washington Square (1997),1.0,5
Pootie Tang (2001),1.0,3
"Pier, The (Jetée, La) (1962)",1.0,4


In [20]:
!pip install ipywidgets



In [21]:
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Button, HBox, VBox

In [22]:
def get_recommendations(user_id, movie_id, num_recs):
    # generating recommendations 
    recommendations = []
    num_recs = int(num_recs)
    for i in range(num_recs):
        recommendations.append(f"Recommendation {i+1}")
    return recommendations

In [23]:
def create_gui():
    # Creating input fields for user input
    user_id_input = widgets.Text(description="userId:")
    movie_id_input = widgets.Text(description="movieId:")
    num_recs_input = widgets.Text(description="Number of recommendations:")

    # Creating a button for submitting the input
    submit_button = Button(description="Get Recommendations")

    # Defining a function to be called when the button is clicked
    def on_button_clicked(b):
        # Getting the user input
        user_id = user_id_input.value
        movie_id = movie_id_input.value
        num_recs = num_recs_input.value

        # Calling the recommendation module with the user input
        recommendations = get_recommendations(user_id, movie_id, num_recs)

        # Display the recommendations
        display(recommendations)

    # Attaching the button click event to the on_button_clicked function
    submit_button.on_click(on_button_clicked)

    # Display the input fields and button
    display(VBox([user_id_input, movie_id_input, num_recs_input, submit_button]))

In [24]:
create_gui()

VBox(children=(Text(value='', description='userId:'), Text(value='', description='movieId:'), Text(value='', d…

['Recommendation 1',
 'Recommendation 2',
 'Recommendation 3',
 'Recommendation 4']