In [1]:
import pandas as pd
import numpy as np
import re
from itertools import combinations
from scipy.sparse.linalg import svds
from sklearn.neighbors import KNeighborsRegressor
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score, mean_squared_error

## **Popularity-Based Recommender**

In [2]:
# Load the three input tables from CSV files in the current working directory:
# movie metadata (movieId, title, genres), free-text user tags, and user ratings.
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head(25)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Per-movie popularity statistics: number of ratings received and their mean,
# ordered from the most-rated movie downwards (ties broken by higher mean).
df = (
    ratings
    .groupby('movieId')
    .agg(count=('userId', 'count'), rating_mean=('rating', 'mean'))
    .sort_values(by=['count', 'rating_mean'], ascending=False)
)

In [7]:
df.head()

Unnamed: 0_level_0,count,rating_mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,329,4.164134
318,317,4.429022
296,307,4.197068
593,279,4.16129
2571,278,4.192446


In [8]:
# Popularity-based recommendation: keep movies with at least 100 ratings and
# rank them by average rating, best first.
df_recomm = df.loc[df['count'].ge(100)].sort_values(by='rating_mean', ascending=False)
df_recomm

Unnamed: 0_level_0,count,rating_mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,317,4.429022
858,192,4.289062
2959,218,4.272936
1221,129,4.259690
48516,107,4.252336
...,...,...
185,112,3.040179
434,101,3.034653
586,116,2.995690
153,137,2.916058


## **Content-Based Recommender**
    Recommendations made based on attributes of items.

In [9]:
# Attribute based recommender.
def con_attr_recomm(movieId):
    """Recommend up to 20 movies sharing a tag and a genre with ``movieId``.

    Candidates are every movie that carries at least one of the tags attached
    to ``movieId`` in ``tags``; a candidate is kept when it has at least one
    genre in common with ``movieId`` according to ``movies``.

    Parameters
    ----------
    movieId : int
        Identifier of the movie to base the recommendation on.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` (original index preserved), at most 20, ordered by
        the first time each candidate is met while walking the movie's tags.
        The queried movie carries its own tags, so it is normally part of the
        result. An empty frame with the columns of ``movies`` is returned when
        the movie is unknown, has no tags, or no candidate shares a genre.
    """
    target_genres = movies.loc[movies['movieId'] == movieId, 'genres']
    if target_genres.empty:
        # Unknown movie: there are no genres to compare against.
        return movies.iloc[0:0]
    wanted_genres = set(target_genres.iloc[0].split('|'))

    # Collect candidate ids tag by tag, keeping first-seen order and dropping
    # repeats (a movie sharing several tags is still one candidate).
    candidate_ids = []
    for tag in tags.loc[tags['movieId'] == movieId, 'tag']:
        candidate_ids.extend(tags.loc[tags['tag'] == tag, 'movieId'].tolist())
    candidate_ids = list(dict.fromkeys(candidate_ids))

    # One genre lookup table instead of filtering ``movies`` per candidate.
    genres_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['genres']
    kept_ids = [
        movie_id
        for movie_id in candidate_ids
        if movie_id in genres_by_id.index
        and wanted_genres.intersection(genres_by_id.loc[movie_id].split('|'))
    ]
    if not kept_ids:
        return movies.iloc[0:0]

    # Re-order the matching rows so they follow the candidate order.
    rank = {movie_id: position for position, movie_id in enumerate(kept_ids)}
    matched = movies[movies['movieId'].isin(kept_ids)]
    order = np.argsort(matched['movieId'].map(rank).to_numpy(), kind='stable')
    return matched.iloc[order].drop_duplicates().head(20)

In [10]:
con_attr_recomm(106782).head()

Unnamed: 0,movieId,title,genres
8305,106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
3016,4034,Traffic (2000),Crime|Drama|Thriller
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
5610,27020,Gia (1998),Drama|Romance
1430,1953,"French Connection, The (1971)",Action|Crime|Thriller


#### **Jaccard Similarity Content-Based Recommender**

In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [13]:
# Turn each pipe-separated genre string into a list of genre names.
movie_attributes = movies['genres'].str.split('|')
movie_attributes

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

In [14]:
# Build the title-by-genre indicator matrix used for the Jaccard similarity.
# One (title, genre) row per genre of every movie; no hard-coded row count.
genre_pairs = movies[['title']].assign(genre=movie_attributes).explode('genre')

# Kept as a plain list of [title, genre] pairs, as before.
movies_attr = genre_pairs.values.tolist()

# Several distinct movies can share a title, which makes crosstab count a
# genre more than once for that title; clip so every cell is a 0/1 indicator.
movie_vector_matrix = (
    pd.crosstab(genre_pairs['title'], genre_pairs['genre'])
      .clip(upper=1)
)
movie_vector_matrix.head()

1,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
'71 (2014),0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
'Hellboy': The Seeds of Creation (2004),0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
'Round Midnight (1986),0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
'Salem's Lot (2004),0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
'Til There Was You (1997),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [15]:
# Jaccard distance between every pair of rows of the genre matrix
# (condensed, one value per unordered pair).
jaccard_distances = pdist(movie_vector_matrix.to_numpy(), 'jaccard')

# Expand to a square matrix and turn distance into similarity.
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Label both axes with the movie titles.
jaccard_similarity_df = pd.DataFrame(
    jaccard_similarity_array,
    index=movie_vector_matrix.index,
    columns=movie_vector_matrix.index,
)

# Show the first five rows.
jaccard_similarity_df.head()

Unnamed: 0_level_0,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.125,0.2,0.333333,0.2,0.0,0.0,0.25,0.166667,0.0,...,0.4,0.4,0.2,0.2,0.2,0.4,0.4,0.4,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.125,1.0,0.0,0.0,0.0,0.0,0.2,0.0,0.142857,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.166667,0.166667
'Round Midnight (1986),0.2,0.0,1.0,0.2,0.333333,0.0,0.0,0.5,0.25,0.0,...,0.25,0.25,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333
'Salem's Lot (2004),0.333333,0.0,0.2,1.0,0.2,0.0,0.0,0.25,0.166667,0.0,...,0.4,0.75,0.5,0.5,0.2,0.166667,0.166667,0.166667,0.0,0.0
'Til There Was You (1997),0.2,0.0,0.333333,0.2,1.0,0.5,0.0,0.5,0.666667,0.0,...,0.25,0.25,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0


In [16]:
# Create function to extract movie recommendation
def Jaccard_recommender(movie_name):
    """Rank all movies by genre Jaccard similarity to the movie matching ``movie_name``.

    The query and every title are normalised (digits, commas, parentheses and
    apostrophes removed, lower-cased, split on whitespace). The first title in
    ``movies`` that contains every query word is used as the reference movie.

    Parameters
    ----------
    movie_name : str
        Full or partial movie title, e.g. ``"Antonia's Line"``.

    Returns
    -------
    pandas.DataFrame
        Single column (named after the matched title) of
        ``jaccard_similarity_df``, sorted from most to least similar.

    Raises
    ------
    ValueError
        If the query has no usable words or no title contains all of them.
        (Previously the last title in ``movies`` was silently used instead.)
    """
    def _words(text):
        # Raw string: ``\d`` must reach the regex engine, not the string parser.
        return re.sub(r"[\d,)(']", '', text).lower().strip().split()

    # Normalise the query once instead of once per title.
    query_words = _words(movie_name)
    if not query_words:
        raise ValueError(f"No searchable words in movie name {movie_name!r}")

    for title in movies.title:
        title_words = set(_words(title))
        if all(word in title_words for word in query_words):
            return jaccard_similarity_df[[title]].sort_values(by=[title], ascending=False)

    raise ValueError(f"No movie title contains all the words in {movie_name!r}")

In [17]:
Jaccard_recommender('Antonia\'s Line')

Unnamed: 0_level_0,Antonia's Line (Antonia) (1995)
0,Unnamed: 1_level_1
Thumbsucker (2005),1.0
Terms of Endearment (1983),1.0
Alice Adams (1935),1.0
For Love of the Game (1999),1.0
Carnage (2011),1.0
...,...
Silverado (1985),0.0
Helvetica (2007),0.0
Hellsinki (Rööperi) (2009),0.0
Hellraiser: Bloodline (1996),0.0


#### **TF-IDF Content-Based Recommender**

In [18]:
# Build one text "description" per movie by joining its distinct tags.
# - groupby replaces a full scan of `tags` for every movie;
# - dict.fromkeys de-duplicates while keeping first-seen order, so the text is
#   reproducible between runs (set iteration order is not);
# - missing tags are dropped and tags are cast to str so the join cannot fail;
# - rows keep the original ordering (most-tagged movie first).
movie_desc = (
    tags.dropna(subset=['tag'])
        .astype({'tag': str})
        .groupby('movieId')['tag']
        .agg(lambda movie_tags: ' '.join(dict.fromkeys(movie_tags)))
        .reindex(tags['movieId'].value_counts().index, fill_value='')
        .rename_axis(None)
        .to_frame('Description')
)
movie_desc.head()

Unnamed: 0,Description
296,storytelling coke intertwining storylines nonl...
2959,Nudity (Topless) challenging fighting narrated...
924,superb soundtrack music visually appealing cla...
293,assassin Lolita theme Gary Oldman assassinatio...
7361,quirky romantic melancholy dreamlike thought-p...


In [19]:
# TF-IDF vectorizer over the tag descriptions. min_df=5 ignores terms that
# appear in fewer than 5 descriptions; max_df=0.7 ignores terms that appear in
# more than 70% of them.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.7)

# Learn the vocabulary and produce the (sparse) movie-by-term TF-IDF matrix.
vectorized_data = vectorizer.fit_transform(movie_desc['Description'])

# Inspect the vocabulary terms that survived the min_df / max_df filters.
vectorizer.get_feature_names_out()

array(['250', 'aardman', 'abuse', 'acting', 'action', 'actress', 'adam',
       'adolescence', 'adultery', 'adventure', 'africa', 'al',
       'alcoholism', 'aliens', 'alternate', 'america', 'amnesia', 'and',
       'animal', 'animation', 'anime', 'anti', 'apocalyptic', 'appealing',
       'arthur', 'artificial', 'arts', 'assassination', 'astaire',
       'atmospheric', 'australia', 'bad', 'baseball', 'based',
       'beautiful', 'ben', 'best', 'bible', 'big', 'biopic',
       'bittersweet', 'black', 'book', 'boxing', 'brad', 'british',
       'brothers', 'business', 'cerebral', 'characters', 'charles',
       'children', 'christmas', 'christopher', 'cinematography', 'civil',
       'classic', 'clever', 'coen', 'comedy', 'comic', 'commentary',
       'corruption', 'court', 'creepy', 'crime', 'cross', 'cult', 'dance',
       'dark', 'death', 'depressing', 'dialogue', 'dicaprio', 'dickens',
       'disability', 'disney', 'disturbing', 'divorce', 'dogs', 'drag',
       'drama', 'dreamlike

In [20]:
# Densify the TF-IDF matrix into a DataFrame: one row per movie (labelled like
# movie_desc) and one column per vocabulary term.
tfidf_df = pd.DataFrame(
    vectorized_data.toarray(),
    index=movie_desc.index,
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df.head()

Unnamed: 0,250,aardman,abuse,acting,action,actress,adam,adolescence,adultery,adventure,...,wedding,white,will,williams,witty,wizards,world,writing,york,zombies
296,0.064615,0.0,0.0,0.069073,0.117995,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.069073,0.0,0.0,0.069073,0.0,0.0
2959,0.146836,0.0,0.0,0.156969,0.134071,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
924,0.151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293,0.172081,0.0,0.0,0.183956,0.157121,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Add a helper column holding each row's total TF-IDF weight. A total of 0
# means the movie has none of the retained vocabulary terms; such rows are
# filtered out in a later cell.
tfidf_df['check'] = tfidf_df.sum(axis=1)
tfidf_df.head()

Unnamed: 0,250,aardman,abuse,acting,action,actress,adam,adolescence,adultery,adventure,...,white,will,williams,witty,wizards,world,writing,york,zombies,check
296,0.064615,0.0,0.0,0.069073,0.117995,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.069073,0.0,0.0,0.069073,0.0,0.0,6.608299
2959,0.146836,0.0,0.0,0.156969,0.134071,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.970247
924,0.151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.739206
293,0.172081,0.0,0.0,0.183956,0.157121,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.587502
7361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.277232


In [22]:
# Keep only the movies whose total TF-IDF weight is non-zero, then discard the
# helper column.
tfidf_df = tfidf_df.loc[tfidf_df['check'].ne(0)].drop(columns='check')
tfidf_df.head()

Unnamed: 0,250,aardman,abuse,acting,action,actress,adam,adolescence,adultery,adventure,...,wedding,white,will,williams,witty,wizards,world,writing,york,zombies
296,0.064615,0.0,0.0,0.069073,0.117995,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.069073,0.0,0.0,0.069073,0.0,0.0
2959,0.146836,0.0,0.0,0.156969,0.134071,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
924,0.151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293,0.172081,0.0,0.0,0.183956,0.157121,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Pairwise cosine similarity between the TF-IDF rows (one row per movie).
cosine_similarity_array = cosine_similarity(tfidf_df)

# Label both axes with the movie ids taken from tfidf_df's index.
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)

# Show the first five rows.
cosine_similarity_df.head()

Unnamed: 0,296,2959,924,293,7361,1732,4878,260,79132,135536,...,3181,3192,3210,858,3250,3259,3281,3317,830,2719
296,1.0,0.409126,0.220011,0.255846,0.212811,0.570116,0.158007,0.142517,0.17438,0.18806,...,0.0,0.0,0.0,0.063734,0.0,0.0,0.065572,0.069073,0.0,0.0
2959,0.409126,1.0,0.177441,0.230867,0.207996,0.303546,0.431725,0.098861,0.315212,0.031055,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
924,0.220011,0.177441,1.0,0.077608,0.120453,0.224536,0.156709,0.593326,0.190235,0.127697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293,0.255846,0.230867,0.077608,1.0,0.104179,0.176423,0.031084,0.064753,0.060874,0.07448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7361,0.212811,0.207996,0.120453,0.104179,1.0,0.228181,0.454024,0.0,0.321071,0.045169,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
cosine_similarity_df[260].sort_values(ascending=False)

260     1.000000
1200    0.721632
1196    0.720505
4812    0.706099
7649    0.706099
          ...   
6195    0.000000
6181    0.000000
6031    0.000000
6157    0.000000
2719    0.000000
Name: 260, Length: 1179, dtype: float64

## **Collaborative Filtering**

    Collaborative filtering comes in two types:
**Item-Based and User-Based**
    
    Item-based recommenders suggest products whose user ratings are similar to those of products the user already likes, whereas user-based recommenders suggest products that similar users have liked in the past.
    
**Item-based recommendation:**
    
    Used for conservative or fact-based suggestions e.g. Ecommerce store.
    
   **Pros:**
   
    1. Are more consistent over time, because people's preferences change more than items do.
    
    2. Are easier to explain.
    
    3. Can be pre-calculated since item inventory may be static for a longer time. User-based will need constant recalculation to accommodate new users.
    
   **Cons:**
   
    1. Suggestions tend to be obvious, so few unexpected recommendations are made.
    
**User-based recommendation:**

    Useful for finding less popular items a user may like, especially very subjective items such as movies, books or other entertainment.
    
   **Pros:**
   
    1. Create a lot more interesting and unexpected suggestions.
    
   **Cons:**
   
    1. Usually performs worse than item-based recommenders on standard metrics.

#### **Pearson Item-Based Recommender**

In [25]:
# User-by-movie rating matrix: one row per user, one column per movie, NaN
# where the user has not rated the movie.
movie_item = ratings.pivot_table(values='rating', index='userId', columns='movieId')
movie_item.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [26]:
def pearson_item_recomm(movieId_num, nom):
    """Recommend movies whose ratings correlate with those of ``movieId_num``.

    Steps:
    1. Pearson-correlate every other column of ``movie_item`` with the target
       movie's column and keep movies with a coefficient of at least 0.5.
    2. Keep only movies that are close in genre: if the target has a single
       genre the candidate must have it; otherwise the candidate must share at
       least two of the target's genres.

    Parameters
    ----------
    movieId_num : int
        Movie to base the recommendation on; must be a column of ``movie_item``
        (a ``KeyError`` is raised otherwise).
    nom : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` (original index preserved), ordered by decreasing
        correlation, at most ``nom`` rows. Empty (with the columns of
        ``movies``) when nothing passes both filters or the target movie is
        missing from ``movies``.
    """
    # calculate the pearson correlation with the movie of choice.
    # drop() keeps the column order, so ties are broken deterministically by
    # the stable sort below.
    pearson_similar = (
        movie_item.drop(columns=[movieId_num])
                  .corrwith(movie_item[movieId_num])
                  .rename('PearsonCoeff')
                  .sort_values(ascending=False, kind='mergesort')
    )

    # select movies with constrained pearson (NaN coefficients are dropped too).
    pearson_similar = pearson_similar[pearson_similar >= 0.5]

    # further select from the pearson_similarity movies based on similarity of attributes.
    target_row = movies.loc[movies['movieId'] == movieId_num, 'genres']
    if target_row.empty:
        return movies.iloc[0:0]
    target_genres = set(target_row.iloc[0].split('|'))
    # One shared genre is enough for a single-genre target, two otherwise.
    required_overlap = min(2, len(target_genres))

    genres_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['genres']
    kept_ids = [
        movie_id
        for movie_id in pearson_similar.index
        if movie_id in genres_by_id.index
        and len(target_genres.intersection(genres_by_id.loc[movie_id].split('|'))) >= required_overlap
    ]
    if not kept_ids:
        # pd.concat([]) would raise; return an empty, well-formed frame instead.
        return movies.iloc[0:0]

    # Re-order the matching rows by decreasing correlation.
    rank = {movie_id: position for position, movie_id in enumerate(kept_ids)}
    matched = movies[movies['movieId'].isin(kept_ids)]
    order = np.argsort(matched['movieId'].map(rank).to_numpy(), kind='stable')
    return matched.iloc[order].drop_duplicates().head(nom)

In [27]:
movies[movies['movieId'] == 2857]

Unnamed: 0,movieId,title,genres
2144,2857,Yellow Submarine (1968),Adventure|Animation|Comedy|Fantasy|Musical


In [28]:
pearson_item_recomm(2857, 50).head()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,movieId,title,genres
6729,59022,Harold & Kumar Escape from Guantanamo Bay (2008),Adventure|Comedy
2445,3258,Death Becomes Her (1992),Comedy|Fantasy
1556,2093,Return to Oz (1985),Adventure|Children|Fantasy
3093,4153,Down to Earth (2001),Comedy|Fantasy|Romance
732,952,Around the World in 80 Days (1956),Adventure|Comedy


#### **Cosine Similarity Item-Based Recommender**

In [29]:
# User-by-movie rating matrix with unrated cells filled with 0, as needed by
# cosine_similarity (which cannot handle NaN).
movie_itemm = ratings.pivot_table(
    values='rating', index='userId', columns='movieId', fill_value=0
)

# Movie-by-movie cosine similarity of the rating columns, labelled by movieId.
movie_similiarity_matrix = pd.DataFrame(
    cosine_similarity(movie_itemm.T),
    index=movie_itemm.columns,
    columns=movie_itemm.columns,
)
movie_similiarity_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
movie_similiarity_matrix[19].reset_index().rename(columns={19:'Cosine_corr'}).sort_values(by='Cosine_corr', ascending=False).head()

Unnamed: 0,movieId,Cosine_corr
18,19,1.0
302,344,0.60496
325,367,0.514114
197,231,0.502934
1,2,0.49756


In [31]:
# Create the cosine similarity function
def cosine_recomm(movieId_num, nom):
    """Recommend movies whose rating vectors are cosine-similar to ``movieId_num``.

    Steps:
    1. Build the user-by-movie rating matrix from ``ratings`` (unrated = 0) and
       compute the cosine similarity of every movie to the target movie.
    2. Keep movies with similarity of at least 0.3.
    3. Keep only movies that are close in genre: if the target has a single
       genre the candidate must have it; otherwise the candidate must share at
       least two of the target's genres.

    Parameters
    ----------
    movieId_num : int
        Movie to base the recommendation on; must have at least one rating
        (a ``KeyError`` is raised otherwise).
    nom : int
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` (original index preserved), ordered by decreasing
        similarity, at most ``nom`` rows. The target movie is similar to
        itself, so it is normally the first row. Empty (with the columns of
        ``movies``) when nothing passes the filters.
    """
    # Create the user-by-movie matrix; rows of its transpose are movie vectors.
    movie_itemm = ratings.pivot_table(
        values='rating', index='userId', columns='movieId', fill_value=0
    )
    movie_vectors = movie_itemm.T

    # Only the target movie's similarities are needed, so compare all movies
    # against that single vector instead of building the full item x item
    # matrix on every call.
    target_similarity = cosine_similarity(movie_vectors, movie_vectors.loc[[movieId_num]]).ravel()
    movieId_matrix = (
        pd.Series(target_similarity, index=movie_vectors.index, name='Cosine_corr')
          .sort_values(ascending=False, kind='mergesort')
    )

    # select movies based on the constraint below.
    movieId_matrix = movieId_matrix[movieId_matrix >= 0.3]

    # apply content based filtering
    target_row = movies.loc[movies['movieId'] == movieId_num, 'genres']
    if target_row.empty:
        return movies.iloc[0:0]
    target_genres = set(target_row.iloc[0].split('|'))
    # One shared genre is enough for a single-genre target, two otherwise.
    required_overlap = min(2, len(target_genres))

    genres_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['genres']
    kept_ids = [
        movie_id
        for movie_id in movieId_matrix.index
        if movie_id in genres_by_id.index
        and len(target_genres.intersection(genres_by_id.loc[movie_id].split('|'))) >= required_overlap
    ]
    if not kept_ids:
        return movies.iloc[0:0]

    # Re-order the matching rows by decreasing similarity.
    rank = {movie_id: position for position, movie_id in enumerate(kept_ids)}
    matched = movies[movies['movieId'].isin(kept_ids)]
    order = np.argsort(matched['movieId'].map(rank).to_numpy(), kind='stable')
    return matched.iloc[order].drop_duplicates().head(nom)

In [32]:
list(movies[movies['movieId'] == 79008].genres)[0].split('|')

['Animation', 'Comedy', 'Fantasy']

In [33]:
cosine_recomm(79008, 50).head()

Unnamed: 0,movieId,title,genres
7368,79008,South Park: Imaginationland (2008),Animation|Comedy|Fantasy
8339,107953,Dragon Ball Z: Battle of Gods (2013),Action|Animation|Fantasy|IMAX
6778,60161,Futurama: The Beast with a Billion Backs (2008),Action|Animation|Comedy|Romance|Sci-Fi
5637,27369,Daria: Is It Fall Yet? (2000),Animation|Comedy
7530,84637,Gnomeo & Juliet (2011),Adventure|Animation|Children|Comedy|Fantasy|Ro...


#### **KNN USER-Based and ITEM-Based Collaborative Filtering**

In [34]:
# User-by-movie rating matrix (NaN = not rated) used by the KNN predictors below.
movie_item = ratings.pivot_table(values='rating', index='userId', columns='movieId')
movie_item.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [35]:
# Design function for calculating the prediction of a user based on user_recommendation 
def userpred_rating(user, n_neighbors, item):
    """Predict ``user``'s rating of ``item`` from the most similar users.

    Each user's ratings in ``movie_item`` are centred on that user's mean and
    missing ratings are set to 0; users are then compared by cosine similarity.

    Parameters
    ----------
    user : int
        Row label of ``movie_item`` (``KeyError`` if absent).
    n_neighbors : int
        Number of most similar other users to average over.
    item : int
        Column label of ``movie_item`` (``KeyError`` if absent).

    Returns
    -------
    float
        Mean of the raw ratings the neighbours gave ``item``, ignoring
        neighbours who did not rate it; NaN if none of them rated it.
    """
    # Get the average rating for each user
    user_avg_ratings = movie_item.mean(axis=1)

    # Center each user's ratings around 0 and fill the missing data with 0s
    movie_user_normed = movie_item.sub(user_avg_ratings, axis=0).fillna(0)

    # Only the target user's similarities are needed, so compare every user
    # against that single row rather than building the full user x user matrix.
    similarity_to_user = pd.Series(
        cosine_similarity(movie_user_normed, movie_user_normed.loc[[user]]).ravel(),
        index=movie_user_normed.index,
    )

    # Remove the target user explicitly: relying on them sorting to position 0
    # breaks when another user ties at similarity 1.0.
    ordered_similarities = (
        similarity_to_user.drop(index=user)
                          .sort_values(ascending=False, kind='mergesort')
    )

    # Find the n_neighbors most similar users
    nearest_neighbors = ordered_similarities.iloc[:n_neighbors].index

    # Extract the ratings of the neighbors
    neighbor_ratings = movie_item.reindex(nearest_neighbors)

    # Calculate the mean rating given by the user's nearest neighbors
    return neighbor_ratings[item].mean()

In [36]:
userpred_rating(610, 10, 7)

2.75

In [37]:
# Design function for calculating the prediction of a user based on user_recommendation 
def itempred_rating(item, n_neighbors, user):
    """Predict ``user``'s rating of ``item`` from the most similar items.

    Each item's ratings (rows of ``movie_item.T``) are centred on that item's
    mean and missing ratings are set to 0; items are then compared by cosine
    similarity.

    Parameters
    ----------
    item : int
        Column label of ``movie_item`` (``KeyError`` if absent).
    n_neighbors : int
        Number of most similar other items to average over.
    user : int
        Row label of ``movie_item`` (``KeyError`` if absent).

    Returns
    -------
    float
        Mean of the raw ratings ``user`` gave the neighbouring items, ignoring
        neighbours the user did not rate; NaN if the user rated none of them.
    """
    # One row per item, one column per user
    movie_items = movie_item.T

    # Get the average rating for each item
    item_avg_ratings = movie_items.mean(axis=1)

    # Center each item's ratings around 0 and fill the missing data with 0s
    movie_item_normed = movie_items.sub(item_avg_ratings, axis=0).fillna(0)

    # Only the target item's similarities are needed, so compare every item
    # against that single row rather than building the full item x item matrix.
    similarity_to_item = pd.Series(
        cosine_similarity(movie_item_normed, movie_item_normed.loc[[item]]).ravel(),
        index=movie_item_normed.index,
    )

    # Remove the target item explicitly: relying on it sorting to position 0
    # breaks when another item ties at similarity 1.0.
    ordered_similarities = (
        similarity_to_item.drop(index=item)
                          .sort_values(ascending=False, kind='mergesort')
    )

    # Find the n_neighbors most similar items
    nearest_neighbors = ordered_similarities.iloc[:n_neighbors].index

    # Extract the ratings of the neighbors
    neighbor_ratings = movie_items.reindex(nearest_neighbors)

    # Calculate the mean rating the user gave to the item's nearest neighbors
    return neighbor_ratings[user].mean()

In [38]:
itempred_rating(3, 9, 6)

3.5

In [39]:
# alternate function based on the knn model.
def knnuser_predrating(user, n_neighbors, item):
    """Predict ``user``'s rating of ``item`` with a user-based KNN regressor.

    Features are the mean-centred, zero-filled rating rows of ``movie_item``
    with the ``item`` column removed. The regressor is trained on the other
    users who rated ``item`` and queried with the target user's row.

    Parameters
    ----------
    user : int
        Row label of ``movie_item`` (``KeyError`` if absent).
    n_neighbors : int
        Number of neighbours to average; capped at the number of available
        training users.
    item : int
        Column label of ``movie_item`` (``KeyError`` if absent).

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted rating.

    Raises
    ------
    ValueError
        If no other user has rated ``item``.
    """
    # Get the average rating for each user
    user_avg_ratings = movie_item.mean(axis=1)

    # Center each user's ratings around 0 and fill the missing data with 0s
    movie_user_normed = movie_item.sub(user_avg_ratings, axis=0).fillna(0)

    # Drop the item column you are trying to obtain a user's rating for.
    movie_user_normed1 = movie_user_normed.drop(columns=[item])

    # Get the data for the user of interest.
    target_user_x = movie_user_normed1.loc[[user]]

    # Ratings of the item by the users who have seen it. dropna() returns a new
    # Series, so the shared movie_item frame is never modified in place. The
    # target user is excluded so an existing rating of theirs cannot leak in.
    other_users_y = movie_item[item].dropna().drop(index=user, errors='ignore')
    if other_users_y.empty:
        raise ValueError(f"No other user has rated item {item!r}")

    # Feature rows of exactly those users, in the same order as the targets.
    other_users_x = movie_user_normed1.loc[other_users_y.index]

    # Instantiate the user KNN model; never ask for more neighbours than exist.
    user_knn = KNeighborsRegressor(
        n_neighbors=min(n_neighbors, len(other_users_y)), metric='cosine'
    )

    # Fit the model and predict the target user
    user_knn.fit(other_users_x, other_users_y)
    user_user_pred = user_knn.predict(target_user_x)

    return user_user_pred

In [40]:
knnuser_predrating(610, 10, 7)

array([2.65])

In [41]:
# Alternate item-based predictor built on a KNN regressor.
def knnitem_predrating(item, n_neighbors, user):
    """Predict ``user``'s rating of ``item`` with an item-based KNN regressor.

    Every movie is described by its mean-centered ratings (missing ratings
    filled with 0) from all users except ``user``. The regressor is fitted on
    the movies that ``user`` has rated, using those ratings as targets, and is
    queried with the row of ``item``.

    Parameters
    ----------
    item : label in ``movie_item.columns``
        The movie whose rating is predicted.
    n_neighbors : int
        Requested number of neighbors. It is capped at the number of movies
        ``user`` has rated so the regressor can always be queried.
    user : label in ``movie_item.index``
        The user whose rating is predicted.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted rating.

    Raises
    ------
    KeyError
        If ``item`` or ``user`` is not present in ``movie_item``.
    ValueError
        If ``user`` has not rated any movie.

    Notes
    -----
    Reads the notebook-level ``movie_item`` frame (users as rows, movies as
    columns) and never modifies it. If ``user`` has already rated ``item``,
    that movie's own row is part of the training data.
    """
    # Work with movies as rows and users as columns.
    movie_items = movie_item.T

    # Center each movie's ratings around that movie's mean, then treat
    # "not rated" as neutral (0) so cosine distance is always defined.
    item_means = movie_items.mean(axis=1)
    item_features = (
        movie_items.sub(item_means, axis=0)
        .fillna(0)
        # The target user must not be a feature: their rating is predicted.
        .drop(user, axis=1)
    )

    # Feature row for the movie of interest (kept 2-D for scikit-learn).
    target_item_x = item_features.loc[[item]]

    # Train only on movies the user actually rated. Boolean selection builds
    # new objects instead of mutating a column of the transposed frame.
    user_ratings = movie_items[user]
    rated_mask = user_ratings.notna()
    rated_items_x = item_features[rated_mask]
    rated_items_y = user_ratings[rated_mask]

    if rated_items_y.empty:
        raise ValueError(f"User {user!r} has not rated any item; cannot fit the KNN model.")

    # Asking for more neighbors than training rows makes predict() fail.
    effective_neighbors = min(n_neighbors, len(rated_items_y))

    item_knn = KNeighborsRegressor(n_neighbors=effective_neighbors, metric='cosine')
    item_knn.fit(rated_items_x, rated_items_y)

    return item_knn.predict(target_item_x)

In [42]:
# Predict the rating user 6 would give movie 3 from the 9 nearest movies.
knnitem_predrating(item=3, n_neighbors=9, user=6)

array([3.66666667])

### **Singular Value Decomposition (SVD)**

In [43]:
# Display the ratings matrix (userId rows, movieId columns); NaN marks a
# movie the user has not rated.
movie_item

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [44]:
# Mean-center every user's ratings and use 0 for movies they have not rated.
# NOTE: despite its name, `movie_item_centered` holds the per-user mean rating
# (one value per row of movie_item); it is added back after the decomposition.
movie_item_centered = movie_item.mean(axis="columns")

movie_item_norm = (
    movie_item
    .subtract(movie_item_centered, axis="index")
    .fillna(0)
)

movie_item_norm

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.000000,-0.366379,0.0,0.0,-0.366379,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,0.000000,0.000000,0.0,0.0,0.000000,-1.157399,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.213904,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-0.634176,-1.134176,-1.134176,0.0,0.0,0.000000,0.000000,0.0,0.0,0.865824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.270270,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.729730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Truncated SVD of the centered ratings matrix:
#   Matrix1 -> left singular vectors, Matrix2 -> singular values (1-D),
#   Matrix3 -> right singular vectors (already transposed).
# svds works on an ndarray / sparse matrix / LinearOperator, so pass the
# underlying array instead of the DataFrame. k=6 is the svds default, spelled
# out here so the rank of the approximation is visible.
Matrix1, Matrix2, Matrix3 = svds(movie_item_norm.to_numpy(), k=6)
Matrix1

array([[-0.01971204, -0.00559547,  0.00923235,  0.00025015,  0.00805549,
        -0.02440992],
       [ 0.00237743,  0.00095257, -0.00139751, -0.00140941, -0.00273874,
         0.00069979],
       [ 0.00952025,  0.00984733, -0.01217854,  0.00325588, -0.00643685,
         0.00370152],
       ...,
       [ 0.15025037, -0.08386709, -0.16033712, -0.09723141,  0.07664185,
        -0.12897078],
       [-0.0049617 , -0.00286833, -0.00020831, -0.00452691,  0.00173389,
        -0.00379528],
       [ 0.00368165,  0.0912081 , -0.13517713, -0.0639711 , -0.12510036,
        -0.16681129]])

In [46]:
# Singular values returned by svds (still a 1-D array at this point).
Matrix2

array([36.54895519, 37.95619249, 39.37050585, 41.77917206, 43.6224036 ,
       76.20046537])

In [47]:
# Third factor returned by svds: one row per singular value, one column per
# column of movie_item_norm.
Matrix3

array([[-1.67147499e-03, -9.55423162e-03, -3.90058064e-02, ...,
         6.61905229e-06,  6.61905229e-06, -4.57063258e-04],
       [-3.93455329e-02,  6.62574637e-03,  4.72586077e-03, ...,
        -1.98672513e-05, -1.98672513e-05,  1.10523186e-04],
       [ 2.49144795e-02,  6.00042907e-03,  1.50992801e-02, ...,
         3.08304882e-05,  3.08304882e-05, -3.47153639e-05],
       [-1.94504559e-03,  8.86245408e-03,  1.86001244e-02, ...,
        -1.79979278e-05, -1.79979278e-05, -3.14547180e-04],
       [-4.23438184e-02, -1.08013780e-02,  6.78593897e-04, ...,
         7.56160835e-05,  7.56160835e-05, -1.63799332e-04],
       [-4.95781986e-02,  1.13700425e-02,  5.51975836e-03, ...,
         2.71514179e-05,  2.71514179e-05, -1.92034160e-04]])

In [48]:
# Convert the 1-D vector of singular values into a diagonal matrix.
# np.diag applied to a 2-D array extracts its diagonal instead, so only
# convert while Matrix2 is still 1-D; this keeps the cell safe to re-run.
if Matrix2.ndim == 1:
    Matrix2 = np.diag(Matrix2)
Matrix2

array([[36.54895519,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        , 37.95619249,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        , 39.37050585,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        , 41.77917206,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        , 43.6224036 ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        76.20046537]])

In [49]:
# Scale the left singular vectors by the singular values (Matrix1 · Matrix2).
Matrix1_Matrix2 = Matrix1 @ Matrix2
Matrix1_Matrix2

array([[-7.20454492e-01, -2.12382840e-01,  3.63482341e-01,
         1.04508578e-02,  3.51399750e-01, -1.86004692e+00],
       [ 8.68924639e-02,  3.61559914e-02, -5.50205587e-02,
        -5.88841846e-02, -1.19470609e-01,  5.33242153e-02],
       [ 3.47955018e-01,  3.73767090e-01, -4.79475258e-01,
         1.36028000e-01, -2.80791085e-01,  2.82057624e-01],
       ...,
       [ 5.49149406e+00, -3.18327553e+00, -6.31255349e+00,
        -4.06224789e+00,  3.34330177e+00, -9.82763379e+00],
       [-1.81344865e-01, -1.08870996e-01, -8.20114064e-03,
        -1.89130446e-01,  7.56365345e-02, -2.89202309e-01],
       [ 1.34560501e-01,  3.46191239e+00, -5.32199190e+00,
        -2.67265961e+00, -5.45717846e+00, -1.27110978e+01]])

In [50]:
# Multiply by the remaining factor to rebuild the (centered) ratings matrix.
Matrix1_Matrix2_Matrix3 = Matrix1_Matrix2 @ Matrix3
Matrix1_Matrix2_Matrix3

array([[ 9.59343518e-02, -1.71945495e-02,  2.27523739e-02, ...,
        -1.34624568e-05, -1.34624568e-05,  5.89547842e-04],
       [-4.08968238e-04,  4.54110162e-04, -4.93120168e-03, ...,
        -8.36576329e-06, -8.36576329e-06, -5.95822929e-06],
       [-2.95922628e-02,  3.72046555e-03, -1.51491416e-02, ...,
        -3.59273284e-05, -3.59273284e-05, -1.52040897e-04],
       ...,
       [ 3.12364266e-01, -2.95290977e-01, -4.52094620e-01, ...,
        -3.59424467e-05, -3.59424467e-05, -2.52639026e-05],
       [ 1.58856307e-02, -4.81933236e-03,  1.37231199e-03, ...,
         1.98083300e-06,  1.98083300e-06,  1.73775968e-04],
       [ 5.97439189e-01, -1.19549102e-01, -1.92823569e-01, ...,
        -9.41640085e-04, -9.41640085e-04,  4.68139853e-03]])

In [51]:
# Undo the centering: add each user's mean rating (as a column vector, so it
# broadcasts across that user's row) back onto the reconstruction, then label
# the result with the same users and movies as movie_item.
SVD_ratings = Matrix1_Matrix2_Matrix3 + movie_item_centered.to_numpy()[:, np.newaxis]

calc_pred = pd.DataFrame(
    data=SVD_ratings,
    index=movie_item.index,
    columns=movie_item.columns,
)
calc_pred

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.462314,4.349185,4.389132,4.355828,4.304889,4.379954,4.339853,4.353081,4.371022,4.304191,...,4.366366,4.366333,4.366399,4.366399,4.366366,4.366399,4.366366,4.366366,4.366366,4.366969
2,3.947867,3.948730,3.943345,3.948975,3.954781,3.955014,3.950488,3.949590,3.946458,3.960781,...,3.948267,3.948247,3.948288,3.948288,3.948267,3.948288,3.948267,3.948267,3.948267,3.948270
3,2.406305,2.439618,2.420748,2.440964,2.468875,2.461135,2.451152,2.439374,2.424418,2.481790,...,2.435862,2.435774,2.435949,2.435949,2.435862,2.435949,2.435862,2.435862,2.435862,2.435745
4,3.616146,3.556886,3.576326,3.551370,3.500886,3.497739,3.521390,3.555289,3.562753,3.490580,...,3.555621,3.555780,3.555462,3.555462,3.555621,3.555462,3.555621,3.555621,3.555621,3.555394
5,3.653970,3.633074,3.646911,3.631662,3.620151,3.628583,3.628683,3.635113,3.640487,3.596743,...,3.636387,3.636443,3.636330,3.636330,3.636387,3.636330,3.636387,3.636387,3.636387,3.636556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.897556,3.566707,3.688992,3.599346,3.434501,3.736443,3.544275,3.646346,3.650694,3.282278,...,3.657630,3.658191,3.657068,3.657068,3.657630,3.657068,3.657630,3.657630,3.657630,3.659773
607,3.818336,3.773703,3.799590,3.778775,3.742178,3.779247,3.769289,3.778643,3.796693,3.733471,...,3.786128,3.786204,3.786051,3.786051,3.786128,3.786051,3.786128,3.786128,3.786128,3.786412
608,3.446540,2.838885,2.682081,3.117258,2.706645,3.795240,2.909024,3.114491,2.974129,3.601215,...,3.134140,3.134052,3.134227,3.134227,3.134140,3.134227,3.134140,3.134140,3.134140,3.134150
609,3.286156,3.265451,3.271643,3.267343,3.260062,3.277101,3.266075,3.268635,3.272298,3.258482,...,3.270272,3.270277,3.270267,3.270267,3.270272,3.270267,3.270272,3.270272,3.270272,3.270444


In [52]:
# Show the original ratings matrix again (same frame as displayed earlier),
# presumably for comparison with calc_pred.
movie_item

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### **Evaluation**

In [53]:
# Take the same block (first 200 rows, first 3800 columns) from the observed
# ratings and from the SVD predictions so they can be compared element-wise.
actual_values = movie_item.iloc[:200, :3800].to_numpy()
predicted_values = calc_pred.iloc[:200, :3800].to_numpy()

In [54]:
# Boolean mask that is True wherever the ground truth holds an actual rating
# (i.e. is not NaN), so only observed entries are scored.
mask = np.logical_not(np.isnan(actual_values))

In [55]:
# RMSE of the SVD predictions over the observed ratings in the evaluation block.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(actual_values[mask], predicted_values[mask]))

0.898014724749699