In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw ratings table from the CSV file next to the notebook.
movies = pd.read_csv(filepath_or_buffer="data.txt")

In [3]:
# Show the loaded ratings table.
movies

Unnamed: 0,User,Movie,Rating
0,Alice,Star Wars,5
1,Frank,The Godfather,4
2,Bob,Titanic,5
3,Carol,The Matrix,3
4,Dave,Inception,2
5,Emily,Pulp Fiction,4
6,Quincy,Star Wars,5
7,Bob,Star Wars,2
8,Frank,Forrest Gump,3
9,Alice,The Matrix,4


In [4]:
# Reshape the long (User, Movie, Rating) table into a wide matrix:
# one row per user, one column per movie, NaN where no rating exists.
ratings_by_pair = movies.set_index(["User", "Movie"])["Rating"]
new_df = ratings_by_pair.unstack(level="Movie")

In [5]:
# A missing user/movie combination is represented as a rating of 0.
new_df = new_df.fillna(value=0)

In [6]:
# Show the user-by-movie rating matrix.
new_df

Movie,Forrest Gump,Inception,Matrix,Pulp Fiction,Star Wars,The Godfather,The Matrix,Titanic
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alice,0.0,3,0,0,5,0,4.0,0
Bob,0.0,0,0,0,2,4,0.0,5
Carol,0.0,5,0,4,0,0,3.0,0
Dave,5.0,2,0,0,0,0,0.0,4
Emily,3.0,0,0,4,0,5,0.0,0
Frank,3.0,0,0,0,5,4,0.0,0
Ivy,0.0,3,0,0,5,0,4.0,0
Karen,0.0,5,0,4,0,0,3.0,0
Leo,5.0,2,0,0,0,0,0.0,4
Mia,3.0,0,0,4,0,5,0.0,0


In [7]:
# Normalise ratings that were typed as text before turning them into numbers.
new_df = new_df.replace("Five", "5", regex=True)
# Strip every character that is not a digit, comma or dot.
new_df = new_df.replace(r'[^0-9,.]', '', regex=True)
# Stripping can leave empty or otherwise unparseable strings, which fillna()
# alone never touches. Coerce them to NaN first so they become 0 instead of
# surviving as text in an otherwise numeric table.
new_df = new_df.apply(pd.to_numeric, errors="coerce").fillna(0)

In [8]:
# Remove the stray "Matrix" column ("The Matrix" is kept).
# errors="ignore" keeps this cell re-runnable: new_df is reassigned here, so a
# second execution would otherwise raise KeyError for the already-dropped column.
new_df = new_df.drop(columns="Matrix", errors="ignore")

In [9]:
# Show the rating matrix again to check the remaining movie columns.
new_df

Movie,Forrest Gump,Inception,Pulp Fiction,Star Wars,The Godfather,The Matrix,Titanic
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alice,0.0,3,0.0,5,0,4.0,0
Bob,0.0,0,0.0,2,4,0.0,5
Carol,0.0,5,4.0,0,0,3.0,0
Dave,5.0,2,0.0,0,0,0.0,4
Emily,3.0,0,4.0,0,5,0.0,0
Frank,3.0,0,0.0,5,4,0.0,0
Ivy,0.0,3,0.0,5,0,4.0,0
Karen,0.0,5,4.0,0,0,3.0,0
Leo,5.0,2,0.0,0,0,0.0,4
Mia,3.0,0,4.0,0,5,0.0,0


In [10]:
##Not all movies are rated

def standardize(row):
    """Mean-center a Series of ratings and scale it by its value range.

    Computes ``(row - row.mean()) / (row.max() - row.min())``.

    Parameters
    ----------
    row : pandas.Series
        Numeric ratings. Despite the parameter name, the function simply
        processes whatever Series it is given; ``DataFrame.apply`` with its
        default axis passes one column at a time.

    Returns
    -------
    pandas.Series
        The centered and range-scaled values. When every value is identical
        (range of 0) the centered values, which are all zeros, are returned
        as-is instead of dividing by zero and producing NaN.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    # A constant Series has no spread; dividing would give 0/0 = NaN, and NaN
    # cannot be used in the similarity computation downstream.
    if spread == 0:
        return centered
    return centered / spread

In [11]:
# Work on a copy: a plain assignment would only alias new_df, so the label
# change below would also modify new_df and make this cell behave differently
# when it is run a second time.
new_df_5 = new_df.copy()
# Re-assigning the labels as a plain list drops the "Movie" name from the
# column axis; the "User" index is left untouched.
new_df_5.columns = list(new_df_5.columns)

In [13]:
# Cast each of the seven movie columns to float.
new_df_5 = new_df_5.astype(
    {
        title: "float"
        for title in (
            "Forrest Gump",
            "Inception",
            "Pulp Fiction",
            "Star Wars",
            "The Godfather",
            "The Matrix",
            "Titanic",
        )
    }
)


In [14]:
# Inspect the dtype of every column.
new_df_5.dtypes

Forrest Gump     float64
Inception        float64
Pulp Fiction     float64
Star Wars        float64
The Godfather    float64
The Matrix       float64
Titanic          float64
dtype: object

In [16]:
# axis=0 (the default) hands each column, i.e. one movie's ratings across all
# users, to standardize.
ratings_std = new_df_5.apply(standardize, axis=0)
ratings_std

Unnamed: 0_level_0,Forrest Gump,Inception,Pulp Fiction,Star Wars,The Godfather,The Matrix,Titanic
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alice,-0.322727,0.3,-0.277273,0.654545,-0.245455,0.710227,-0.2
Bob,-0.322727,-0.3,-0.277273,0.054545,0.554545,-0.289773,0.8
Carol,-0.322727,0.7,0.522727,-0.345455,-0.245455,0.460227,-0.2
Dave,0.677273,0.1,-0.277273,-0.345455,-0.245455,-0.289773,0.6
Emily,0.277273,-0.3,0.522727,-0.345455,0.754545,-0.289773,-0.2
Frank,0.277273,-0.3,-0.277273,0.654545,0.554545,-0.289773,-0.2
Ivy,-0.322727,0.3,-0.277273,0.654545,-0.245455,0.710227,-0.2
Karen,-0.322727,0.7,0.522727,-0.345455,-0.245455,0.460227,-0.2
Leo,0.677273,0.1,-0.277273,-0.345455,-0.245455,-0.289773,0.6
Mia,0.277273,-0.3,0.522727,-0.345455,0.754545,-0.289773,-0.2


In [17]:
# Transpose so that every movie becomes a row; the result is a square
# movie-by-movie array of cosine similarities.
movies_as_rows = ratings_std.transpose()
item_similarity = cosine_similarity(movies_as_rows)
item_similarity

array([[ 1.        , -0.39600573, -0.03499441, -0.17443   ,  0.11537951,
        -0.27567356,  0.05412107],
       [-0.39600573,  1.        , -0.07350166,  0.0872966 , -0.42538827,
         0.39190687, -0.1693892 ],
       [-0.03499441, -0.07350166,  1.        , -0.41816268,  0.03048695,
         0.1489731 , -0.39370708],
       [-0.17443   ,  0.0872966 , -0.41816268,  1.        ,  0.01381862,
         0.19055195, -0.19834622],
       [ 0.11537951, -0.42538827,  0.03048695,  0.01381862,  1.        ,
        -0.41044977,  0.15730574],
       [-0.27567356,  0.39190687,  0.1489731 ,  0.19055195, -0.41044977,
         1.        , -0.3636672 ],
       [ 0.05412107, -0.1693892 , -0.39370708, -0.19834622,  0.15730574,
        -0.3636672 ,  1.        ]])

In [18]:
# Label both axes of the similarity array with the movie titles.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=new_df_5.columns,
    columns=new_df_5.columns,
)

In [19]:
# Show the labelled movie-to-movie similarity table.
item_similarity_df

Unnamed: 0,Forrest Gump,Inception,Pulp Fiction,Star Wars,The Godfather,The Matrix,Titanic
Forrest Gump,1.0,-0.396006,-0.034994,-0.17443,0.11538,-0.275674,0.054121
Inception,-0.396006,1.0,-0.073502,0.087297,-0.425388,0.391907,-0.169389
Pulp Fiction,-0.034994,-0.073502,1.0,-0.418163,0.030487,0.148973,-0.393707
Star Wars,-0.17443,0.087297,-0.418163,1.0,0.013819,0.190552,-0.198346
The Godfather,0.11538,-0.425388,0.030487,0.013819,1.0,-0.41045,0.157306
The Matrix,-0.275674,0.391907,0.148973,0.190552,-0.41045,1.0,-0.363667
Titanic,0.054121,-0.169389,-0.393707,-0.198346,0.157306,-0.363667,1.0


In [20]:
def get_similar_movies(movie_name, user_rating, similarity=None, rating_midpoint=2.5):
    """Score every movie by its similarity to one movie the user has rated.

    Each similarity to ``movie_name`` is multiplied by
    ``user_rating - rating_midpoint``. A rating above the midpoint therefore
    ranks similar movies first, and a rating below it flips the sign so that
    similar movies rank last.

    Parameters
    ----------
    movie_name : str
        Column label to look up in the similarity table.
    user_rating : int or float
        The rating the user gave ``movie_name``.
    similarity : pandas.DataFrame, optional
        Movie-by-movie similarity table. Defaults to the notebook-level
        ``item_similarity_df`` so existing two-argument calls keep working.
    rating_midpoint : float, optional
        Rating treated as neutral. Defaults to 2.5, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.Series
        Weighted scores indexed by movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity table.
    """
    if similarity is None:
        # Fall back to the table built earlier in the notebook.
        similarity = item_similarity_df
    weight = user_rating - rating_midpoint
    weighted_scores = similarity[movie_name] * weight
    return weighted_scores.sort_values(ascending=False)

In [21]:
# Example: scores derived from a rating of 1 for "Titanic".
get_similar_movies(movie_name="Titanic", user_rating=1)

Pulp Fiction     0.590561
The Matrix       0.545501
Star Wars        0.297519
Inception        0.254084
Forrest Gump    -0.081182
The Godfather   -0.235959
Titanic         -1.500000
Name: Titanic, dtype: float64

In [22]:
# (movie title, rating) pairs for one example user.
user_1 = [
    ("Titanic", 1),
    ("Pulp Fiction", 4),
    ("Star Wars", 5),
]

In [23]:
# Empty placeholder frame for the per-movie similarity scores.
similar_movies = pd.DataFrame(data=None)

In [24]:
# Build the table in one step: one row per movie the user rated, one column
# per candidate movie. Constructing it from a list of Series avoids the private
# DataFrame._append API, avoids re-copying the frame on every iteration, and
# gives the same result when this cell is run again instead of stacking
# duplicate rows.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in user_1]
)

similar_movies

Unnamed: 0,Pulp Fiction,The Matrix,Star Wars,Inception,Forrest Gump,The Godfather,Titanic
Titanic,0.590561,0.545501,0.297519,0.254084,-0.081182,-0.235959,-1.5
Pulp Fiction,1.5,0.22346,-0.627244,-0.110252,-0.052492,0.04573,-0.590561
Star Wars,-1.045407,0.47638,2.5,0.218242,-0.436075,0.034547,-0.495866
