In [1]:
import pandas as pd
import sklearn

In [2]:
movies = pd.read_csv("data.txt")

In [172]:
movie_list = movies["Movie"].unique()
movie_list = movie_list.tolist()
movie_list

['Star Wars',
 'The Godfather',
 'Titanic',
 'The Matrix',
 'Inception',
 'Pulp Fiction',
 'Forrest Gump']

In [173]:
user_list = movies["User"].unique()
user_list = user_list.tolist()
user_list

['Alice',
 'Frank',
 'Bob',
 'Carol',
 'Dave',
 'Emily',
 'Quincy',
 'Paul',
 'Ivy',
 'Karen',
 'Leo',
 'Mia',
 'Nina',
 'Oscar',
 'Sarah',
 'Tom',
 'Victor',
 'Wendy',
 'Zane',
 'Uma',
 'Xander',
 'Yara']

In [8]:
movies["Clean_Rating"] = movies["Rating"].str.replace("Five", "5", regex=True)
movies["Clean_Rating"] = movies["Clean_Rating"].str.replace(r'[^0-9,.]', '', regex=True)
movies = movies[movies["Clean_Rating"] != ""]

In [9]:
movies["Clean_Rating"] = movies["Clean_Rating"].fillna(0)

In [10]:
movies

Unnamed: 0,User,Movie,Rating,Clean_Rating
0,Alice,Star Wars,5,5.0
1,Frank,The Godfather,4,4.0
2,Bob,Titanic,5,5.0
3,Carol,The Matrix,3,3.0
4,Dave,Inception,2,2.0
5,Emily,Pulp Fiction,4,4.0
6,Quincy,Star Wars,5,5.0
7,Bob,Star Wars,2,2.0
8,Frank,Forrest Gump,3,3.0
9,Alice,The Matrix,4,4.0


In [11]:
movies = movies.reset_index(drop=True)
movies

Unnamed: 0,User,Movie,Rating,Clean_Rating
0,Alice,Star Wars,5,5.0
1,Frank,The Godfather,4,4.0
2,Bob,Titanic,5,5.0
3,Carol,The Matrix,3,3.0
4,Dave,Inception,2,2.0
5,Emily,Pulp Fiction,4,4.0
6,Quincy,Star Wars,5,5.0
7,Bob,Star Wars,2,2.0
8,Frank,Forrest Gump,3,3.0
9,Alice,The Matrix,4,4.0


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["Movie"])

In [14]:
def search(title):
    # title = "Fiction"
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [15]:
search("Gump")

Unnamed: 0,User,Movie,Rating,Clean_Rating
25,Dave,Forrest Gump,5.0,5.0
48,Zane,Forrest Gump,3.5,3.5
18,Emily,Forrest Gump,3.0,3.0
8,Frank,Forrest Gump,3.0,3.0
52,Leo,Forrest Gump,5.0,5.0


In [26]:
vectorizer2 = TfidfVectorizer(ngram_range=(1,2))

tfidf2 = vectorizer2.fit_transform(movies["User"])

In [36]:
#Search User
def search_user(user):
    # title = "Fiction"
    query_vec = vectorizer2.transform([user])
    similarity = cosine_similarity(query_vec, tfidf2).flatten()
    indices = np.argpartition(similarity, -5)[-10:]
    results = movies.iloc[indices][::-1]
    return results

In [37]:
search_user("Dave")

Unnamed: 0,User,Movie,Rating,Clean_Rating
4,Dave,Inception,2,2
12,Dave,Titanic,4,4
25,Dave,Forrest Gump,5,5
18,Emily,Forrest Gump,3,3
20,Ivy,The Matrix,4,4
56,Zane,Star Wars,4,4
19,Quincy,The Matrix,4,4
17,Ivy,Star Wars,5,5
16,Carol,Pulp Fiction,4,4
23,Frank,Star Wars,5,5


In [38]:
search_user("Mia")

Unnamed: 0,User,Movie,Rating,Clean_Rating
29,Mia,The Godfather,5,5
36,Mia,Pulp Fiction,4,4
17,Ivy,Star Wars,5,5
53,Mia,Forrest Gump,3,3
16,Carol,Pulp Fiction,4,4
18,Emily,Forrest Gump,3,3
19,Quincy,The Matrix,4,4
15,Paul,The Godfather,5,5
56,Zane,Star Wars,4,4
22,Bob,The Godfather,4,4


In [39]:
search_user("Frank")

Unnamed: 0,User,Movie,Rating,Clean_Rating
1,Frank,The Godfather,4,4
23,Frank,Star Wars,5,5
8,Frank,Forrest Gump,3,3
24,Ivy,Inception,3,3
19,Quincy,The Matrix,4,4
27,Karen,The Matrix,3,3
18,Emily,Forrest Gump,3,3
17,Ivy,Star Wars,5,5
25,Dave,Forrest Gump,5,5
21,Paul,Inception,1,1


In [40]:
similarity

NameError: name 'similarity' is not defined

In [43]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            #to search user, user search_user method
            display(search(title))

movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [45]:
movies["Clean_Rating"] = movies["Clean_Rating"].astype(float)

In [46]:
movies.dtypes

User             object
Movie            object
Rating           object
Clean_Rating    float64
dtype: object

In [82]:
similar_users = movies[(movies["Movie"] == movie) & (movies["Clean_Rating"] >= 3)]["User"].unique()

In [51]:
movie = "Inception"

In [49]:
user_2 = "Carol"

In [84]:
similar_users

array(['Carol', 'Alice', 'Ivy', 'Quincy', 'Karen', 'Sarah', 'Yara'],
      dtype=object)

In [97]:
similar_user_recs = movies[(movies["User"].isin(similar_users)) & (movies["Clean_Rating"] >= 3)]["Movie"]

In [98]:
similar_user_recs

0        Star Wars
3       The Matrix
6        Star Wars
9       The Matrix
11       Inception
14       Inception
16    Pulp Fiction
17       Star Wars
19      The Matrix
20      The Matrix
24       Inception
26       Inception
27      The Matrix
32       Star Wars
34       Inception
39       Inception
45       Inception
48    Pulp Fiction
Name: Movie, dtype: object

In [99]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [100]:
similar_user_recs

Movie
Inception       1.000000
The Matrix      0.714286
Star Wars       0.571429
Pulp Fiction    0.285714
Name: count, dtype: float64

In [101]:
all_users = movies[(movies["Movie"].isin(similar_user_recs.index)) & (movies["Clean_Rating"] >= 3)]

In [102]:
all_users

Unnamed: 0,User,Movie,Rating,Clean_Rating
0,Alice,Star Wars,5,5.0
3,Carol,The Matrix,3,3.0
5,Emily,Pulp Fiction,4,4.0
6,Quincy,Star Wars,5,5.0
9,Alice,The Matrix,4,4.0
11,Carol,Inception,5,5.0
14,Alice,Inception,3,3.0
16,Carol,Pulp Fiction,4,4.0
17,Ivy,Star Wars,5,5.0
19,Quincy,The Matrix,4,4.0


In [112]:
# len(all_users["User"].unique())

all_users["Movie"].value_counts()

Movie
Star Wars       7
The Matrix      7
Pulp Fiction    7
Inception       7
Name: count, dtype: int64

In [103]:
all_user_recs = all_users["Movie"].value_counts() / len(all_users["User"].unique())

In [104]:
all_user_recs

Movie
Star Wars       0.466667
The Matrix      0.466667
Pulp Fiction    0.466667
Inception       0.466667
Name: count, dtype: float64

In [105]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [106]:
rec_percentages

Unnamed: 0_level_0,similar,all
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1
Inception,1.0,0.466667
The Matrix,0.714286,0.466667
Star Wars,0.571429,0.466667
Pulp Fiction,0.285714,0.466667


In [107]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [108]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [109]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Inception,1.0,0.466667,2.142857
The Matrix,0.714286,0.466667,1.530612
Star Wars,0.571429,0.466667,1.22449
Pulp Fiction,0.285714,0.466667,0.612245


In [71]:
def find_similar_movie(movie):
   #Finding recommendations from similar users
   similar_users = movies[(movies["Movie"] == movie) & (movies["Clean_Rating"] >= 3)]["User"].unique()
   similar_user_recs = movies[(movies["User"].isin(similar_users)) & (movies["Clean_Rating"] >= 3)]["Movie"]
   
   #Only 10% of users
   similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
   similar_user_recs = similar_user_recs[similar_user_recs > .10]

   #How common the recommendations were amongst other users
   all_users = movies[(movies["Movie"].isin(similar_user_recs.index)) & (movies["Clean_Rating"] >= 3)]
   all_user_recs = all_users["Movie"].value_counts() / len(all_users["User"].unique())

   rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
   rec_percentages.columns = ["similar", "all"]
    
   rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
   rec_percentages = rec_percentages.sort_values("score", ascending=False)

   return rec_percentages
   

In [72]:
find_similar_movie("Titanic")

Unnamed: 0_level_0,similar,all,score
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Titanic,1.0,0.416667,2.4
The Godfather,0.4,0.5,0.8
Forrest Gump,0.4,0.75,0.533333


In [76]:
movie_input_name = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie = results.iloc[0]["Movie"]
            display(find_similar_movie(movie))

movie_input_name.observe(on_type, names="value")

display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [162]:
def user_recommended_movies(user):
    # user_2 = "Frank
    user_watching = movies[(movies["User"] == user) & (movies["Clean_Rating"] >= 3)]
    ## Similar users with ratings over 3 for same movies user likes
    users_watching_similar_movies = movies[(movies["Movie"].isin(user_watching["Movie"])) & (movies["Clean_Rating"] >= 3)]
    # ## We have to find other movies these users have liked that have ratings of >= 3
    similar_user_recs = movies[(movies["User"].isin(users_watching_similar_movies["User"])) & (movies["Clean_Rating"] >= 3)][["Movie", "User"]]
    user_recs = similar_user_recs[(similar_user_recs["User"] != user)]["Movie"].unique()
    # similar_user_recs
    user_recs_list = user_recs.tolist()
    
    return user_recs_list

In [169]:
user_recommended_movies("Mitch")

[]

In [168]:
user_input_name = widgets.Text(
    value="User Name",
    description="Name of User:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        user = data["new"]
        if len(user) > 2:
            results = search_user(user)
            user = results.iloc[0]["User"]
            display(user_recommended_movies(user))

user_input_name.observe(on_type, names="value")

display(user_input_name, recommendation_list)

Text(value='User Name', description='Name of User:')

Output()