# Movie recommendation system

# Import libraries

In [1]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display
from ipywidgets import interact, interactive, interactive_output

---

# Import and clean data

In [2]:
# Location of the MovieLens movies table (machine-specific absolute path)
movies_csv_path = r"C:\Users\YourName\OneDrive\เดสก์ท็อป\Whatever\Language\Computer\Portfolio\Movies recommend\ml-25m\movies.csv"
movies = pd.read_csv(movies_csv_path)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Strip the parentheses around the release year ("Toy Story (1995)" -> "Toy Story 1995")
# and trim surrounding whitespace.
# regex=False is spelled out because "(", ")" and "|" are regex metacharacters and the
# default of str.replace's `regex` argument differs between pandas releases; every
# pattern here is meant to be matched literally.
movies["title"] = (
    movies["title"]
    .str.replace("(", "", regex = False)
    .str.replace(")", "", regex = False)
    .str.strip()
)

# Clean genres: turn the pipe-separated list into space-separated words, join the
# hyphenated genre names into single words and shorten the "(no genres listed)" placeholder.
movies["genres"] = (
    movies["genres"]
    .str.replace("|", " ", regex = False)
    .str.replace("Sci-Fi", "SciFi", regex = False)
    .str.replace("(no genres listed)", "no", regex = False)
    .str.replace("Film-Noir", "FilmNoir", regex = False)
)

In [5]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance
3,4,Waiting to Exhale 1995,Comedy Drama Romance
4,5,Father of the Bride Part II 1995,Comedy
...,...,...,...
62418,209157,We 2018,Drama
62419,209159,Window of the Soul 2001,Documentary
62420,209163,Bad Poems 2018,Comedy Drama
62421,209169,A Girl Thing 2001,no


---

# Create Tf-Idf vector for movies

In [6]:
# TF-IDF model over single words and adjacent word pairs (ngram_range 1 to 2)
vectorizer = TfidfVectorizer(ngram_range = (1, 2))

# Fit on the cleaned titles and encode every title in one pass
title_corpus = movies["title"]
movies_vector = vectorizer.fit_transform(title_corpus)

In [7]:
# Worked example: the five titles closest to the query "Men", best match first
title = "Men"
searching_vector = vectorizer.transform([title])
similarity = cosine_similarity(searching_vector, movies_vector).ravel()
# argsort is ascending, so the last five positions are the five best matches
indices = np.argsort(similarity)[-5:]
result = movies.iloc[indices[::-1]]

In [8]:
result

Unnamed: 0,movieId,title,genres
3692,3793,X-Men 2000,Action Adventure SciFi
28489,131824,Men... 1985,Comedy
1126,1154,T-Men 1947,FilmNoir
11003,47484,G Men 1935,Crime Drama
7071,7196,"Men, The 1950",Drama


In [9]:
# Create search function
def search(title, top_n = 5):
    """Return the movies whose titles are most similar to ``title``.

    The query is encoded with the already-fitted ``vectorizer`` and compared with
    every row of ``movies_vector`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, most similar first.
    """
    searching_vector = vectorizer.transform([title])
    similarity = cosine_similarity(searching_vector, movies_vector).flatten()
    # argsort is ascending: reverse it and keep the first ``top_n`` positions.
    # (Slicing with [-top_n:] would return every row when top_n is 0.)
    indices = np.argsort(similarity)[::-1][:top_n]
    return movies.iloc[indices]

---

# Create search box

In [10]:
# Create search box
movie_search_box = widgets.Text(placeholder = "Enter movie name", description = "Movie", disabled = False)

# Create output widget
movie_list = widgets.Output()

def recommend_list(data):
    """Refresh ``movie_list`` with the closest title matches for the typed text.

    ``data`` is the change dictionary delivered by ``observe``; ``data["new"]``
    holds the current content of the text box.
    """
    title = data["new"]
    with movie_list:
        movie_list.clear_output()
        # Search from two characters on, the same gate the recommender input uses.
        # The earlier "> 3" gate silently showed nothing for three-letter titles
        # such as "Men".
        if len(title) >= 2:
            display(search(title))

movie_search_box.observe(recommend_list, names = "value")

display(movie_search_box, movie_list)

Text(value='', description='Movie', placeholder='Enter movie name')

Output()

---

# Create recommendation system

## Similar users

In [11]:
# Load the MovieLens ratings table (machine-specific absolute path)
ratings_csv_path = r"C:\Users\YourName\OneDrive\เดสก์ท็อป\Whatever\Language\Computer\Portfolio\Movies recommend\ml-25m\ratings.csv"
movies_rating = pd.read_csv(ratings_csv_path)

In [12]:
movies_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [13]:
# movieId of the reference movie used by the worked example in the cells below
movie_id = 88129

In [14]:
# Users who rated the reference movie above 4 -- treated as users with similar taste
liked_reference = movies_rating[(movies_rating["movieId"] == movie_id) & (movies_rating["rating"] > 4)]
similar_user_like = liked_reference["userId"].unique()

similar_user_like

array([    37,     38,    153, ..., 162428, 162498, 162533], dtype=int64)

In [15]:
# Every rating above 4 given by the similar users, i.e. the movies they also liked
from_similar_user = movies_rating["userId"].isin(similar_user_like)
rated_above_four = movies_rating["rating"] > 4
similar_user_rec = movies_rating[from_similar_user & rated_above_four]

similar_user_rec

Unnamed: 0,userId,movieId,rating,timestamp
5209,37,32,4.5,1456485765
5210,37,47,4.5,1456485246
5211,37,50,5.0,1456485085
5212,37,97,4.5,1456485407
5213,37,110,4.5,1456485236
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


In [16]:
# Fraction of the similar users who liked each movie
n_similar_users = similar_user_rec["userId"].nunique()
percent_rec = similar_user_rec["movieId"].value_counts().div(n_similar_users)

# Keep only the movies liked by more than 10% of the similar users
percent_rec = percent_rec.loc[percent_rec.gt(0.1)]

percent_rec

movieId
88129     1.000000
296       0.605942
2959      0.596474
79132     0.511590
58559     0.505387
            ...   
1387      0.101534
44665     0.101534
168250    0.101534
4306      0.101208
85414     0.100229
Name: count, Length: 243, dtype: float64

---

## Other users

In [17]:
# Ratings above 4 from any user, restricted to the candidate movies kept in percent_rec
is_candidate_movie = movies_rating["movieId"].isin(percent_rec.index)
all_user = movies_rating[is_candidate_movie & (movies_rating["rating"] > 4)]

all_user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
23,1,3949,5.0,1147868678
29,1,4973,4.5,1147869080
37,1,6016,5.0,1147869090
41,1,6711,5.0,1147868622
...,...,...,...,...
25000077,162541,7147,4.5,1240952343
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484
25000086,162541,31658,4.5,1240953287


In [18]:
# Fraction of those users who liked each candidate movie
n_all_users = all_user["userId"].nunique()
all_user_rec = all_user["movieId"].value_counts().div(n_all_users)

all_user_rec

movieId
318       0.339560
296       0.282461
2571      0.242137
356       0.233437
593       0.224153
            ...   
93840     0.011854
139644    0.009035
96829     0.008989
55118     0.008036
81932     0.007767
Name: count, Length: 243, dtype: float64

---

## User like score

In [19]:
# Score = (fraction of similar users who liked the movie) / (fraction of all users who liked it)
rec_percentage = pd.concat([percent_rec, all_user_rec], axis = 1).set_axis(["similar user", "all user"], axis = 1)
rec_percentage["percent recommend"] = rec_percentage["similar user"].div(rec_percentage["all user"])

rec_percentage.sort_values("percent recommend", ascending = False).head(50)

Unnamed: 0_level_0,similar user,all user,percent recommend
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
88129,1.0,0.020126,49.686908
139644,0.150833,0.009035,16.694801
115569,0.255632,0.017005,15.032786
81932,0.10382,0.007767,13.367544
55118,0.102187,0.008036,12.716273
104879,0.153444,0.012294,12.481479
112183,0.198498,0.016111,12.320408
51540,0.157689,0.013181,11.963498
96829,0.107085,0.008989,11.913235
176371,0.156709,0.013306,11.777638


---

## Genres score

In [20]:
# Create TfidfVectorizer for genres (default settings); it is fitted inside
# Genres_score on the genres of the candidate movies passed to each call
vectorizer_genres = TfidfVectorizer()

In [21]:
def Genres_score(search_genres, num = 10):
    """Rank candidate movies by user score weighted with genre similarity.

    The first row of ``search_genres`` is taken as the reference movie. Each row's
    ``genres`` text is compared with the reference row's by TF-IDF cosine similarity.

    Parameters
    ----------
    search_genres : pandas.DataFrame
        Candidates with at least the columns ``"genres"`` and ``"score"``.
    num : int, default 10
        Number of most genre-similar rows to keep.

    Returns
    -------
    pandas.DataFrame
        The kept rows plus ``"genres_score"`` (cosine similarity * 100) and
        ``"total score"`` (``score * genres_score``), sorted by ``"total score"``
        in descending order.
    """
    if search_genres.empty:
        # No candidates: there is no reference row to read and nothing to fit the
        # vectorizer on, so return an empty frame with the expected columns.
        return search_genres.assign(**{
            "genres_score": pd.Series(dtype = "float64"),
            "total score": pd.Series(dtype = "float64"),
        })

    genres_vector = vectorizer_genres.fit_transform(search_genres["genres"])
    genres = search_genres.iloc[0]["genres"]
    searching_genres_vector = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(searching_genres_vector, genres_vector).flatten()
    # argsort is ascending: reverse it and keep the ``num`` most similar positions.
    indices = np.argsort(similarity)[::-1][:num]
    # The original read from an undefined name ``a`` here; the rows come from the input frame.
    result_genres = search_genres.iloc[indices]

    genres_score = pd.DataFrame(data = similarity[indices] * 100, index = result_genres.index, columns = ["genres_score"])
    x = pd.concat([result_genres, genres_score], axis = 1)
    x["total score"] = x["score"] * x["genres_score"]
    return x.sort_values("total score", ascending = False)

---

# Create movie recommendation system

In [22]:
# Create search function
def search(title, top_n = 5):
    """Return the movies whose titles are most similar to ``title``.

    The query is encoded with the already-fitted ``vectorizer`` and compared with
    every row of ``movies_vector`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, most similar first.
    """
    searching_vector = vectorizer.transform([title])
    similarity = cosine_similarity(searching_vector, movies_vector).flatten()
    # argsort is ascending: reverse it and keep the first ``top_n`` positions.
    # (Slicing with [-top_n:] would return every row when top_n is 0.)
    indices = np.argsort(similarity)[::-1][:top_n]
    return movies.iloc[indices]

In [23]:
def Genres_score(search_genres, num):
    """Rank candidate movies by user score weighted with genre similarity.

    The first row of ``search_genres`` is taken as the reference movie. Each row's
    ``genres`` text is compared with the reference row's by TF-IDF cosine similarity.

    Parameters
    ----------
    search_genres : pandas.DataFrame
        Candidates with at least the columns ``"genres"`` and ``"score"``.
    num : int
        Number of most genre-similar rows to keep.

    Returns
    -------
    pandas.DataFrame
        The kept rows plus ``"genres_score"`` (cosine similarity * 100) and
        ``"total score"`` (``score * genres_score``), sorted by ``"total score"``
        in descending order.
    """
    if search_genres.empty:
        # No candidates: there is no reference row to read and nothing to fit the
        # vectorizer on, so return an empty frame with the expected columns.
        return search_genres.assign(**{
            "genres_score": pd.Series(dtype = "float64"),
            "total score": pd.Series(dtype = "float64"),
        })

    genres_vector = vectorizer_genres.fit_transform(search_genres["genres"])
    genres = search_genres.iloc[0]["genres"]
    searching_genres_vector = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(searching_genres_vector, genres_vector).flatten()
    # argsort is ascending: reverse it and keep the ``num`` most similar positions.
    # (Slicing with [-num:] would keep every row when num is 0.)
    indices = np.argsort(similarity)[::-1][:num]
    result_genres = search_genres.iloc[indices]

    genres_score = pd.DataFrame(data = similarity[indices] * 100, index = result_genres.index, columns = ["genres_score"])

    y = pd.concat([result_genres, genres_score], axis = 1)
    y["total score"] = y["score"] * y["genres_score"]
    return y.sort_values("total score", ascending = False)

In [24]:
def recommend_movies(movie_id, num):
    """Recommend movies liked by the users who liked ``movie_id``.

    Steps:
      1. Find the users who rated ``movie_id`` above 4 ("similar users").
      2. For every movie, compute the fraction of similar users who rated it above 4
         and keep the movies above 10%.
      3. Compute the same fraction over all users who liked any kept movie.
      4. Score each movie as (similar-user fraction) / (all-user fraction) and keep
         the ``num`` best.
      5. Re-rank those with ``Genres_score``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the reference movie.
    num : int
        Number of recommendations wanted.

    Returns
    -------
    pandas.DataFrame
        Columns ``"title"`` and ``"genres"``, indexed from 1 in ranking order.
        Empty when nobody rated ``movie_id`` above 4.
    """
    # Computed once and reused: the mask spans the whole ratings table.
    liked = movies_rating["rating"] > 4

    similar_user_like = movies_rating.loc[(movies_rating["movieId"] == movie_id) & liked, "userId"].unique()
    if len(similar_user_like) == 0:
        # Without any similar user every fraction below would be computed over
        # empty data and the genre ranking would fail on an empty frame.
        return pd.DataFrame(columns = ["title", "genres"])

    similar_user_rec = movies_rating.loc[movies_rating["userId"].isin(similar_user_like) & liked]

    percent_rec = similar_user_rec["movieId"].value_counts() / similar_user_rec["userId"].nunique()
    percent_rec = percent_rec[percent_rec > 0.1]

    all_user = movies_rating.loc[movies_rating["movieId"].isin(percent_rec.index) & liked]
    all_user_rec = all_user["movieId"].value_counts() / all_user["userId"].nunique()

    rec_percentage = pd.concat([percent_rec, all_user_rec], axis = 1)
    rec_percentage.columns = ["similar user", "all user"]
    rec_percentage["score"] = rec_percentage["similar user"] / rec_percentage["all user"]
    rec_percentage = rec_percentage.sort_values("score", ascending = False).head(num)

    # Attach title and genres; the row order of the score ranking is kept, so the
    # best-scoring movie is the first row handed to Genres_score.
    candidates = rec_percentage.merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]
    rec_movies = Genres_score(candidates, num)
    rec_movies = rec_movies[["title", "genres"]].reset_index(drop = True)
    rec_movies.index += 1

    return rec_movies

In [25]:
# Create input widgets
# continuous_update = False: each recommendation filters the whole ratings table,
# so recompute only when the text is submitted (Enter / focus lost) or the slider
# is released, not on every keystroke or slider tick.
movie_input = widgets.Text(placeholder = "Enter movie and release year", description = "Movie tiltle :", disabled = False, continuous_update = False)
movies_num = widgets.IntSlider(value = 10, min = 10, max = 20, step = 1, description = "How many", continuous_update = False)

# Create output widget
movies_output = widgets.Output()

# Define the callback
def evaluate(movie, num):
    """Show ``num`` recommendations for the movie that best matches ``movie``.

    Parameters
    ----------
    movie : str
        Current text of the movie input box.
    num : int
        Current value of the "How many" slider.
    """
    with movies_output:
        movies_output.clear_output()
        if len(movie) >= 2:
            # The first row returned by search() is the closest title match.
            best_match = search(movie).iloc[0]
            display(recommend_movies(best_match["movieId"], num))

show = interactive(evaluate, movie = movie_input, num = movies_num)

display(show, movies_output)

interactive(children=(Text(value='', description='Movie tiltle :', placeholder='Enter movie and release year')…

Output()