### Movie Data Analysis

The dataset has 3 columns: User, Movie and Rating.

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
movies = pd.read_csv("data.txt")
movies

Unnamed: 0,User,Movie,Rating
0,Alice,Star Wars,5
1,Frank,The Godfather,4
2,Bob,Titanic,5
3,Carol,The Matrix,3
4,Dave,Inception,2
5,Emily,Pulp Fiction,4
6,Quincy,Star Wars,5
7,Bob,Star Wars,2
8,Frank,Forrest Gump,3
9,Alice,The Matrix,4


### Data types for columns

In [4]:
movies.dtypes

User      object
Movie     object
Rating    object
dtype: object

In [17]:
user_data = pd.DataFrame(movies.groupby(["User", "Movie"])["Rating"].first())
user_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating
User,Movie,Unnamed: 2_level_1
Alice,Inception,3
Alice,Star Wars,5
Alice,The Matrix,4
Bob,Star Wars,2
Bob,The Godfather,4
Bob,Titanic,5
Carol,Inception,5
Carol,Pulp Fiction,4
Carol,The Matrix,3
Dave,Forrest Gump,5


### Data Cleaning
On the "Rating" column, there is inconsistency with the data, therefore we have to ensure the values are consistent.
1. We will create a new column called "Clean_Rating" with cleaned data.
2. Remove characters from "Clean_Rating" column.
3. Remove user "Victor", to remove "noise" from the data. None of his ratings are >=0.
4. Fill NaN values, convert column data type to float  and reset index.

In [18]:
movies["Clean_Rating"] = movies["Rating"].str.replace("Five", "5", regex=True)
movies["Clean_Rating"] = movies["Clean_Rating"].str.replace(r'[^0-9,.]', '', regex=True)
movies = movies[movies["User"] != "Victor"]
movies["Clean_Rating"] = movies["Clean_Rating"].fillna(0)
movies["Clean_Rating"] = movies["Clean_Rating"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["Clean_Rating"] = movies["Clean_Rating"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["Clean_Rating"] = movies["Clean_Rating"].astype(float)


In [19]:
movies = movies.reset_index(drop=True)

In [20]:
movies

Unnamed: 0,User,Movie,Rating,Clean_Rating
0,Alice,Star Wars,5,5.0
1,Frank,The Godfather,4,4.0
2,Bob,Titanic,5,5.0
3,Carol,The Matrix,3,3.0
4,Dave,Inception,2,2.0
5,Emily,Pulp Fiction,4,4.0
6,Quincy,Star Wars,5,5.0
7,Bob,Star Wars,2,2.0
8,Frank,Forrest Gump,3,3.0
9,Alice,The Matrix,4,4.0


### Create a search algorithm for movies and users


In [34]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["Movie"])

In [35]:
def search_movie(title):
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    movie_results = movies.iloc[indices][::-1]
    return movie_results

In [36]:
search_movie("star wars")

Unnamed: 0,User,Movie,Rating,Clean_Rating
55,Zane,Star Wars,4,4.0
23,Frank,Star Wars,5,5.0
30,Nina,Star Wars,5,5.0
32,Sarah,Star Wars,5x,5.0
6,Quincy,Star Wars,5,5.0


In [49]:
vectorizer2 = TfidfVectorizer(ngram_range=(1,2))
tfidf2 = vectorizer2.fit_transform(movies["User"])

In [50]:
def search_user(user):
    query_vec2 = vectorizer2.transform([user])
    similarity = cosine_similarity(query_vec2, tfidf2).flatten()
    indices = np.argpartition(similarity, -5)[-10:]
    user_results = movies.iloc[indices][::-1]
    return user_results

In [51]:
search_user("zane")

Unnamed: 0,User,Movie,Rating,Clean_Rating
55,Zane,Star Wars,4.0,4.0
47,Zane,Forrest Gump,3.5,3.5
42,Zane,Pulp Fiction,3.5,3.5
22,Bob,The Godfather,4.0,4.0
17,Ivy,Star Wars,5.0,5.0
18,Emily,Forrest Gump,3.0,3.0
16,Carol,Pulp Fiction,4.0,4.0
15,Paul,The Godfather,5.0,5.0
23,Frank,Star Wars,5.0,5.0
19,Quincy,The Matrix,4.0,4.0


In [27]:
def user_recommended_movies(user):
    # user_2 = "Frank
    user_watching = movies[(movies["User"] == user) & (movies["Clean_Rating"] >= 3)]
    ## Similar users with ratings over 3 for same movies user likes
    users_watching_similar_movies = movies[(movies["Movie"].isin(user_watching["Movie"])) & (movies["Clean_Rating"] >= 3)]
    # ## We have to find other movies these users have liked that have ratings of >= 3
    similar_user_recs = movies[(movies["User"].isin(users_watching_similar_movies["User"])) & (movies["Clean_Rating"] >= 3)][["Movie", "User"]]
    user_recs = similar_user_recs[(similar_user_recs["User"] != user)]["Movie"].unique()
    # similar_user_recs
    user_recs_df = pd.DataFrame(user_recs.tolist(), columns=["'s Recommended Movies"])
    user_recs_df = user_recs_df.add_prefix(user)
    
    return user_recs_df

In [28]:
user_recommended_movies("Zane")

Unnamed: 0,Zane's Recommended Movies
0,Star Wars
1,The Godfather
2,The Matrix
3,Pulp Fiction
4,Forrest Gump
5,Inception
6,Titanic


In [41]:
def find_similar_movies(movie):
    #Finding recommendations from similar users
    similar_users = movies[(movies["Movie"] == movie) & (movies["Clean_Rating"] >= 3)]["User"].unique()
    similar_user_recs = movies[(movies["User"].isin(similar_users)) & (movies["Clean_Rating"] >= 3)]["Movie"]
    
    #Only 10% of users
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    movie_recs_df = pd.DataFrame(similar_user_recs)
    movie_recs_df = movie_recs_df.drop(columns="count")
    movie_recs_df.reset_index(inplace=True)
    movie_recs_df = movie_recs_df.rename(columns = {'Movie':'Movies similar to '})
    movie_recs_df = movie_recs_df.add_suffix(movie)
    
    return movie_recs_df[1:]

In [42]:
find_similar_movies("The Godfather")

Unnamed: 0,Movies similar to The Godfather
1,Forrest Gump
2,Titanic
3,Pulp Fiction
4,Star Wars


In [38]:
import ipywidgets as widgets
from IPython.display import display

movie_input_name = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search_movie(title)
            movie = results.iloc[0]["Movie"]
            display(find_similar_movies(movie))

movie_input_name.observe(on_type, names="value")

display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [52]:
user_input_name = widgets.Text(
    value="User Name",
    description="Name of User:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        user = data["new"]
        if len(user) > 2:
            results = search_user(user)
            user = results.iloc[0]["User"]
            display(user_recommended_movies(user))

user_input_name.observe(on_type, names="value")

display(user_input_name, recommendation_list)

Text(value='User Name', description='Name of User:')

Output()