In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # a transformer that will transform text to numerics
from sklearn.metrics.pairwise import cosine_similarity # will return an array of values that shows the similarity between movies
import difflib # it will returned  most similar keywords from the list of all keywords

In [2]:
movies_df = pd.read_csv(r"C:\Users\Rotimi\Downloads\movies.csv")

In [4]:
pd.set_option("display.max_columns",5000)

In [5]:
movies_df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


### checking the shape of the Dataframe

In [42]:
movies_df.shape # 4803 rows and 24 columns

(4803, 24)

In [7]:
movies_df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

##  Data Preparation and Data preprocessing

Let's select the best  features to work with, these features will be the important features we need to identify from our feature extraction and they can help with identifying movies that are similar

In [8]:
select_features = ["genres", "keywords", "cast", "tagline", "director"]

Recall, This is a recommendation system algorithm. What we are interested in identyfing is how we can users recommend movies 
based on :
*1. movies they select , content*
*2.based on popularity*
*3.based on the group they identified with , the people they work with there.*
*4.The past record of movies they like, ideally--Netflix recommend movies to you based on the this strategy.*

## Dealing with missing values

As part of our Data preprocessing, our data will contain missing values , so lets with missing values from the selected columns

In [11]:
for col in select_features:
    movies_df[col] = movies_df[col].fillna("") 

Checking if there are stll missing values in the Data frame

In [23]:
movies_df[select_features].isna().sum() # There are no missing values in the  Dataframe

genres      0
keywords    0
cast        0
tagline     0
director    0
dtype: int64

### Combinning the columns in the Dataframe to text

In [9]:
# so we need to combine these columns together

In [34]:
combined_features = movies_df["genres"] + " " + movies_df["keywords"] + " " +movies_df["cast"] + " "+ movies_df["tagline"] + movies_df["director"]

We have the combined columns in text , Next is to convert it into numeric using the TfidVectorizer transformer

vectorizer = TfidfVectorizer()
vectorized_features = vectorizer.fit_transform(combined_features)

What we do next , is to use our cosine_similarity to get our confidence intervals in the movie. ie to get the similarity values between the movies 

In [72]:
# The similarity value is computed for all 4803 movies individually , hence the reason for 4803 * 4803 shape.

similarity_values = cosine_similarity(vectorized_features)

So we have the similarity computed for all movies, lets store all the names of movies in a column . This will help 
us identify movies in the dataset and help select the closest one selected by users

In [50]:
movies_list = movies_df["title"].to_list()

From the user's part, lets define a function that will take input as movie from users ,the library  difflib  will help return similary movies based on the one enter by user

In [56]:
subscriber_movie = input("what is the name of the movie you will like to watch ? ")

what is the name of the movie you will like to watch ? iron man


In [61]:
close_match = difflib.get_close_matches(subscriber_movie , movies_list)
closest_match = close_match[0]

In [64]:
#Lets find, the index which that movie is located in the dataframe
index_movie = movies_df[movies_df["title"] == closest_match]["index"].values[0] 

In [76]:
# This will returned all the similarity values of movies (a list of tuples with firs value index location and second the value itself) sorted by the index

subscriber_similarity_value = list(enumerate(similarity_values[index_movie])) # To select the similarity value of the movie in index 68 compared to others

In [83]:
# we can sort it based on the similarity value itself, this will returned the list in an reverse othe, the highest similarity movi
#will come first

sorted_similarity_value = sorted(subscriber_similarity_value, key = lambda tup : tup[1] , reverse = True)

In [92]:
print("Movies recommended for you are :", "\n")

ind = 0

for similarity_score in sorted_similarity_value:
    title_of_movie = movies_df[movies_df["index"] == similarity_score[0]]["title"].values[0]
    if ind < 20 : 
        print(ind , title_of_movie)
        ind = ind + 1

Movies recommended for you are : 

0 Iron Man
1 Iron Man 2
2 Iron Man 3
3 Avengers: Age of Ultron
4 The Avengers
5 Captain America: Civil War
6 Captain America: The Winter Soldier
7 Ant-Man
8 X-Men
9 Made
10 X-Men: Apocalypse
11 X2
12 The Incredible Hulk
13 The Helix... Loaded
14 X-Men: First Class
15 X-Men: Days of Future Past
16 Guardians of the Galaxy
17 Kick-Ass 2
18 Deadpool
19 Thor: The Dark World


In [88]:
# We can also use a function to help us print out the movies name

def select_first_20():
    movie_count = 0
    
    for ind_value in sorted_similarity_value:
        if movie_count < 20 :
            print(movies_list[ind_value[0]])
            movie_count = movie_count + 1

In [89]:
select_first_20()

Iron Man
Iron Man 2
Iron Man 3
Avengers: Age of Ultron
The Avengers
Captain America: Civil War
Captain America: The Winter Soldier
Ant-Man
X-Men
Made
X-Men: Apocalypse
X2
The Incredible Hulk
The Helix... Loaded
X-Men: First Class
X-Men: Days of Future Past
Guardians of the Galaxy
Kick-Ass 2
Deadpool
Thor: The Dark World


In [90]:
# Now lets build a pipeline that takess movies name  from user and return list of movies based on the similarity scores 
# of movie

In [None]:
subscriber_movie = input("what is the name of the movie you will like to watch ? ")
close_match = difflib.get_close_matches(subscriber_movie , movies_list)
closest_match = close_match[0]
index_movie = movies_df[movies_df["title"] == closest_match]["index"].values[0] 
subscriber_similarity_value = list(enumerate(similarity_values[index_movie]))
sorted_similarity_value = sorted(subscriber_similarity_value, key = lambda tup : tup[1] , reverse = True)
print("Movies recommended for you are :", "\n")

ind = 0

for similarity_score in sorted_similarity_value:
    title_of_movie = movies_df[movies_df["index"] == similarity_score[0]]["title"].values[0]
    if ind < 20 : 
        print(ind , title_of_movie)
        ind = ind + 1

Now , Lets also include a try and except code that will take care of movies whose keywords are not  closer to the movies in the dataset?

In [120]:
subscriber_movie = input("what is the name of the movie you will like to watch ? ")
close_match = difflib.get_close_matches(subscriber_movie , movies_list)
try :
    closest_match = close_match[0]
    
except IndexError :
    closest_match = "Avatar"


index_movie = movies_df[movies_df["title"] == closest_match]["index"].values[0] 
subscriber_similarity_value = list(enumerate(similarity_values[index_movie]))
sorted_similarity_value = sorted(subscriber_similarity_value, key = lambda tup : tup[1] , reverse = True)
print("Movies recommended for you are :", "\n")

ind = 0

for similarity_score in sorted_similarity_value:
    title_of_movie = movies_df[movies_df["index"] == similarity_score[0]]["title"].values[0]
    if ind < 20 : 
        print(ind , title_of_movie)
        ind = ind + 1

what is the name of the movie you will like to watch ? world war
Movies recommended for you are : 

0 World War Z
1 The Sum of All Fears
2 Land of the Dead
3 Sabotage
4 The Helix... Loaded
5 Re-Kill
6 Warm Bodies
7 Maggie
8 If I Stay
9 Shame
10 No Escape
11 The Walk
12 And So It Goes
13 The Lone Ranger
14 13 Hours: The Secret Soldiers of Benghazi
15 Halloween II
16 The Last Days on Mars
17 Resident Evil
18 Superman IV: The Quest for Peace
19 Everything Put Together
