## Importing required libraries

In [136]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pandas_profiling import ProfileReport
import re 
import warnings
warnings.filterwarnings("ignore")

## Loading and viewing data

In [137]:
met_small = pd.read_csv('Data/movies_metadata.csv')

In [138]:
met_small.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Explantory Data Analysis

In [139]:
# Extract the columns of interest and replace the NaN with blank string

movies_data = met_small[['title','genres','overview','production_companies','original_language','status','vote_average']].replace(np.nan, '', regex=True)

In [140]:
#ProfileReport(movie_dataset)

## Data Cleaning

In [169]:
#Remove movies with less than 5 imdb rating and langauges other than english and hindi

#First replace Empty values "" with 0
movies_data['vote_average']=movies_data['vote_average'].replace("",0)

#movies with rating more than 5 will be included in the list
movie_more5 = movies_data[movies_data['vote_average']>5]

#Only released movies to be included in the list
movie_rel = movie_more5[movie_more5['status']=='Released']

#Convert the title to lowercase
movie_rel['title'] = movie_rel['title'].str.lower()

#Reset index after removal of rows
movie_dataset = movie_rel.reset_index().drop(columns='index',axis=1)

In [170]:
#Define a function to combine all strings

def combine_features(row):
    try:
        return row['title'] +2*(" "+row["genres"])+" "+row["overview"]+" "+row["production_companies"]+3*(" "+row["original_language"])
    except:
        print("Error:",row)

In [171]:
#Combine title, genre, overview and production companies
#Thus a user will be recommended movies that are similar in genre, overiew and production company

movie_dataset["combined_features"] = movie_dataset.apply(combine_features,axis=1)

#Convert the string to lower case
movie_dataset["combined_features"]= movie_dataset["combined_features"].str.lower()

#Lets view one of the combine feature
movie_dataset["combined_features"][0]

"toy story [{'id': 16, 'name': 'animation'}, {'id': 35, 'name': 'comedy'}, {'id': 10751, 'name': 'family'}] [{'id': 16, 'name': 'animation'}, {'id': 35, 'name': 'comedy'}, {'id': 10751, 'name': 'family'}] led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences. [{'name': 'pixar animation studios', 'id': 3}] en en en"

In [172]:
#Lets remove the unwanted parentheses, id, name, punctuation

movie_dataset.astype(str)['combined_features'].map(lambda x:  type(x))

replace_val = ["{'id'","'name':","'},","[","]","'",",","}]","{","}",".",":","'",'[0-9]','id','--']

for i in replace_val:
  movie_dataset["combined_features"] = movie_dataset["combined_features"].str.replace(i, '')

for i in range(len(movie_dataset)):
    movie_dataset["combined_features"][i] = re.sub(' +', ' ', movie_dataset["combined_features"][i]) 

movie_dataset["combined_features"][0]

'toy story animation comedy family animation comedy family led by woody andys toys live happily in his room until andys birthday brings buzz lightyear onto the scene afra of losing his place in andys heart woody plots against buzz but when circumstances separate buzz and woody from their owner the duo eventually learns to put ase their differences pixar animation studios en en en'

In [173]:
#Copy movie dataset to df
df= movie_dataset.copy(deep=True)

#Create a column named index
df['index'] = df.index

## Save the Pre=processed dataset

In [52]:
df.to_csv("Appdata/Movie_dataset.csv")

## Create model and predict

In [27]:
#Create count matrix from this new combined column

cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])

#Compute the Cosine Similarity based on the count_matrix

cosine_sim = cosine_similarity(count_matrix) 
movie_user_likes = "Avatar"

In [15]:
#Helper functions to obtain index of movie from its title

def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [17]:
#Get index of this movie from its title

movie_index = get_index_from_title(movie_user_likes)

similar_movies =  list(enumerate(cosine_sim[movie_index]))


#Get a list of similar movies in descending order of similarity score

sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)

# Print titles of first 5 movies
i=0
for element in sorted_similar_movies:
  if get_title_from_index(element[0])==movie_user_likes:
    pass
  else:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>5:
      break

Avatar 2
Dragonball Evolution
Rise of the Planet of the Apes
X-Men: The Last Stand
Eragon
X-Men Origins: Wolverine
