In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


In [None]:
# Load both TMDB tables (movie metadata first, credits second)
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('data/tmdb_5000_movies.csv', 'data/tmdb_5000_credits.csv')
)

In [None]:
movies.head(2)

In [None]:
credits.head(2)

In [None]:
movies.shape

In [None]:
credits.shape


In [None]:
# Merge the two tables on the movie id instead of the title: titles are not
# unique, so a title join pairs every same-titled movie with every same-titled
# credits row and produces spurious extra rows.
# NOTE(review): assumes `movies` exposes an `id` column that matches
# `credits['movie_id']` - confirm against the CSV headers.
# `title` is dropped from `credits` first so the result keeps a single,
# unsuffixed `title` column for the cells that follow.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [None]:
movies.head(2)

In [None]:
movies.shape

In [None]:
movies['original_language'].value_counts()

In [None]:
movies.columns

In [None]:
# Keep only the columns used to build the recommendation tags.
# .copy() makes this an independent frame, so the later drops and column
# assignments do not operate on a slice of the merged table
# (which triggers SettingWithCopyWarning).
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
movies.head(2)

In [None]:
movies.shape

In [None]:
#checking missing value
movies.isnull().sum()

In [None]:
# Drop rows with missing values and renumber the index 0..n-1.
# The renumbering matters: recommend() looks a movie up by its index label and
# then uses that value as a row position in the similarity matrix, so labels
# with gaps would point at the wrong row (or past the end).
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum()

In [None]:
movies.shape

In [None]:
#checking duplicate values
movies.duplicated().sum()

In [None]:
# handle genres
movies.iloc[0]['genres']

In [None]:
import ast  # parses the stringified lists stored in the CSV columns

def convert(text):
    """Parse a Python-literal string holding a sequence of dicts and return
    the ``'name'`` value of every entry, in order."""
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
# Turn each stringified genre list into a plain list of genre names
movies['genres'] = movies['genres'].map(convert)

In [None]:
movies.head(2)

In [None]:
# Turn each stringified keyword list into a plain list of keyword names
movies['keywords'] = movies['keywords'].map(convert)

In [None]:
movies.head(2)

In [None]:
movies.iloc[0]['cast']

In [None]:
def convert_cast(text, limit=3):
    """Return the names of the first ``limit`` entries of a serialized cast list.

    Parameters
    ----------
    text : str
        Python-literal string (parsed with ``ast.literal_eval``) holding a
        sequence of dicts, each with a ``'name'`` key.
    limit : int, default 3
        Maximum number of names to keep, in their original order. Values
        below zero are treated as zero.

    Returns
    -------
    list
        Up to ``limit`` names; shorter when the cast has fewer entries.
    """
    # Slice before extracting names so entries past the limit are never visited.
    leading_members = ast.literal_eval(text)[:max(limit, 0)]
    return [member['name'] for member in leading_members]

In [None]:
# Keep only the leading cast names for every movie
movies['cast'] = movies['cast'].map(convert_cast)
movies.head(2)

In [None]:
movies.iloc[0]['crew']

In [None]:
def fetch_director(text):
    """Parse a Python-literal crew list and return a one-element list with the
    name of the first entry whose ``'job'`` is ``'Director'``; return an empty
    list when no entry matches."""
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        # First director found: stop scanning immediately.
        return [member['name']]
    return []

In [None]:
# Reduce the crew column to the director's name (as a list)
movies['crew'] = movies['crew'].map(fetch_director)
movies.head(2)

In [None]:
# handle overview (converting to list)

movies.iloc[0]['overview']

In [None]:
# Break each overview string into a list of whitespace-separated words
movies['overview'] = movies['overview'].map(str.split)
movies.head(2)

In [None]:
# Collapse multi-word entries into single tokens,
# e.g. 'Sam Worthington' -> 'SamWorthington'

def remove_space(word):
    """Return a new list in which every space has been removed from each
    string of ``word``."""
    return [item.replace(" ", "") for item in word]


In [None]:
# Strip inner spaces so multi-word names/genres/keywords become single tokens.
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
# Write back to 'genres' (not a new 'genre' column): the tags built below read
# 'genres', so the cleaned values must replace the originals.
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

In [None]:
movies.head(2)

In [None]:
# Concatenate the per-movie token lists into a single 'tags' list
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [None]:
movies.head(2)

In [None]:
movies.iloc[0]['tags']

In [None]:
# Keep only the id, title and combined tags.
# .copy() gives an independent frame so the tag transformations in the cells
# below assign into it without SettingWithCopyWarning.
new_df = movies[['movie_id','title','tags']].copy()

In [None]:
new_df.head(2)

In [None]:
# Flatten each token list into one space-separated string
new_df['tags'] = new_df['tags'].map(" ".join)
new_df.head(2)

In [None]:
new_df.iloc[0]['tags']

In [None]:
# Normalise the tag strings to lower case
new_df['tags'] = new_df['tags'].map(str.lower)
new_df.head(2)

In [None]:
new_df.iloc[0]['tags']

In [None]:
import nltk
from nltk.stem import PorterStemmer

In [None]:
# Single stemmer instance, reused for every word by stems().
ps = PorterStemmer()

In [None]:
def stems(text):
    """Stem every whitespace-separated word of ``text`` with the module-level
    stemmer ``ps`` and return the stems joined by single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tag string with its stemmed form
new_df['tags'] = new_df['tags'].map(stems)

In [None]:
new_df.iloc[0]['tags']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped via max_features=5000, with
# scikit-learn's built-in English stop-word list enabled via stop_words.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Fit the vectorizer on the tag strings, transform them, and convert the
# result to a dense array via .toarray().
vector = cv.fit_transform(new_df['tags']).toarray()

In [None]:
vector

In [None]:
vector.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity computed from `vector` alone (rows compared with each
# other); recommend() indexes the result by row position.
similarity = cosine_similarity(vector)

In [None]:
similarity

In [None]:
similarity.shape

In [None]:
new_df[new_df['title'] == 'Spider-Man'].index[0]

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df['title']``. When several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new_df`` has the given title.
    """
    # Use the row *position*, not the index label: `similarity` is a plain
    # array addressed by position, and labels differ from positions whenever
    # rows were dropped without resetting the index.
    matches = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    position = int(matches[0])

    ranked = sorted(
        enumerate(similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie explicitly instead of assuming it sorts first
    # (another row with an equal score could precede it).
    neighbours = [pos for pos, _ in ranked if pos != position][:max(top_n, 0)]
    for pos in neighbours:
        print(new_df['title'].iloc[pos])

In [None]:
recommend('The Dark Knight Rises')

In [None]:
import pickle

# Persist the movie table and the similarity matrix for later use.
# The output directory is created if missing, and each file is written inside
# a `with` block so the handle is flushed and closed even if pickling fails.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/movie_list.pkl', 'wb') as artifact_file:
    pickle.dump(new_df, artifact_file)
with open('artifacts/similarity.pkl', 'wb') as artifact_file:
    pickle.dump(similarity, artifact_file)