In [None]:
import numpy as np
import pandas as pd
# Load the movie metadata and the credits table from CSV files in the working directory.
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Preview the first three rows of the movies table.
movies.head(3)

In [None]:
# Preview the first three rows of the credits table.
credits.head(3)

In [None]:
# Attach the credits columns to each movie by matching movies.id against credits.movie_id.
# Column names present in both frames get the default _x / _y suffixes, which is where the
# title_x / title_y columns used below come from.
movies=movies.merge(credits,left_on='id',right_on='movie_id')

In [None]:
# Preview the merged frame.
movies.head(2)

In [None]:
# Columns that play no part in building the recommendation tags.
unused_columns = [
    'budget',
    'homepage',
    'original_language',
    'spoken_languages',
    'original_title',
    'popularity',
    'release_date',
    'runtime',
    'status',
    'tagline',
    'title_y',
    'vote_average',
    'vote_count',
    'revenue',
    'production_countries',
    'id',
]
movies.drop(columns=unused_columns, inplace=True)

In [None]:
# NOTE(review): this renames on `credits`, but `title_x` is a column of the merged `movies`
# frame (later cells read movies[['movie_id','title_x','tags']]). Presumably `credits` has no
# `title_x` column, in which case this call changes nothing - confirm the intended target.
credits.rename(columns={'title_x': 'title'},inplace=True)

In [None]:
# Summarise the remaining columns of the merged frame.
movies.info()

In [None]:
# Keep an independent copy of the merged frame as it is before any preprocessing.
movies_buffer=movies.copy()

# Data preprocessing

In [None]:
# Count the missing values in each column.
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value, then renumber the rows 0..n-1.
# The similarity matrix built later is ordered by row position and is looked up through the
# frame's index labels, so labels must equal positions; dropna alone leaves gaps in the index.
movies.dropna(inplace=True)
movies.reset_index(drop=True,inplace=True)

In [None]:
# Re-check the missing-value counts after dropping incomplete rows.
movies.isnull().sum()

Extracting tags from the data:

In [None]:
import ast 
def genre_extract(obj):
    """Return the lower-cased ``name`` of every entry in a stringified list of dicts.

    Args:
        obj: String holding a Python-literal list of dicts, each with a ``name`` key.

    Returns:
        List of lower-cased names, in the order they appear in ``obj``.
    """
    entries = ast.literal_eval(obj)  # turn the string back into real Python objects
    names = []
    for entry in entries:
        names.append(entry['name'].lower())
    return names

In [None]:
# Replace each stringified genres list with a list of lower-cased genre names.
movies.genres=movies.genres.apply(genre_extract)

In [None]:
# Same extraction for keywords: genre_extract only relies on each entry having a 'name' key.
movies.keywords=movies.keywords.apply(genre_extract)

In [None]:
# Same extraction for production companies (lower-cased 'name' of each entry).
movies.production_companies=movies.production_companies.apply(genre_extract)

In [None]:
def cast_extract(obj):
    """Return the lower-cased character names of the first three cast entries.

    Args:
        obj: String holding a Python-literal list of dicts, each with a ``character`` key.

    Returns:
        List with at most three lower-cased character names, in listing order.
    """
    members = ast.literal_eval(obj)
    # Slicing copes with casts shorter than three entries.
    return [member['character'].lower() for member in members[:3]]

In [None]:
# Replace each stringified cast list with the lower-cased names of its first three characters.
movies.cast=movies.cast.apply(cast_extract)

In [None]:
def crew_extract(obj):
    """Return the lower-cased names of the crew entries whose job is ``Director``.

    Args:
        obj: String holding a Python-literal list of dicts with ``job`` and ``name`` keys.

    Returns:
        List of lower-cased director names (empty when no entry has that job).
    """
    directors = []
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            directors.append(member['name'].lower())
    return directors

In [None]:
# Replace each stringified crew list with the lower-cased names of its directors only.
movies.crew=movies.crew.apply(crew_extract)

In [None]:
# Preview the frame now that the list-like columns have been parsed.
movies.head()

In [None]:
def y(x):
    """Return the strings of ``x`` with every space removed (e.g. 'james cameron' -> 'jamescameron')."""
    return [i.replace(' ', '') for i in x]


# Fuse multi-word values into single tokens so that, once the tags are split into words,
# a full name counts as one tag instead of sharing its parts with unrelated names.
for column in ('cast', 'crew', 'production_companies', 'genres', 'keywords'):
    movies[column] = movies[column].apply(y)

In [None]:
# Split each overview text on whitespace so it becomes a list of words like the other tag columns.
movies.overview=movies.overview.apply(lambda x:x.split())

In [None]:
# Concatenate the per-movie lists into one combined tag list (list + list, row by row).
movies['tags'] = (
    movies['cast']
    + movies['crew']
    + movies['production_companies']
    + movies['genres']
    + movies['keywords']
    + movies['overview']
)

In [None]:
# Keep only the columns the recommender needs. .copy() makes new_df an independent frame, so
# the later assignments to new_df.tags do not raise SettingWithCopyWarning or depend on `movies`.
new_df=movies[['movie_id','title_x','tags']].copy()

In [None]:
# Turn each tag list into one space-separated, lower-cased string (the text corpus).
new_df['tags'] = new_df['tags'].apply(' '.join).str.lower()

In [None]:
# Write the id / title / tags table to minni.csv without the index column.
# At this point the tags are joined and lower-cased but not yet stemmed.
new_df.to_csv('minni.csv',index=False)

In [None]:
from nltk.stem.porter import PorterStemmer #for converting all similiar meaning tags to one eg: adventures,adventure->adventure
import re
# A single stemmer instance is enough; transform_tag reuses it for every tag string.
ps=PorterStemmer()
def transform_tag(tag):
    """Normalise one tag string: strip special characters, then stem every word.

    Args:
        tag: Space-separated tag text for one movie.

    Returns:
        The stemmed words joined by single spaces.
    """
    # Raw string: in a normal literal the backslash-W pair is an invalid escape sequence,
    # which recent Python versions warn about at compile time.
    tag=re.sub(r'\W+',' ', tag)#replace each run of non-word characters with one space
    return ' '.join(ps.stem(i) for i in tag.split())

In [None]:
# Sanity check only: show the transformed tags of the first row; nothing is stored.
transform_tag(new_df.iloc[0].tags)

In [None]:
# Replace every tag string with its cleaned, stemmed form.
new_df.tags=new_df.tags.apply(transform_tag)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model over the tag strings: English stop words are ignored and the vocabulary
# is capped at the 5000 most frequent terms.
transformer = CountVectorizer(stop_words='english', max_features=5000)
# Learn the vocabulary and count every kept term for each movie.
tag_counts = transformer.fit_transform(new_df.tags)
# Dense 2-D array: one row per movie, one column per kept term, holding occurrence counts.
vector = tag_counts.toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `vector`: sim[i][j] compares movie i with movie j.
# The term counts are non-negative, so the scores lie between 0 and 1.
sim=cosine_similarity(vector)

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in ``new_df.title_x``.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``new_df`` carries that title.
    """
    # Use the row *position*, not the index label: `sim` is a plain array ordered like the
    # rows of new_df, and labels stop matching positions as soon as rows have been dropped.
    positions = (new_df.title_x == movie).values.nonzero()[0]
    if len(positions) == 0:
        raise IndexError('movie {!r} not found in new_df.title_x'.format(movie))
    pos = positions[0]
    ranked = sorted(enumerate(sim[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the queried movie explicitly rather than assuming it sorts first;
    # another movie with identical tags would tie with it at the top.
    picks = [idx for idx, _ in ranked if idx != pos][:top_n]
    for idx in picks:
        print(new_df.iloc[idx].title_x)

# 

Saving the required components so they can be accessed from webpage.py (the launching code).

In [None]:
import pickle as pk
# Export the movie table (as a plain dict) and the similarity matrix for the web page.
# The with-blocks close each file handle, so the pickles are fully written to disk.
with open('movies.pkl','wb') as movies_file:
    pk.dump(new_df.to_dict(),movies_file)
with open('similiarity_matrix.pkl','wb') as sim_file:
    pk.dump(sim,sim_file)

# 

# Thank You