In [None]:
import numpy as np
import pandas as pd

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): with no category/module filter this hides *all* warnings,
# including pandas' SettingWithCopyWarning and library deprecation notices
# that may be relevant to the cells below - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two source tables. The paths are relative, so the notebook must be
# started from the directory that contains the `datasets/` folder.
movies = pd.read_csv('datasets/movies.csv')
credits = pd.read_csv('datasets/credits.csv')

In [None]:
# Preview the first row of the movies table.
movies.head(1)

In [None]:
# Preview the first row of the credits table.
credits.head(1)

In [None]:
# Show the raw 'cast' and 'crew' values of the first credits row.
# A cell only displays its last expression, so both arrays are returned as one
# tuple; as two separate statements the 'cast' values were silently discarded.
credits.head(1)['cast'].values, credits.head(1)['crew'].values

In [None]:
# Join the two tables on the shared 'title' column (merge defaults to an inner join).
# NOTE(review): if titles are not unique in either table, a title-based join
# yields one row per matching pair - confirm uniqueness or join on an id column.
movies_db = movies.merge(credits, on = 'title')

In [None]:
# Preview the first row of the merged table.
movies_db.head(1)

In [None]:
# List the merged table's columns to decide which ones to keep.
movies_db.columns

In [None]:
# Keep only the columns used to build the recommender:
#   id, title, overview, genres, keywords, cast, crew
#
# .copy() makes this an independent frame. Later cells drop rows and assign
# columns on it; without the copy those writes target a slice of `movies_db`,
# which is what pandas reports as SettingWithCopyWarning.
movies_final_db = movies_db[['id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
# Preview the first row of the reduced table.
movies_final_db.head(1)

In [None]:
# Count missing values per column.
movies_final_db.isnull().sum()

In [None]:
# Drop every row that has a null in any column.
# The result is rebound rather than mutated in place, so the cell stays safe
# when the frame is a slice of another frame. The index is renumbered 0..n-1
# so that index labels equal row positions: later cells mix label lookups
# (`[...][0]`, `.index[0]`) with positional access into `vectors` and
# `similarity`, and those only agree when the index has no gaps.
movies_final_db = movies_final_db.dropna().reset_index(drop=True)

In [None]:
# Re-count missing values per column to confirm the drop worked.
movies_final_db.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (display only; nothing is removed).
movies_final_db.duplicated().sum()

In [None]:
# Inspect the raw 'genres' value of the first row to see its format before parsing.
movies_final_db.iloc[0].genres

In [None]:
import ast

In [None]:
def convert(obj):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    ``obj`` is parsed with ``ast.literal_eval``; the names are returned in the
    order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace the stringified list-of-dicts in each of these columns with a plain
# list of the entries' names.
for column in ('genres', 'keywords', 'cast'):
    movies_final_db[column] = movies_final_db[column].apply(convert)

In [None]:
# Preview the first row after parsing genres, keywords and cast.
movies_final_db.head(1)

In [None]:
# Inspect the raw 'crew' value of the row labelled 0 (label-based lookup).
movies_final_db['crew'][0]

In [None]:
def fetch_director(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    ``obj`` is a stringified list of dicts parsed with ``ast.literal_eval``.
    Entries are scanned in order and the scan stops at the first one whose
    ``'job'`` equals ``'Director'``.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

# Replace each raw crew string with the list returned by fetch_director
# (the first director's name, or an empty list when there is none).
movies_final_db['crew'] = movies_final_db['crew'].apply(fetch_director)

In [None]:
# Preview the first row after reducing 'crew' to the director.
movies_final_db.head(1)

In [None]:
# Preview: split each overview string on whitespace into a list of words.
# The result is only displayed here; the next cell stores it.
movies_final_db['overview'].apply(lambda x : x.split())

In [None]:
# Store the overview as a list of whitespace-separated words so it can be
# concatenated with the other list-valued columns.
movies_final_db['overview'] = movies_final_db['overview'].apply(lambda x : x.split())

In [None]:
# Preview the first row after splitting the overview.
movies_final_db.head(1)

In [None]:
# Remove the spaces inside every entry of these list columns, so that a
# multi-word entry becomes a single token once the lists are joined into text.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies_final_db[column] = movies_final_db[column].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [None]:
# Preview the first row after removing spaces from the list entries.
movies_final_db.head(1)

In [None]:
# Build the new 'tags' column by concatenating the per-row lists of these
# columns, in this order: overview, genres, keywords, cast, crew.
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
tags = movies_final_db[tag_columns[0]]
for column in tag_columns[1:]:
    tags = tags + movies_final_db[column]
movies_final_db['tags'] = tags

In [None]:
# Preview the first row including the new 'tags' column.
movies_final_db.head(1)

In [None]:
# Inspect the combined tag list of the row labelled 0.
movies_final_db['tags'][0]

### new dataframe

In [None]:
# Working frame for the recommender: id, title and the combined tags.
# .copy() makes it independent of movies_final_db; later cells assign to
# new_df['tags'], which on a plain column slice is a write to a possible copy
# (pandas' SettingWithCopyWarning).
new_df = movies_final_db[['id','title','tags']].copy()

In [None]:
# Display the working frame.
new_df

In [None]:
# Preview: join each tag list into one space-separated string.
# The result is only displayed here; the next cell stores it.
new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
# Store the tags as a single space-separated string per row.
new_df['tags'] = new_df['tags'].apply(" ".join)

In [None]:
# Preview the first rows after joining the tags into strings.
new_df.head()

In [None]:
# Inspect the joined tag string of the row labelled 0.
new_df['tags'][0]

In [None]:
# Preview: lower-case every tag string.
# The result is only displayed here; the next cell stores it.
new_df['tags'].apply(lambda x: x.lower())

In [None]:
# Store the tags lower-cased.
new_df['tags'] = new_df['tags'].apply(str.lower)

In [None]:
# Preview the first rows after lower-casing the tags.
new_df.head()

# Applying stemming

In [None]:
import nltk

In [None]:
# If the nltk import above fails, install the package first: %pip install nltk

In [None]:
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance; the stem() function below reads it as the global `ps`.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of ``text`` with the global ``ps``.

    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Preview: stem every tag string.
# The result is only displayed here; the next cell repeats the work and stores it.
new_df['tags'].apply(stem)

In [None]:
# Store the stemmed tags.
new_df['tags'] = new_df['tags'].apply(stem)

### Text vectorization (scikit-learn)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: keep at most 5000 vocabulary terms and drop
# scikit-learn's built-in English stop words.
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [None]:
# Fit the vectorizer on the tag strings and convert the resulting sparse
# count matrix to a dense NumPy array (one row per row of new_df).
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# A cell only displays its last expression, so the shape is printed explicitly;
# as a bare statement it was computed and then discarded.
print(vectors.shape)
vectors

In [None]:
# Vocabulary kept by the vectorizer (at most `max_features` terms).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
feature_names
# len(feature_names)

### Cosine similarity (scikit-learn)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Preview the pairwise cosine-similarity matrix of the count vectors.
# NOTE(review): this builds the full matrix only to display it; the matrix is
# computed again and stored as `similarity` a few cells below.
cosine_similarity(vectors)

In [None]:
# Shape of the pairwise similarity matrix. With a single input,
# cosine_similarity compares every row with every row, so the result is
# (n_rows, n_rows); reading that off `vectors` avoids rebuilding the whole
# matrix just to look at its shape.
(vectors.shape[0], vectors.shape[0])

In [None]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(vectors)

In [None]:
# Display the stored similarity matrix.
similarity

In [None]:
def recommend(movie, data=None, scores=None, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``'title'`` column.
    data : pandas.DataFrame, optional
        Frame with a ``'title'`` column whose row order matches ``scores``.
        Defaults to the notebook-level ``new_df``.
    scores : 2-D array-like, optional
        Pairwise similarity matrix; row ``k`` belongs to the ``k``-th row of
        ``data``. Defaults to the notebook-level ``similarity``.
    top_n : int, optional
        Number of titles to print (default 5).

    Raises
    ------
    IndexError
        If no row carries the requested title.
    """
    frame = new_df if data is None else data
    matrix = similarity if scores is None else scores

    # Find the movie by row *position*, not by index label: the similarity
    # matrix is positional, and labels stop matching positions as soon as rows
    # have been dropped from the frame without renumbering the index.
    movie_pos = next(
        (pos for pos, title in enumerate(frame['title']) if title == movie),
        None,
    )
    if movie_pos is None:
        raise IndexError(f"movie {movie!r} not found in the 'title' column")

    distances = matrix[movie_pos]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an equal score could otherwise take its place and the
    # queried movie would be recommended to itself.
    candidates = [
        (pos, score) for pos, score in enumerate(distances) if pos != movie_pos
    ]
    recommended_list = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    for pos, _ in recommended_list:
        print(frame['title'].iloc[pos])

In [None]:
# Smoke test: print the recommendations for one known title.
recommend('Avatar')
# recommend('Fight Club')

### Exporting the movie data and similarity matrix for the website

In [None]:
import pickle

In [None]:
# Serialize the working frame. The `with` block closes the file once the dump
# finishes; a bare open() left the handle open for the life of the kernel.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)

In [None]:
# Export the same frame as a plain dict (DataFrame.to_dict() default layout),
# closing the file deterministically via `with`.
with open('movies_dict.pkl', 'wb') as movies_dict_file:
    pickle.dump(new_df.to_dict(), movies_dict_file)

In [None]:
# Serialize the similarity matrix, closing the file deterministically via `with`.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)