In [None]:
!pip install pandas scikit-learn nltk

In [None]:
import pandas as pd
import numpy as np
import ast
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import gzip

In [None]:
# Remote locations of the two CSV files used by this notebook.
credits_url = 'https://raw.githubusercontent.com/spoluan/tmdb-5000-movie-recommendation-system/hybrid-approach/tmdb-5000/tmdb_5000_credits.csv'
movies_url = 'https://raw.githubusercontent.com/spoluan/tmdb-5000-movie-recommendation-system/hybrid-approach/tmdb-5000/tmdb_5000_movies.csv'

# Download and parse both files (credits first, then movies) into DataFrames.
credits, movies_df = (
    pd.read_csv(source) for source in (credits_url, movies_url)
)

In [None]:
# Join the movie metadata with the credits table on their shared 'title' column.
# NOTE(review): a title is not guaranteed to be unique, so films that share a
# title would be cross-matched by this join -- confirm whether an id-based key
# is available in both tables and prefer it if so.
movies = movies_df.merge(credits, on='title')

# Keep only the columns used below, drop rows with any missing value, and
# renumber the remaining rows 0..n-1.
# - Chaining .dropna() (instead of dropna(inplace=True) on a column selection)
#   avoids pandas' SettingWithCopyWarning on this and the following cells.
# - Resetting the index keeps row labels equal to row positions; the similarity
#   matrix computed later is positional, and the labels are what gets saved
#   when the frame is exported with to_dict().
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
def convert(obj):
    """Extract every ``'name'`` value from a serialized list of dicts.

    Args:
        obj: A string containing a Python literal such as
            ``"[{'id': 1, 'name': 'Action'}]"``.

    Returns:
        list: The ``'name'`` value of each dict entry, in order. An empty list
        is returned when ``obj`` cannot be parsed or does not evaluate to a
        list/tuple; entries that are not dicts or have no ``'name'`` key are
        skipped.
    """
    try:
        records = ast.literal_eval(obj)
    except (ValueError, SyntaxError, TypeError):
        # Best-effort parsing: malformed or non-string input yields no names.
        return []

    # A literal that parses but is not a sequence of records (e.g. "None" or
    # "5") carries no names either; treat it like unparseable input rather
    # than letting a TypeError escape.
    if not isinstance(records, (list, tuple)):
        return []

    return [
        record['name']
        for record in records
        if isinstance(record, dict) and 'name' in record
    ]

def convert3(obj, limit=3):
    """Extract the first few ``'name'`` values from a serialized list of dicts.

    Args:
        obj: A string containing a Python literal such as
            ``"[{'name': 'A'}, {'name': 'B'}]"``.
        limit: Maximum number of names to return (default 3). Values below
            zero are treated as zero.

    Returns:
        list: Up to ``limit`` ``'name'`` values, in their original order. An
        empty list is returned when ``obj`` cannot be parsed or does not
        evaluate to a list/tuple; entries that are not dicts or have no
        ``'name'`` key are skipped.
    """
    try:
        records = ast.literal_eval(obj)
    except (ValueError, SyntaxError, TypeError):
        # Best-effort parsing: malformed or non-string input yields no names.
        return []

    # A parsed value that is not a sequence of records has nothing to offer.
    if not isinstance(records, (list, tuple)):
        return []

    names = [
        record['name']
        for record in records
        if isinstance(record, dict) and 'name' in record
    ]
    # max() guards against a negative limit turning into "all but the last k".
    return names[:max(limit, 0)]

def fetch_director(obj):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        obj: A string containing a Python literal list of dicts, each expected
            to carry ``'job'`` and ``'name'`` keys.

    Returns:
        list: A one-element list with the first director's name, or an empty
        list when no such entry exists, when ``obj`` cannot be parsed, or when
        it does not evaluate to a list/tuple. Entries that are not dicts or
        lack a ``'job'``/``'name'`` key are skipped.
    """
    try:
        records = ast.literal_eval(obj)
    except (ValueError, SyntaxError, TypeError):
        # Best-effort parsing: malformed or non-string input yields no result.
        return []

    # A parsed value that is not a sequence of records has nothing to offer.
    if not isinstance(records, (list, tuple)):
        return []

    for record in records:
        if (
            isinstance(record, dict)
            and record.get('job') == 'Director'
            and 'name' in record
        ):
            # Only the first matching entry is reported.
            return [record['name']]
    return []

In [None]:
# Replace each serialized column with the list of names extracted from it.
# The parsers expect the raw string form, so run this cell once per fresh load
# rather than re-running it on columns that have already been converted.
for column, parser in (
    ('genres', convert),
    ('keywords', convert),
    ('cast', convert3),
    ('crew', fetch_director),
):
    movies[column] = movies[column].apply(parser)

In [None]:
def _squash_spaces(names):
    """Return ``names`` with every space removed from each string."""
    return [name.replace(' ', '') for name in names]

# Break the free-text overview into a list of whitespace-separated words.
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Remove the spaces inside each name so a multi-word name stays one token once
# the lists are later joined into a single space-separated string.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(_squash_spaces)

In [None]:
# Concatenate the five token lists of each movie into a single list.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# Work on an independent copy that carries only the id, the title and the tags,
# with each token list flattened into one lower-cased, space-separated string.
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda tokens: ' '.join(tokens).lower())

In [None]:
ps = PorterStemmer()

def stem(text):
    """Stem every whitespace-separated word of ``text`` with the Porter stemmer.

    Args:
        text: A string of words separated by whitespace.

    Returns:
        str: The stemmed words joined back together with single spaces.
    """
    return ' '.join(ps.stem(word) for word in text.split())

# Reduce every tag string to its stemmed form.
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
# Bag-of-words model: vocabulary capped at 5000 terms, English stop words ignored.
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit on the tag strings, then expand the sparse count matrix into a dense
# array with one row per movie.
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

In [None]:
# Pairwise cosine similarity between every pair of rows in `vectors`; the
# result is a square matrix whose entry [i, j] compares movie i with movie j
# by row position.
similarity = cosine_similarity(vectors)

In [None]:
def _write_gzipped_pickle(payload, path):
    """Pickle ``payload`` into a gzip-compressed file at ``path``."""
    with gzip.open(path, 'wb') as handle:
        pickle.dump(payload, handle)

# Persist the movie table (as a plain dict) and the similarity matrix.
# Both files are gzip-compressed, so they must be reopened with gzip.open.
_write_gzipped_pickle(new_df.to_dict(), 'movie_list.pkl')
_write_gzipped_pickle(similarity, 'similarity.pkl')