In [None]:
import pandas as pd      # for dataframes
import numpy as np       # for numerical operations
import matplotlib.pyplot as plt  # for plotting


In [None]:
# Read the two source tables; both paths are relative to the working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('data/movies.csv', 'data/credits.csv')
)

In [None]:
# Join the credits columns onto the movie rows, matching on the 'title' column.
# NOTE(review): if titles are not unique this join multiplies rows — confirm.
movies = pd.merge(movies, credits, on='title')

In [None]:
# Show how many rows there are for each value of 'original_language'.
movies.loc[:, 'original_language'].value_counts()


In [None]:
# Narrow the frame to the eight columns listed below.
movies = movies.loc[:, [
    'movie_id', 'title', 'overview', 'crew',
    'genres', 'keywords', 'cast', 'tagline',
]]

In [None]:
# Discard every row that has a missing value in any column.
movies = movies.dropna()

In [None]:
# Count fully duplicated rows. `.sum` must be called; without the parentheses
# the cell only displays the bound method object instead of a number.
movies.duplicated().sum()

In [None]:
import ast
def convert(text):
    """Parse a stringified list of dicts and return each entry's 'name' value.

    ``text`` is evaluated with ``ast.literal_eval``; names are returned in the
    order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [None]:
# Replace each stringified genre list with the list of names returned by convert().
movies['genres'] = movies['genres'].map(convert)

In [None]:
# Replace each stringified keyword list with the list of names returned by convert().
movies['keywords'] = movies['keywords'].map(convert)

In [None]:
# Preview the first five rows.
movies.head(n=5)

In [None]:
import ast

def convert_cast(text, limit=3):
    """Return the 'name' of the first ``limit`` entries of a stringified list.

    Parameters
    ----------
    text : str
        String representation of a list of dicts, parsed with
        ``ast.literal_eval``.
    limit : int, default 3
        Maximum number of names to return.

    Returns
    -------
    list
        Up to ``limit`` names, in input order. If parsing or iteration raises
        ValueError, SyntaxError or TypeError (e.g. NaN or malformed text),
        whatever was collected so far is returned, which is usually ``[]``.
    """
    names = []
    try:
        for position, member in enumerate(ast.literal_eval(text)):
            if position >= limit:
                break
            names.append(member['name'])
    except (ValueError, SyntaxError, TypeError):
        # Best effort: NaN, malformed text, or a literal that is not a list
        # of dicts yields the names gathered before the failure.
        pass
    return names


In [None]:
# Replace each stringified cast list with the list returned by convert_cast().
movies['cast'] = movies['cast'].map(convert_cast)

movies.head(n=5)

In [None]:
import ast

def fetch_director(text):
    """Return the name of the first crew entry whose job is 'Director'.

    ``text`` is a stringified list of dicts, parsed with ``ast.literal_eval``.
    Returns ``None`` when no entry has that job, or when ValueError,
    SyntaxError or TypeError is raised while processing the input.
    """
    try:
        director_names = (
            member['name']
            for member in ast.literal_eval(text)
            if member.get('job') == 'Director'
        )
        # First match wins; None when the crew list has no director.
        return next(director_names, None)
    except (ValueError, SyntaxError, TypeError):
        # NaN or malformed rows are treated the same as "no director".
        return None

# Preview the first five rows.
movies.head(n=5)

In [None]:
# Replace each stringified crew list with the value returned by fetch_director().
movies['crew'] = movies['crew'].map(fetch_director)



In [None]:
# Look at the overview text of the first row.
movies['overview'].iloc[0]

In [None]:
# Turn each overview string into a list of whitespace-separated words.
movies['overview'] = movies['overview'].map(
    lambda overview_text: overview_text.split()
)

In [None]:
def remove_spaces(words):
    """Return a list with every space removed from each item of ``words``."""
    return [word.replace(" ", "") for word in words]

In [None]:
# Preview the title column next to the four columns that get their spaces removed.
movies.loc[:, ['title', 'cast', 'crew', 'genres', 'keywords']].head()


In [None]:
def remove_spaces_list(words):
    """Strip spaces from every item of a list.

    Returns a new list with ``" "`` removed from each item; any argument that
    is not a ``list`` yields an empty list.
    """
    stripped = []
    if isinstance(words, list):
        for word in words:
            stripped.append(word.replace(" ", ""))
    return stripped

def remove_spaces_string(word):
    """Strip spaces from a single string.

    Returns ``word`` with ``" "`` removed; any argument that is not a ``str``
    yields the empty string.
    """
    return word.replace(" ", "") if isinstance(word, str) else ""

# Remove the spaces inside names so that e.g. "Sam Worthington" becomes the
# single token "SamWorthington". The three list-valued columns share one helper;
# 'crew' holds a single string per row and uses the string helper.
for list_column in ('cast', 'genres', 'keywords'):
    movies[list_column] = movies[list_column].apply(remove_spaces_list)
movies['crew'] = movies['crew'].apply(remove_spaces_string)
movies.head()


In [None]:
# Build one token list per movie by concatenating the per-row lists. The
# 'crew' value is a single string, so it is wrapped in a one-element list first.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['cast']
    + movies['keywords']
    + movies['crew'].map(lambda director: [director])
)

# Check result
movies.head()


In [None]:
# Take an explicit copy: later cells assign to df['tags'], and assigning into
# a column subset of `movies` triggers pandas' SettingWithCopyWarning.
df = movies[['movie_id', 'title', 'tags']].copy()
df.head()


In [None]:
# Flatten each token list into one space-separated string.
df['tags'] = df['tags'].map(lambda tokens: " ".join(tokens))


In [None]:
# Lower-case every tag string, then show the first one.
df['tags'] = df['tags'].map(lambda tag_text: tag_text.lower())
df['tags'].iloc[0]

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single PorterStemmer instance; stem() calls ps.stem on every token.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem each whitespace-separated token of ``text`` with the module-level ``ps``.

    The stemmed tokens are re-joined with single spaces and returned as one string.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tag string with its stemmed form.
df['tags'] = df['tags'].map(stem)

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
# cosine_similarity is called in this cell, so it has to be imported here;
# relying on a later cell's import raises NameError on a fresh kernel.
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF over the tag strings: at most 5000 features, English stop words removed.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(df['tags']).toarray()

# Pairwise cosine similarity between all rows of `vectors`.
similarity = cosine_similarity(vectors)


In [102]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`; display its shape.
similarity = cosine_similarity(vectors)
similarity.shape


(3965, 3965)

In [103]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # `similarity` is indexed by row position, so the title must be resolved
    # to a position. The index label (df[...].index[0]) only equals the
    # position while the index is an unbroken 0..n-1 range, which stops being
    # true once rows have been filtered out of the frame.
    matches = np.flatnonzero((df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("movie title not found: {!r}".format(movie))
    movie_pos = int(matches[0])
    distances = similarity[movie_pos]

    # Rank all other rows by similarity, highest first. The query row is
    # excluded explicitly rather than assumed to sort into first place.
    ranked = sorted(
        (pair for pair in enumerate(distances) if pair[0] != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for position, _score in ranked[:top_n]:
        print(df['title'].iloc[position])
 

In [105]:
# Example query: print the recommendations for one title.
recommend(movie='The Dark Knight Rises')


The Dark Knight
Batman Returns
Batman Forever
Batman Begins
Batman


In [106]:
import os
import pickle

# Make sure the output directory exists before writing into it.
os.makedirs('artifacts', exist_ok=True)

# Context managers guarantee each file is flushed and closed once the dump
# finishes; a bare open() passed to pickle.dump leaves the handle open.
with open('artifacts/movies.pkl', 'wb') as movies_file:
    pickle.dump(df, movies_file)
with open('artifacts/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)