In [2]:
import pandas as pd 
import ast
from ast import literal_eval 
import nltk
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


movies = pd.read_csv(r"tmdb_5000_movies.csv")
credits = pd.read_csv(r"tmdb_5000_credits.csv")
movies = movies.drop(columns = ['budget','homepage','original_title','status','production_companies','production_countries','tagline','spoken_languages','vote_average','vote_average'])
movies.sample(1)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,release_date,revenue,runtime,title,vote_count
1247,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name...",13536,"[{""id"": 9826, ""name"": ""murder""}, {""id"": 180668...",en,A man struggling to come to terms with the sin...,3.476966,2002-09-06,22433915,108.0,City By The Sea,92


In [3]:
## merging movies with credits based on title 
movies = movies.merge(credits, on='title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genres             4809 non-null   object 
 1   id                 4809 non-null   int64  
 2   keywords           4809 non-null   object 
 3   original_language  4809 non-null   object 
 4   overview           4806 non-null   object 
 5   popularity         4809 non-null   float64
 6   release_date       4808 non-null   object 
 7   revenue            4809 non-null   int64  
 8   runtime            4807 non-null   float64
 9   title              4809 non-null   object 
 10  vote_count         4809 non-null   int64  
 11  movie_id           4809 non-null   int64  
 12  cast               4809 non-null   object 
 13  crew               4809 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 526.1+ KB


In [4]:
# removing unnnecessary cols
movies =movies.drop(columns=['vote_count','movie_id'])

In [5]:
## removing nan inputs
movies.dropna(inplace = True)
movies.isnull().sum()


genres               0
id                   0
keywords             0
original_language    0
overview             0
popularity           0
release_date         0
revenue              0
runtime              0
title                0
cast                 0
crew                 0
dtype: int64

In [6]:
movies.sample(1)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,release_date,revenue,runtime,title,cast,crew
1790,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...",11460,"[{""id"": 1562, ""name"": ""hostage""}, {""id"": 2250,...",en,After attending the funeral of her grandmother...,8.802626,2005-08-04,57891803,85.0,Red Eye,"[{""cast_id"": 1, ""character"": ""Lisa Reisert"", ""...","[{""credit_id"": ""52fe44449251416c7502eb99"", ""de..."


# Functions to extract names of cast, Director and to remove sapces 

In [7]:
def extract_cast_names(obj):
    if pd.isna(obj):
        return[]
    l = [] 
    counter =0
    for i in ast.literal_eval(obj):#it converts a string that looks like a Python object into the real object safely.
        if counter == 3: return l
        l.append(i['name'])
        counter += 1 
    return l

## to fetch director 
def extract_Director_names(obj):
    if pd.isna(obj):
        return[]
    l = [] 
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            l.append(i['name'])
    return l

## to Remove spaces form strings in the columns 
def rem_spaces(lst):
    if not isinstance(lst, list):
        return []
    return [i.replace(" ", "") for i in lst]



In [8]:
movies['cast'] = movies['cast'].apply(extract_cast_names)
movies['crew'] =movies['crew'].apply(extract_Director_names)
movies.sample(1)

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,release_date,revenue,runtime,title,cast,crew
297,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 53, ""name...",1372,"[{""id"": 526, ""name"": ""rebel""}, {""id"": 736, ""na...",en,An ex-mercenary turned smuggler. A Mende fishe...,52.792678,2006-12-07,170877916,143.0,Blood Diamond,"[Leonardo DiCaprio, Djimon Hounsou, Jennifer C...",[Edward Zwick]


In [9]:
movies['keywords'] = movies['keywords'].apply(extract_cast_names)
movies['genres'] = movies['genres'].apply(extract_cast_names)
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [10]:
movies.head()

Unnamed: 0,genres,id,keywords,original_language,overview,popularity,release_date,revenue,runtime,title,cast,crew
0,"[Action, Adventure, Fantasy]",19995,"[culture clash, future, space war]",en,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,2009-12-10,2787965087,162.0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island]",en,"[Captain, Barbossa,, long, believed, to, be, d...",139.082615,2007-05-19,961000000,169.0,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent]",en,"[A, cryptic, message, from, Bond’s, past, send...",107.376788,2015-10-26,880674609,148.0,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Action, Crime, Drama]",49026,"[dc comics, crime fighter, terrorist]",en,"[Following, the, death, of, District, Attorney...",112.31295,2012-07-16,1084939099,165.0,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion]",en,"[John, Carter, is, a, war-weary,, former, mili...",43.926995,2012-03-07,284139100,132.0,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [11]:
## making a new column named tags by adding other columns 
movies['tags'] = movies['overview']+movies['genres'] + movies['cast'] +movies['crew']

In [12]:
## dropping other columns and making a new df
df = movies[['id','title','tags']]
df['tags'] =df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] =df['tags'].apply(lambda x: " ".join(x))


In [13]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y) # returns a sting

## Stemming to remove extra similar words like Love, Loving, Loved to Love,Love,Love and etc

In [14]:

df['tags'] = df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [15]:
df['tags']= df['tags'].apply(lambda x:x.lower()).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags']= df['tags'].apply(lambda x:x.lower()).copy()


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000,stop_words = 'english')
vectors =cv.fit_transform(df['tags']).toarray()



In [17]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(5000,))

In [18]:
len(cv.get_feature_names_out())

5000

In [19]:
## cosine similarity
similarity = cosine_similarity(vectors)
similarity[0]


array([1.        , 0.12510865, 0.11420805, ..., 0.        , 0.05839572,
       0.        ], shape=(4805,))

In [25]:
def recommend(movie):
    if movie not in df['title'].values:## if movies is not in the dataset
        print("Movie not found")
        return

    movie_idx = df[df['title'] == movie].index[0]
    distances = similarity[movie_idx]
    movies_list = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:6]
    for i in movies_list:
        print(df.iloc[i[0]]['title'])

In [27]:
pickle.dump(df,open('movies.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [26]:
recommend('Batman')

Batman
The R.M.
Big Fish
Curse of the Golden Flower
Code of Honor
