In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('dataset/data_movies.csv')
credits = pd.read_csv('dataset/data_credits.csv')

In [3]:
movies.head()
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
#renaming of feature in movie data frame so that both dataframe has same column name and then we can merge them 
movies = movies.rename(columns={'id': 'movie_id'})

In [5]:
# Merging movies and credit data on basis of movie_id (common feature in both dfs)

movies = movies.merge(credits, on = 'movie_id')

In [6]:
# drop that features that will not helpful in creating tags for recommender system
# use ony imp features that will help  eg:
# genres, 
# movie_id (**as primary key**)
# keywords, 
# overview,
# original_title,
# cast,
# crew,


movies = movies[['genres', 'movie_id', 'keywords', 'overview', 'original_title', 'cast', 'crew']]
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres          4803 non-null   object
 1   movie_id        4803 non-null   int64 
 2   keywords        4803 non-null   object
 3   overview        4800 non-null   object
 4   original_title  4803 non-null   object
 5   cast            4803 non-null   object
 6   crew            4803 non-null   object
dtypes: int64(1), object(6)
memory usage: 262.8+ KB


In [7]:
# only 3 missing values found in overview feature, its not a big number and we don't have their overview values so drop them 
movies.dropna(inplace = True)

In [8]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
# genres feature is in string format, we have to convert it into list of dictionaries
# ast.literal_eval() will convert string to list of dictionaries

import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [10]:
#convert function will convert string to list of dictionaries
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [11]:
#write comments for the below code
movies['genres'] = movies.genres.apply(convert)

In [12]:
movies.keywords = movies.keywords.apply(convert)

In [13]:
def convert_cast(obj):
    List = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            List.append(i['name'])
            counter+=1
        else:
            break
    return List

In [14]:
movies.cast = movies.cast.apply(convert_cast)

In [15]:
def fetch_director(text):
    List = []

    for i in ast.literal_eval(text):
        if i['job'] =='Director':
            List.append(i['name'])
            break
    return List
        

In [16]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [17]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [18]:
# Remove spaces in genres,cast,crew. like Science Fiction -> ScienceFiction
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])

In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_df = movies[['movie_id', 'original_title', 'tags']]

In [21]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [22]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [23]:
import nltk

In [24]:
#apply stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [25]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [26]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english' )

In [28]:
vectors  = cv.fit_transform(new_df['tags']).toarray()

In [29]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [30]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
similarity = cosine_similarity(vectors)

In [33]:
sorted(list(enumerate(similarity[0])),reverse=True, key = lambda x : x[1]) [1:6]

[(1213, 0.28676966733820225),
 (2403, 0.26901379342448517),
 (3723, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [34]:
def recommend(movie):
    movie_index = new_df[new_df['original_title']== movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key = lambda x: x[1])[1:6]
    # print ((movie_list))
    for i in movie_list:
        print(new_df.iloc[i[0]].original_title)
    # return 

In [35]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [36]:
recommend('Batman Begins')

The Dark Knight
Batman
The Dark Knight Rises
10th & Wolf
Rockaway


In [37]:
new_df.original_title.head(20)

0                                          Avatar
1        Pirates of the Caribbean: At World's End
2                                         Spectre
3                           The Dark Knight Rises
4                                     John Carter
5                                    Spider-Man 3
6                                         Tangled
7                         Avengers: Age of Ultron
8          Harry Potter and the Half-Blood Prince
9              Batman v Superman: Dawn of Justice
10                               Superman Returns
11                              Quantum of Solace
12     Pirates of the Caribbean: Dead Man's Chest
13                                The Lone Ranger
14                                   Man of Steel
15       The Chronicles of Narnia: Prince Caspian
16                                   The Avengers
17    Pirates of the Caribbean: On Stranger Tides
18                                 Men in Black 3
19      The Hobbit: The Battle of the Five Armies


In [38]:
recommend("The Avengers")

Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Captain America: The First Avenger
Iron Man


In [39]:
import pickle

In [40]:
pickle.dump(new_df, open('movies.pkl','wb'))

In [41]:
pickle.dump(similarity, open('similarity.pkl','wb'))

In [42]:
new_df.head(10)

Unnamed: 0,movie_id,original_title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
5,559,Spider-Man 3,the seemingli invinc spider-man goe up against...
6,38757,Tangled,when the kingdom' most wanted-and most charmin...
7,99861,Avengers: Age of Ultron,when toni stark tri to jumpstart a dormant pea...
8,767,Harry Potter and the Half-Blood Prince,"as harri begin hi sixth year at hogwarts, he d..."
9,209112,Batman v Superman: Dawn of Justice,fear the action of a god-lik super hero left u...


In [43]:
pickle.dump(similarity, open('similarity.pkl','wb'))