In [2]:
import numpy as np 
import pandas as pd

In [3]:
movies = pd.read_csv(r".\TMDB 5000 Movie Dataset\tmdb_5000_movies.csv")
credits = pd.read_csv(r".\TMDB 5000 Movie Dataset\tmdb_5000_credits.csv")


### Data Preprocessing

In [4]:
movies = movies.merge(credits, on='title')      #Merged on the basis of title

In [5]:
# Genres, id , Keywords, title, overview, cast, crew            Columns to keep

movies = movies[['id','title','overview','genres','keywords','cast','crew']]

In [6]:
movies.isnull().sum()       #Check for null values 

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)     #Drop the rows where null values exists

In [8]:
movies.duplicated().sum()       #Check for duplicates

np.int64(0)

In [9]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [10]:
import ast
def convert(obj):  
    list = [i['name'] for i in ast.literal_eval(obj) ]
    return list

In [11]:
movies['genres'] = movies['genres'].apply(convert)  #Preprocess the Genres Column

In [12]:
movies['keywords'] = movies['keywords'].apply(convert)      #Preprocess the keywords Column

In [14]:
def convert_cast(obj):
    list = [ j['name']  for j in ast.literal_eval(obj)[:3] ]
    return list

In [15]:
movies['cast'] = movies['cast'].apply(convert_cast)     #Preprocess the cast Column

In [16]:
def fetch_director(obj):
    list = [ j['name']  for j in ast.literal_eval(obj) if j['job'] == 'Director' ]
    return list

In [17]:
movies['crew'] = movies['crew'].apply(fetch_director)        #Preprocess the crew Column by fetching Director name

In [18]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())       # Convert the overview column from str into a list

In [19]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [20]:
#Remove space in between the values 

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])


In [21]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [22]:
# Creating another column tags to concatenate Genres, Keywords, overview, cast, crew  columns

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [23]:
new_df = movies[['id','title','tags']]  #New dataframe

In [24]:
new_df # The New dataframe consists of id, title, tags

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [25]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))     #Convert the list into string for column tag


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))     #Convert the list into string for column tag


In [26]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())            #Convert all string into lowercase for column tag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())            #Convert all string into lowercase for column tag


In [27]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

### Normalization of text
* using Lemmatization


In [28]:
#Lemmatization to normalize the text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
wl = WordNetLemmatizer()
stopwords = stopwords.words('english')


  from scipy.stats import fisher_exact


In [29]:
def lemmatize_text(text):
    words = text.split()
    words = [wl.lemmatize(i) for i in words if i not in stopwords]
    return " ".join(words)    

In [30]:
new_df['tags'] = new_df['tags'].apply(lemmatize_text) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lemmatize_text)


### Vectorization of text

* Technique:
Bag of Words(BOW) with N_grams

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=10000,ngram_range=(1,2))
vectors = cv.fit_transform(new_df['tags']).toarray()

### Distance calculation using Cosine Similarity

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
similarity = cosine_similarity(vectors)

### Function For the Recommendation

In [None]:
def recommend(movie, threshold=0.15):
    movie_index = new_df[new_df['title'] == movie].index[0]     # Get the index of the mo
    
    distances = similarity[movie_index]                         # Cosine similarities with all the other movies

    # Get (index, score) pairs where score >= threshold 
    filtered_movies = [(i,score) for i, score in enumerate(distances) if  score >= threshold]
    
    # Sort by similarity score in descending order and take top 5
    top_movies = sorted(filtered_movies, reverse=True, key=lambda x: x[1])[1:6]
    
    
    for i, _ in top_movies:
        print(new_df.iloc[i]['title'])
    

In [None]:
recommend('Avatar')

Aliens
Independence Day
Titan A.E.
Small Soldiers
Ender's Game
