In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import pymongo
import json
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from textblob import TextBlob, Word

from scipy.stats import pearsonr

In [2]:
movies_df=pd.read_csv('./input/movies_metadata.csv',low_memory=False)
credits_df=pd.read_csv('./input/credits.csv')
keywords_df=pd.read_csv("./input/keywords.csv")

In [3]:
#initial sizes
print(movies_df.shape,credits_df.shape,keywords_df.shape)

(45466, 24) (45476, 3) (46419, 2)


In [4]:
keywords_df = keywords_df.drop_duplicates(subset='id')

In [5]:
#cleaning data
movies_df = movies_df.drop_duplicates(subset='id')  #remove duplicates 
keywords_df = keywords_df.drop_duplicates(subset='id')
credits_df=credits_df.drop_duplicates(subset='id')

credits_df["id"]=credits_df["id"].astype(object)    #convert int to object type

#removing non int id data
for index,row in movies_df.iterrows():
	try:
		row["id"]=int(row["id"])
	except:
		movies_df.drop(index,axis=0,inplace=True)
		
movies_df["id"]=pd.to_numeric(movies_df["id"])      #convert string into int type

#45432 
print(movies_df.shape)
movies_df=movies_df.dropna(subset=['overview'])

(45433, 24)


In [6]:
#merge the movies,credits and keywords datafrane
movies_copy0=pd.merge(movies_df,credits_df,how='left',right_on='id',left_on='id')
movies_copy0=pd.merge(movies_copy0,keywords_df,how='left',right_on='id',left_on='id')

In [7]:
#drop unwanted columns
columns_to_drop=["belongs_to_collection","budget","original_title","video",'spoken_languages','spoken_languages','production_companies',
       'production_countries','revenue']
movies_copy0 = movies_copy0.drop(columns_to_drop, axis=1)

In [8]:
#after merging drop any more duplicates which exist
movies_copy0=movies_copy0.drop_duplicates(subset="id")

In [9]:
c=movies_copy0["vote_average"].mean()
m=movies_copy0["vote_count"].quantile(0.9)

In [10]:
movies_copy0.shape

(44479, 19)

In [11]:
def rating(x,M=m,C=c):
    v=x['vote_count']
    r=x['vote_average']
    return (r*v + c*m)/(v+m)

In [12]:
movies_copy0['score']=movies_copy0.apply(rating,axis=1)

In [13]:
#get the top 3 values of the objects in the genre,cast and keywords columns
def get_val(row):
    dic=literal_eval(row)
    lst=[]
    count=0
    for d in dic:
        count+=1
        lst.append(d['name'])
        if (count==max):
            break
    return lst

#get all the values(for keywords)
def get_valmax(row):
    dic=literal_eval(row)
    lst=[]
    for d in dic:
        lst.append(d['name'])
    return lst

#getting only the director name from the crew
def get_dir(row):
    lst=[]
    dic=literal_eval(row)
    for i in dic:
        if i['job']=='Director':
            lst.append(i['name'])
            break
    return lst
    # return [d['name'] for d in dic]

In [14]:
movies_copy0 = movies_copy0.dropna(subset="crew")
movies_copy0=movies_copy0.dropna(subset='keywords')
movies_copy0=movies_copy0.dropna(subset='overview')

In [15]:
movies_copy0.shape

(44478, 20)

In [16]:
movies_copy0['Director']=list(movies_copy0['crew'].apply(get_dir))
movies_copy0['cast']=movies_copy0['cast'].apply(get_val)
movies_copy0['genres']=movies_copy0['genres'].apply(get_valmax)
movies_copy0['keywords']=movies_copy0['keywords'].apply(get_valmax)

In [17]:
movies_copy0['overview']=movies_copy0['overview'].apply(lambda x:x.split())

In [18]:
movies_copy0['Director']=movies_copy0['Director'].fillna('')

In [19]:
movies_copy0['cast'] = movies_copy0['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_copy0['genres'] = movies_copy0['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_copy0['keywords'] = movies_copy0['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_copy0['Director']=movies_copy0['Director'].apply(lambda x:[i.replace(" ","") for i in x])

In [20]:
def create_fet(x):
    return  ' '.join(x['keywords'])+' '+" ".join(x['cast'])+' '.join(x['genres'])+" ".join(x['Director'])

In [21]:
movies_copy0['tags']=movies_copy0.apply(create_fet,axis=1)

In [22]:
movies_copy0['tags']=movies_copy0['tags'].apply(lambda x:x.lower())

In [23]:
tfdif=CountVectorizer(max_features=20000,stop_words="english")

# trying old one

In [24]:
# count_matrix = tfdif.fit_transform(movies_copy0['tags'])
# count_matrix.shape

In [25]:
# cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# cosine_sim2.shape

In [26]:
# indices = pd.Series(movies_copy0.index, index = movies_copy0['title'])

In [27]:
# def get_recommendations(title, cosine_sim = cosine_sim2):
#     idx = indices[title]

#     sim_scores = list(enumerate(cosine_sim[idx]))  # Get the similarity scores of all movies wrt input movie
#     sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
#     sim_scores = sim_scores[1:31]

#     movie_indices = [i[0] for i in sim_scores]

#     return movies_copy0['title'].iloc[movie_indices]


In [28]:
# get_recommendations('John Wick: Chapter 2', cosine_sim2)

# Old over
1.get_recommendations('John Wick', cosine_sim2)

1. 28663    Kleines Arschloch - Der Film
1. 12066                 The Dog Problem
1. 16074             The Bread and Alley
1. 31822               The Biscuit Eater
1. 35244        The Spy with a Cold Nose
1. 41104              Red Dog: True Blue
1. 40506                     Dog Eat Dog
1. 34697    Devil Dog: The Hound of Hell
1. 42983                   Heavy Petting
1. 24236             Alone For Christmas
1. Name: title, dtype: object

In [29]:
lemmatizer = WordNetLemmatizer()
ps=PorterStemmer()

In [30]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [31]:
def lemmatizerfunc(sentence):
    return " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

In [32]:
def stemmer(sen):
    lst=[]
    for i in sen.split():
        lst.append(ps.stem(i))
    return " ".join(lst)

In [33]:
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J": 'a', 
                "N": 'n', 
                "V": 'v', 
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

In [34]:
# movies_copy0['l1']=movies_copy0['tags'].apply(lemmatizerfunc)
# movies_copy0['l2']=movies_copy0['tags'].apply(stemmer)
movies_copy0['l3']=movies_copy0['tags'].apply(lemmatize_with_postag)

ran here down

In [35]:
def upload_df():
    json_data = movies_copy0.to_json(orient="records")
    records = json.loads(json_data)
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["MoviesBig"]
    collection = db["meta"]
    collection.insert_many(records)

In [36]:
# vec1=tfdif.fit_transform(movies_copy0['l1']).toarray()
# vec2=tfdif.fit_transform(movies_copy0['l2']).toarray()
vec3=tfdif.fit_transform(movies_copy0['l3']).toarray()

### Cosine Similarity

In [37]:
# sim1= cosine_similarity(vec1)
# sim2= cosine_similarity(vec2)
sim3= cosine_similarity(vec3)

### Pearsons Relation

In [38]:
# n_rows = vec1.shape[0]
# correlation_matrix = np.zeros((n_rows, n_rows))
# for i in range(n_rows):
#     for j in range(n_rows):
#         r, p_value = pearsonr(vec1[i], vec1[j])
#         correlation_matrix[i, j] = r

In [39]:
def recommend(movie,similarity):
    movie_titles=movies_copy0['title'].values
    if movie not in movie_titles:
        print(movie,"Movie not Recognized")
        return
    index = movies_copy0[movies_copy0['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:70]:
        print(movies_copy0.iloc[i[0]].title , i)

In [45]:
recommend("John Wick",sim3)

John Wick: Chapter 2 (41328, 0.2926847035024818)
Stormheart (25370, 0.19611613513818404)
To Kill a Man (31682, 0.19611613513818404)
Revenge (43045, 0.19611613513818404)
Strong Island (43840, 0.19611613513818404)
Sexy Beast (4230, 0.179968508266339)
Pimp Bullies (25926, 0.17541160386140586)
Mother's Day (5051, 0.16012815380508716)
Creature with the Atom Brain (21231, 0.16012815380508716)
A House In The Hills (28983, 0.16012815380508716)
Summer in the Golden Valley (37912, 0.16012815380508716)
Detective Conan: The Fourteenth Target (39027, 0.16012815380508716)
Run Bitch Run (40208, 0.16012815380508716)
Wild Target (16041, 0.15384615384615385)
Eliminators (40651, 0.15191090506254998)
Things to Do in Denver When You're Dead (79, 0.14824986333222023)
China Strike Force (10048, 0.14824986333222023)
The Dagger of Kamui (11567, 0.14824986333222023)
Ultimate Heist (30973, 0.14824986333222023)
Essex Boys: Law of Survival (32717, 0.14824986333222023)
Naanum Rowdydhaan (37544, 0.14824986333222023)

# CosineSimilarity Results

## Lemmetizer Results for The Batman

1. The Dark Knight
1. The Oil, the Baby and the Transylvanians
1. The Testimony
1. Virtual Weapon
1. Scarlet Eye
1. The Last Breath
1. Mord in Eberswalde
1. All at Once
1. Batman v Superman: Dawn of Justice
1. Batman: The Dark Knight Returns, Part 1
1. The Dark Knight Rises
1. Batman: Under the Red Hood
1. Helter Skelter
1. Batman & Robin
1. The Demolitionist
1. Bonnie and Clyde Italian Style
1. Nighthawks
1. Superman
1. Dead Man Down
1. Batman Unmasked: The Psychology of the Dark Knight
1. Batman Forever
1. The Worthless
1. The Green Hornet
1. Blue Hill Avenue
1. The Phenix City Story
1. LEGO DC Comics Super Heroes: Batman: Be-Leaguered
1. Teenage Mutant Ninja Turtles
1. The Raid 2
1. Batman: Year One

## Lemmetizer with Postag results for "The Batman"

1. The Dark Knight
1. The Oil, the Baby and the Transylvanians
1. The Testimony
1. Virtual Weapon
1. Scarlet Eye
1. The Last Breath
1. Mord in Eberswalde
1. All at Once
1. Batman: The Dark Knight Returns, Part 1
1. Batman & Robin
1. Batman v Superman: Dawn of Justice
1. The Dark Knight Rises
1. Batman: Under the Red Hood
1. Helter Skelter
1. Bonnie and Clyde Italian Style
1. The Demolitionist
1. Superman
1. Dead Man Down
1. Batman Forever
1. The Worthless
1. The Green Hornet
1. Blue Hill Avenue
1. The Phenix City Story
1. Batman Unmasked: The Psychology of the Dark Knight
1. LEGO DC Comics Super Heroes: Batman: Be-Leaguered
1. Teenage Mutant Ninja Turtles
1. Little Criminals
1. Batman: Year One
1. Batman: Bad Blood

## Stemmer Tokenizer Results for "The Batman"

1. The Dark Knight
1. The Raid 2
1. Superman
1. The Avenue
1. Beck 28 - Familjen
1. Once Fallen
1. Sonny
1. Synecdoche, New York
1. Rockaway
1. Brother
1. Walking Tall: The Payback
1. Days of Santiago
1. The Face Behind the Mask
1. Cyclo
1. Driven To Kill
1. Kiwi!
1. A Skin Too Few: The Days of Nick Drake
1. American Me
1. The Bastard
1. Sexy Beast
1. Rolling Thunder
1. Graveyard of Honor
1. Lockdown
1. Run
1. The Oil, the Baby and the Transylvanians
1. Twelve
1. The Testimony
1. Virtual Weapon
1. Scarlet Eye

## Lemmetize with postag
### Changed the dataset a bit
1. The Dark Knight
1. Batman: The Dark Knight Returns, Part 1
1. Batman & Robin
1. Batman v Superman: Dawn of Justice
1. The Dark Knight Rises
1. Batman: Under the Red Hood
1. Helter Skelter
1. Bonnie and Clyde Italian Style
1. The Demolitionist
1. Superman
1. Dead Man Down
1. Batman Forever
1. The Worthless
1. The Green Hornet
1. Blue Hill Avenue
1. The Phenix City Story
1. Batman Unmasked: The Psychology of the Dark Knight
1. LEGO DC Comics Super Heroes: Batman: Be-Leaguered
1. Teenage Mutant Ninja Turtles
1. Little Criminals
1. Batman: Year One
1. Batman: Bad Blood
1. Ab Tak Chhappan
1. Human Target
1. The Miami Story
1. Brother
1. Essex Boys: Law of Survival
1. Batman Beyond: The Movie
1. Red Dust


In [41]:
sorted_movies=movies_copy0.sort_values('score',ascending=False)

In [42]:
sorted_movies['title']

312         The Shawshank Redemption
823                    The Godfather
10276    Dilwale Dulhania Le Jayenge
12443                The Dark Knight
2824                      Fight Club
                    ...             
11522                     Epic Movie
13523           Dragonball Evolution
19592                            NaN
29159                            NaN
35008                            NaN
Name: title, Length: 44478, dtype: object

In [43]:
json_data = sorted_movies.to_json(orient="records")
records = json.loads(json_data)

In [44]:
with open("sorted_movies.json", "w") as f:
    json.dump(records, f)

# Recomendations Based on Genre

In [51]:
movies_copy0

Unnamed: 0,adult,genres,homepage,id,imdb_id,original_language,overview,popularity,poster_path,release_date,...,title,vote_average,vote_count,cast,crew,keywords,score,Director,tags,l3
0,FALSE,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,"[Led, by, Woody,, Andy's, toys, live, happily,...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,30-10-1995,...,Toy Story,7.7,5415.0,"[TomHanks, TimAllen, DonRickles, JimVarney, Wa...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",7.639057,[JohnLasseter],jealousy toy boy friendship friends rivalry bo...,jealousy toy boy friendship friends rivalry bo...
1,FALSE,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,"[When, siblings, Judy, and, Peter, discover, a...",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,15-12-1995,...,Jumanji,6.9,2413.0,"[RobinWilliams, JonathanHyde, KirstenDunst, Br...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgame, disappearance, basedonchildren'sbo...",6.819293,[JoeJohnston],boardgame disappearance basedonchildren'sbook ...,boardgame disappearance basedonchildren'sbook ...
2,FALSE,"[Romance, Comedy]",,15602,tt0113228,en,"[A, family, wedding, reignites, the, ancient, ...",11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,22-12-1995,...,Grumpier Old Men,6.5,92.0,"[WalterMatthau, JackLemmon, Ann-Margret, Sophi...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, bestfriend, duringcreditsstinger, ol...",5.947230,[HowardDeutch],fishing bestfriend duringcreditsstinger oldmen...,fish bestfriend duringcreditsstinger oldmen wa...
3,FALSE,"[Comedy, Drama, Romance]",,31357,tt0114885,en,"[Cheated, on,, mistreated, and, stepped, on,, ...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,22-12-1995,...,Waiting to Exhale,6.1,34.0,"[WhitneyHouston, AngelaBassett, LorettaDevine,...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",5.717779,[ForestWhitaker],basedonnovel interracialrelationship singlemot...,basedonnovel interracialrelationship singlemot...
4,FALSE,[Comedy],,11862,tt0113041,en,"[Just, when, George, Banks, has, recovered, fr...",8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,10-02-1995,...,Father of the Bride Part II,5.7,173.0,"[SteveMartin, DianeKeaton, MartinShort, Kimber...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlifecrisis, confidence, aging, daugh...",5.670231,[CharlesShyer],baby midlifecrisis confidence aging daughter m...,baby midlifecrisis confidence age daughter mot...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44474,FALSE,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,"[Rising, and, falling, between, a, man, and, w...",0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,,...,Subdue,4.0,1.0,"[LeilaHatami, KouroshTahami, ElhamKorda]","[{'credit_id': '5894a97d925141426c00818c', 'de...",[tragiclove],5.629145,[HamidNematollah],tragiclove leilahatami kouroshtahami elhamkord...,tragiclove leilahatami kouroshtahami elhamkord...
44475,FALSE,[Drama],,111109,tt2028550,tl,"[An, artist, struggles, to, finish, his, work,...",0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,17-11-2011,...,Century of Birthing,9.0,3.0,"[AngelAquino, PerryDizon, HazelOrencio, JoelTo...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...","[artist, play, pinoy]",5.699036,[LavDiaz],artist play pinoy angelaquino perrydizon hazel...,artist play pinoy angelaquino perrydizon hazel...
44476,FALSE,"[Action, Drama, Thriller]",,67758,tt0303758,en,"[When, one, of, her, hits, goes, wrong,, a, pr...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,01-08-2003,...,Betrayal,3.8,6.0,"[ErikaEleniak, AdamBaldwin, JulieduPage, James...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",[],5.574492,[MarkL.Lester],erikaeleniak adambaldwin juliedupage jamesrem...,erikaeleniak adambaldwin juliedupage jamesrema...
44477,FALSE,[],,227506,tt0008536,en,"[In, a, small, town, live, two, brothers,, one...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,21-10-1917,...,Satan Triumphant,0.0,0.0,"[IwanMosschuchin, NathalieLissenko, PavelPavlo...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",[],5.639019,[YakovProtazanov],iwanmosschuchin nathalielissenko pavelpavlov ...,iwanmosschuchin nathalielissenko pavelpavlov a...


In [52]:
def create_genrelst(x):
    return ' '.join(x['genres'])

In [53]:
movies_copy0['genrelst']=movies_copy0.apply(create_genrelst)

KeyError: 'genres'

In [None]:
movies_copy0['gl3']=movies_copy0['genrelst'].apply(lemmatize_with_postag)