In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import spatial
from IPython.display import display
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA  # to apply PCA
from rake_nltk import Rake
import nltk
import re

In [3]:
# Load the Hindi movies dataset
movies_hindi = pd.read_csv("data/bollywood_full_1950-2019.csv")

In [4]:
# Select relevant columns and rename them
movies_hindi = movies_hindi[["imdb_id", 'original_title', "genres", "story", "actors"]]
movies_hindi.rename(columns= {"actors": "cast", "imdb_id": "id"}, inplace=True)

In [5]:
movies_hindi

Unnamed: 0,id,original_title,genres,story,cast
0,tt8291224,Uri: The Surgical Strike,Action|Drama|War,Divided over five chapters the film chronicle...,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...
1,tt9472208,Battalion 609,War,The story revolves around a cricket match betw...,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...
2,tt6986710,The Accidental Prime Minister,Biography|Drama,Based on the memoir by Indian policy analyst S...,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...
3,tt8108208,Why Cheat India,Crime|Drama,The movie focuses on existing malpractices in ...,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...
4,tt6028796,Evening Shadows,Drama,While gay rights and marriage equality has bee...,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...
...,...,...,...,...,...
4325,tt0268614,Samadhi,Drama,The story is based on the true incident at INA...,Ashok Kumar|Nalini Jaywant|Kuldip Kaur|Shyam|M...
4326,tt0244182,Sangram,Drama,After the death of his wife a policeman fails...,Ashok Kumar|Nalini Jaywant|Nawab|Sajjan|Tiwari...
4327,tt0269826,Sargam,Drama|Family,,Raj Kapoor|Rehana|Om Prakash|David Abraham|Rad...
4328,tt0243555,Sheesh Mahal,Drama,Thakur Jaspal Singh lives in the prestigious a...,Sohrab Modi|Naseem Banu|Pushpa Hans|Nigar Sult...


In [6]:
# Load additional datasets for crew information
movies_hindi_makers = pd.read_csv("data/bollywood_crew_1950-2019.csv")
movies_hindi_crew_data = pd.read_csv("data/bollywood_crew_data_1950-2019.csv")
movies_hindi_writers_data = pd.read_csv("data/bollywood_writers_data_1950-2019.csv")

In [7]:
movies_hindi_makers.head()

Unnamed: 0,imdb_id,directors,writers
0,tt0042184,nm0025608,nm0025608|nm0324690
1,tt0042207,nm0490178,nm0161032|nm1879927
2,tt0042225,nm0707533,\N
3,tt0042233,nm0788880,nm0592578|nm0788880
4,tt0042380,nm0439074,nm1278450|nm0438022|nm1301772


In [8]:
# Filter out movies with missing director or writer information
movies_hindi_makers = movies_hindi_makers[(movies_hindi_makers["directors"] != "\\N") & (movies_hindi_makers["writers"] != "\\N")]

# Merge the datasets to combine movie and crew information
movies_hindi = movies_hindi.merge(movies_hindi_makers, how="left", left_on="id", right_on="imdb_id")

In [9]:
# Drop unnecessary columns
movies_hindi.drop(["imdb_id"], inplace=True, axis=1)
movies_hindi

Unnamed: 0,id,original_title,genres,story,cast,directors,writers
0,tt8291224,Uri: The Surgical Strike,Action|Drama|War,Divided over five chapters the film chronicle...,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,nm2336554,nm2336554
1,tt9472208,Battalion 609,War,The story revolves around a cricket match betw...,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,nm10342614,nm2366632
2,tt6986710,The Accidental Prime Minister,Biography|Drama,Based on the memoir by Indian policy analyst S...,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,nm9049092,nm9049093|nm10376880|nm9049092|nm4577451|nm103...
3,tt8108208,Why Cheat India,Crime|Drama,The movie focuses on existing malpractices in ...,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,nm2265536,nm8006178
4,tt6028796,Evening Shadows,Drama,While gay rights and marriage equality has bee...,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,nm1482161,nm1481242|nm1482161
...,...,...,...,...,...,...,...
4417,tt0268614,Samadhi,Drama,The story is based on the true incident at INA...,Ashok Kumar|Nalini Jaywant|Kuldip Kaur|Shyam|M...,,
4418,tt0244182,Sangram,Drama,After the death of his wife a policeman fails...,Ashok Kumar|Nalini Jaywant|Nawab|Sajjan|Tiwari...,nm0611527,nm0310170|nm0611527
4419,tt0269826,Sargam,Drama|Family,,Raj Kapoor|Rehana|Om Prakash|David Abraham|Rad...,,
4420,tt0243555,Sheesh Mahal,Drama,Thakur Jaspal Singh lives in the prestigious a...,Sohrab Modi|Naseem Banu|Pushpa Hans|Nigar Sult...,,


In [10]:
# Split string columns into lists
movies_hindi["genres"] = movies_hindi["genres"].str.split("|")
movies_hindi["cast"] = movies_hindi["cast"].str.split("|")
movies_hindi["directors"] = movies_hindi["directors"].str.split("|")
movies_hindi["writers"] = movies_hindi["writers"].str.split("|")

In [11]:
movies_hindi.isna().sum()

id                  0
original_title      0
genres              0
story             267
cast               10
directors         846
writers           846
dtype: int64

In [12]:
# Drop rows with missing values and reset index
movies_hindi = movies_hindi.dropna().reset_index()
movies_hindi.head()

Unnamed: 0,index,id,original_title,genres,story,cast,directors,writers
0,0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",Divided over five chapters the film chronicle...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",[nm2336554],[nm2336554]
1,1,tt9472208,Battalion 609,[War],The story revolves around a cricket match betw...,"[Vicky Ahuja, Shoaib Ibrahim, Shrikant Kamat, ...",[nm10342614],[nm2366632]
2,2,tt6986710,The Accidental Prime Minister,"[Biography, Drama]",Based on the memoir by Indian policy analyst S...,"[Anupam Kher, Akshaye Khanna, Aahana Kumra, At...",[nm9049092],"[nm9049093, nm10376880, nm9049092, nm4577451, ..."
3,3,tt8108208,Why Cheat India,"[Crime, Drama]",The movie focuses on existing malpractices in ...,"[Emraan Hashmi, Shreya Dhanwanthary, Snighdade...",[nm2265536],[nm8006178]
4,4,tt6028796,Evening Shadows,[Drama],While gay rights and marriage equality has bee...,"[Mona Ambegaonkar, Ananth Narayan Mahadevan, D...",[nm1482161],"[nm1481242, nm1482161]"


In [13]:
# Map writer and director IDs to their names
def map_writers(writer_list):
    return [movies_hindi_writers_data[movies_hindi_writers_data['crew_id'] == writer_id]['name'].values[0] for writer_id in writer_list]

def map_directors(directors_list):
    return [movies_hindi_crew_data[movies_hindi_crew_data['crew_id'] == director_id]['name'].values[0] for director_id in directors_list]

movies_hindi["director"] = movies_hindi["directors"].apply(map_directors)
movies_hindi["screenplay"] = movies_hindi["writers"].apply(map_writers)

movies_hindi.drop(["directors", "writers"], axis=1, inplace=True)
movies_hindi.head(10)

Unnamed: 0,index,id,original_title,genres,story,cast,director,screenplay
0,0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",Divided over five chapters the film chronicle...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",[Aditya Dhar],[Aditya Dhar]
1,1,tt9472208,Battalion 609,[War],The story revolves around a cricket match betw...,"[Vicky Ahuja, Shoaib Ibrahim, Shrikant Kamat, ...",[Brijesh Batuknath Tripathi],[Bunty Rathore]
2,2,tt6986710,The Accidental Prime Minister,"[Biography, Drama]",Based on the memoir by Indian policy analyst S...,"[Anupam Kher, Akshaye Khanna, Aahana Kumra, At...",[Vijay Ratnakar Gutte],"[Sanjaya Baru, Karl Dunne, Vijay Ratnakar Gutt..."
3,3,tt8108208,Why Cheat India,"[Crime, Drama]",The movie focuses on existing malpractices in ...,"[Emraan Hashmi, Shreya Dhanwanthary, Snighdade...",[Soumik Sen],[Mishkka Shekhawat]
4,4,tt6028796,Evening Shadows,[Drama],While gay rights and marriage equality has bee...,"[Mona Ambegaonkar, Ananth Narayan Mahadevan, D...",[Sridhar Rangayan],"[Saagar Gupta, Sridhar Rangayan]"
5,5,tt6078866,Soni,[Drama],Soni a young policewoman in Delhi and her su...,"[Geetika Vidya Ohlyan, Saloni Batra, Vikas Shu...",[Ivan Ayr],"[Ivan Ayr, Kislay Kislay]"
6,6,tt5013008,Fraud Saiyyan,"[Comedy, Drama]",Fraud Saiyyan is the story of a con artist in ...,"[Arshad Warsi, Saurabh Shukla, Flora Saini, Sa...",[Sourabh Shrivastava],"[Amal Donwaar, Prakash Jha, Sharad Tripathi]"
7,7,tt4971258,Bombairiya,"[Comedy, Crime, Drama]",It follows the story of Meghna who gets embro...,"[Radhika Apte, Akshay Oberoi, Siddhanth Kapoor...",[Pia Sukanya],"[Michael E. Ward, Pia Sukanya, Aarti S. Bagdi]"
8,8,tt6903440,Manikarnika: The Queen of Jhansi,"[Action, Biography, Drama]",Manikarnika born in Varanasi when Dixt a minis...,"[Kangana Ranaut, Rimi Sen, Atul Kulkarni, Naln...","[Radha Krishna Jagarlamudi, Kangana Ranaut]","[Vijayendra Prasad, Prasoon Joshi, Himanshu Tr..."
9,9,tt7777196,Thackeray,"[Biography, Drama]",Balasaheb Thackrey works as a cartoonist for a...,"[Nawazuddin Siddiqui, Amrita Rao, Abdul Quadir...",[Abhijit Panse],[Abhijit Panse]


In [14]:
#  Vectorize and transform the movie stories using TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

cv = CountVectorizer(stop_words="english", max_df=0.9, min_df=2)
word_counts = cv.fit_transform(movies_hindi["story"])
print(word_counts)

feautres = cv.get_feature_names_out()
print(feautres)

transformer = TfidfTransformer()
transformer.fit(word_counts)

  (0, 2967)	1
  (0, 1728)	1
  (0, 3699)	1
  (0, 1857)	1
  (0, 3401)	2
  (0, 9629)	1
  (0, 9450)	1
  (0, 2092)	1
  (0, 4753)	2
  (0, 6228)	1
  (0, 9662)	1
  (0, 6227)	1
  (0, 6944)	1
  (0, 6756)	1
  (0, 5335)	1
  (0, 9839)	1
  (0, 9425)	1
  (0, 6)	1
  (0, 10174)	1
  (0, 6811)	2
  (0, 1600)	1
  (0, 686)	2
  (0, 9254)	1
  (0, 3812)	1
  (0, 1603)	1
  :	:
  (3470, 2735)	1
  (3470, 7795)	5
  (3470, 9225)	1
  (3470, 5491)	3
  (3470, 10640)	1
  (3470, 10451)	1
  (3470, 9253)	1
  (3470, 9622)	1
  (3470, 637)	1
  (3470, 9552)	1
  (3470, 9616)	1
  (3470, 7391)	1
  (3470, 6551)	1
  (3471, 9198)	1
  (3471, 10667)	1
  (3471, 1802)	1
  (3471, 2523)	1
  (3471, 2324)	1
  (3471, 10724)	1
  (3471, 4090)	1
  (3471, 1423)	1
  (3471, 7295)	1
  (3471, 3551)	1
  (3471, 306)	1
  (3471, 8118)	1
['000' '10' '100' ... 'zoya' 'zulfi' 'zutshi']


In [15]:
count_vectors = cv.transform(movies_hindi["story"])
tfidf_vectors = transformer.transform(count_vectors).tocoo()

In [16]:
# Create a DataFrame to store TF-IDF information
tuples = zip(tfidf_vectors.row, tfidf_vectors.col, tfidf_vectors.data)
tfidf_vectors = sorted(tuples, key=lambda x: (x[0], -x[2]))
tf_idf_df = pd.DataFrame(tfidf_vectors, columns=["movie_idx", "word_idx", "tfidf"])
tf_idf_df.head(10)

Unnamed: 0,movie_idx,word_idx,tfidf
0,0,6811,0.324335
1,0,686,0.252016
2,0,3401,0.243859
3,0,2304,0.21547
4,0,1728,0.21547
5,0,9629,0.201804
6,0,1857,0.196927
7,0,4753,0.190975
8,0,10174,0.18923
9,0,6227,0.18923


In [17]:
# Group by movie_idx and aggregate word_idx and tfidf into lists
tf_idf_df = tf_idf_df.groupby("movie_idx")[["word_idx", "tfidf"]].agg(list).reset_index()

# Keep only the top 10 keywords for each movie
tf_idf_df["word_idx"] = tf_idf_df["word_idx"].apply(lambda x: x[:10])
tf_idf_df["tfidf"] = tf_idf_df["tfidf"].apply(lambda x: x[:10])

# Map word_idx to feature names
tf_idf_df["keywords"] = tf_idf_df["word_idx"].apply(lambda x: [feautres[word_id] for word_id in x])

In [18]:
# Merge TF-IDF information back to the movies_hindi DataFrame
movies_hindi.merge(tf_idf_df[["movie_idx", "keywords"]], how="left", left_index=True, right_index=True)

Unnamed: 0,index,id,original_title,genres,story,cast,director,screenplay,movie_idx,keywords
0,0,tt8291224,Uri: The Surgical Strike,"[Action, Drama, War]",Divided over five chapters the film chronicle...,"[Vicky Kaushal, Paresh Rawal, Mohit Raina, Yam...",[Aditya Dhar],[Aditya Dhar],0,"[operation, army, events, covert, chapters, su..."
1,1,tt9472208,Battalion 609,[War],The story revolves around a cricket match betw...,"[Vicky Ahuja, Shoaib Ibrahim, Shrikant Kamat, ...",[Brijesh Batuknath Tripathi],[Bunty Rathore],1,"[army, cricket, match, battalion, loc, indian,..."
2,2,tt6986710,The Accidental Prime Minister,"[Biography, Drama]",Based on the memoir by Indian policy analyst S...,"[Anupam Kher, Akshaye Khanna, Aahana Kumra, At...",[Vijay Ratnakar Gutte],"[Sanjaya Baru, Karl Dunne, Vijay Ratnakar Gutt...",2,"[congress, gandhi, tenure, manmohan, prime, mi..."
3,3,tt8108208,Why Cheat India,"[Crime, Drama]",The movie focuses on existing malpractices in ...,"[Emraan Hashmi, Shreya Dhanwanthary, Snighdade...",[Soumik Sen],[Mishkka Shekhawat],3,"[education, country, movie, concept, vulnerabi..."
4,4,tt6028796,Evening Shadows,[Drama],While gay rights and marriage equality has bee...,"[Mona Ambegaonkar, Ananth Narayan Mahadevan, D...",[Sridhar Rangayan],"[Saagar Gupta, Sridhar Rangayan]",4,"[gay, conservative, society, ravages, patriarc..."
...,...,...,...,...,...,...,...,...,...,...
3467,4412,tt0042233,Bawre Nain,"[Drama, Romance]",Chand lives in a small rural town in India. He...,"[Raj Kapoor, Geeta Bali, Vijayalaxmi, Jaswant,...",[Kidar Nath Sharma],"[Akhtar Mirza, Kidar Nath Sharma]",3467,"[chand, tara, town, waits, evicts, kindle, una..."
3468,4413,tt0128962,Beqasoor,[Drama],A young simple girl is caught in the cross-fi...,"[Madhubala, Ajit, Pramila, Lala Yaqoob, Mangla...",[K. Amarnath],"[Madhusudan, Ehsan Rizvi]",3468,"[husband, rooted, sibling, wrongly, implicate,..."
3469,4414,tt0331186,Babul,[Drama],This is the story of two widower fathers in Ma...,"[Nargis, Dilip Kumar, Amar, A. Shah Shikarpuri...",[S.U. Sunny],[Azm Bazidpuri],3469,"[ashok, bela, usha, average, madhopur, jamnada..."
3470,4415,tt0042380,Dastan,"[Drama, Family, Romance]",Indira (Suraiya) is an orphan brought up the s...,"[Suraiya, Raj Kapoor, Veena, Suresh, Al Nasir,...",[Abdul Rashid Kardar],"[S.N. Banerjee, Jagdish Kanwal, S.K. Prem]",3470,"[indira, ramesh, rani, kundan, raj, marry, doe..."


In [19]:
# Use Rake to extract keywords from movie stories
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91916\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91916\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
r = Rake()

def get_keywords(story):
    r.extract_keywords_from_text(story)
    keywordList           = []
    rankedList            = r.get_ranked_phrases_with_scores()
    for keyword in rankedList:
        keyword_updated       = keyword[1].split()
        keyword_updated_string    = " ".join(keyword_updated[:2])
        keywordList.append(keyword_updated_string)
        if(len(keywordList)>5):
            break
    return keywordList

movies_hindi["keywords"] = movies_hindi["story"].apply(get_keywords)

In [21]:
# Group by movie_id and keep only the first row for each movie
movies_hindi = movies_hindi.groupby("id").first().reset_index()
movies_hindi.drop(columns=["index"], inplace=True)
movies_hindi

Unnamed: 0,id,original_title,genres,story,cast,director,screenplay,keywords
0,tt0042184,Afsar,"[Comedy, Romance]",Synopsis Has been Written by Mr Rais Asghar M...,"[Dev Anand, Suraiya, Ruma Guha Thakurta, Kanha...",[Chetan Anand],"[Chetan Anand, Nikolay Gogol]","[director chetan, 1950 starring, really enjoya..."
1,tt0042207,Arzoo,"[Musical, Romance]",Badal a clown villager and Kammo are childhoo...,"[Kamini Kaushal, Dilip Kumar, Shashikala, Neel...",[Shaheed Latif],"[Ismat Chughtai, Asghar Jafri]","[take revenge, past affair, badal joins, villa..."
2,tt0042233,Bawre Nain,"[Drama, Romance]",Chand lives in a small rural town in India. He...,"[Raj Kapoor, Geeta Bali, Vijayalaxmi, Jaswant,...",[Kidar Nath Sharma],"[Akhtar Mirza, Kidar Nath Sharma]","[attractive woman, young woman, get married, s..."
3,tt0042380,Dastan,"[Drama, Family, Romance]",Indira (Suraiya) is an orphan brought up the s...,"[Suraiya, Raj Kapoor, Veena, Suresh, Al Nasir,...",[Abdul Rashid Kardar],"[S.N. Banerjee, Jagdish Kanwal, S.K. Prem]","[three suitors, indira really, veena )., sures..."
4,tt0043306,Awaara,"[Drama, Musical, Romance]",Raju lives as a derelict as a result of being ...,"[Prithviraj Kapoor, Nargis, Raj Kapoor, K.N. S...",[Raj Kapoor],"[Khwaja Ahmad Abbas, V.P. Sathe]","[must pass, house years, tries killing, pickpo..."
...,...,...,...,...,...,...,...,...
3333,tt9495690,Pagalpanti,"[Action, Comedy]",A tourist group from India sets out on a vacat...,"[Kriti Kharbanda, John Abraham, Ileana D'Cruz,...",[Anees Bazmee],"[Anees Bazmee, Shreya Dev Verma]","[tourist group, patriotic mission, india sets,..."
3334,tt9496212,22 Yards,[Sport],A dramatic portrayal of a victorious tale of a...,"[Barun Sobti, Rajit Kapur, Panchhi Bora, Karti...",[Mitali Ghoshal],[Samrat],"[fallen cricket, young cricketer, victorious t..."
3335,tt9558612,PM Narendra Modi,"[Biography, Drama]",Biography of India's Prime Minister Shri Naren...,"[Boman Irani, Vivek Oberoi, Barkha Bisht, Dars...",[Omung Kumar],"[Anirudh Chawla, Harsh Limbachiyaa, Vivek Ober...","[prime minister, india, biography]"
3336,tt9680136,Pati Patni Aur Woh,"[Comedy, Romance]",Chintu Tyagi is an ordinary middle class man ...,"[Kartik Aaryan, Bhumi Pednekar, Ananya Panday,...",[Mudassar Aziz],"[Mudassar Aziz, B.R. Chopra, Jasmeet K. Reen]","[ordinary middle, chintu tyagi, another woman,..."


In [22]:
# Save cleaned Hindi movies DataFrame to a CSV file
movies_hindi.to_csv("data/all_hindi_movies_cleaned.csv", sep=",", index=True, encoding="utf-8")

In [23]:
# Clean up column names and remove prefix from IMDb IDs
movies_hindi.rename(columns={"imdb_id": "id"})
movies_hindi["id"] = movies_hindi["id"].apply(lambda x: x[2:])

In [24]:
# Load the English movies datasets
movies = pd.read_csv("data/tmdb_5000_movies.csv")
credits = pd.read_csv("data/tmdb_5000_credits.csv")

In [25]:
pd.set_option("display.max_colwidth",80)

In [26]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora o...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""G...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is hea...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Film...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4...",en,Spectre,A cryptic message from Bond’s past sends him on a trail to uncover a siniste...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""i...",en,The Dark Knight Rises,"Following the death of District Attorney Harvey Dent, Batman assumes respons...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""name"": ""Warner Bros."", ""id"": 6...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 878, ...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"": 839, ""name"": ""mars""}, {""id"": ...",en,John Carter,"John Carter is a war-weary, former military captain who's inexplicably trans...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [27]:
movies.shape

(4803, 20)

In [28]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""credit_id"": ""5602a8a7c3a368553...","[{""credit_id"": ""52fe48009251416c750aca23"", ""department"": ""Editing"", ""gender""..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Sparrow"", ""credit_id"": ""52fe4232c...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""department"": ""Camera"", ""gender"":..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""credit_id"": ""52fe4d22c3a368484e1...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""department"": ""Sound"", ""gender"": ..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Batman"", ""credit_id"": ""52fe4781c...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""department"": ""Sound"", ""gender"": ..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""credit_id"": ""52fe479ac3a36847f8...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""department"": ""Writing"", ""gender""..."


In [29]:
credits.shape

(4803, 4)

In [30]:
#changing the genres column from json to string
movies["genres"] = movies["genres"].apply(json.loads)
print(movies["genres"])

0       [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, '...
1       [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 28, ...
2       [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 80, '...
3       [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 18, 'name...
4       [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 878, ...
                                             ...                                       
4798    [{'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': 53, 'name...
4799                   [{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]
4800    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...
4801                                                                                 []
4802                                                [{'id': 99, 'name': 'Documentary'}]
Name: genres, Length: 4803, dtyp

In [31]:
# Extract genre names from the genres column
for index,i in zip(movies.index,movies["genres"]):
    list1 = []
    for j in range(len(i)):
        list1.append(i[j]["name"])
    movies.loc[index,"genres"] = str(list1)
    

In [32]:
# Extract and clean keywords, production_companies, cast, and crew information
movies["keywords"] = movies["keywords"].apply(json.loads)

for index,i in zip(movies.index,movies["keywords"]):
    list2 = []
    
    for j in range(len(i)):
        list2.append(i[j]["name"])
        
    movies.loc[index,"keywords"] = str(list2)

movies["production_companies"] = movies["production_companies"].apply(json.loads)

for index,i in zip(movies.index,movies["production_companies"]):
    list3 = []
    
    for j in range(len(i)):
        list3.append(i[j]["name"])
        
    movies.loc[index,"production_companies"] = str(list3)
   
credits["cast"] = credits["cast"].apply(json.loads)

for index,i in zip(credits.index,credits["cast"]):
    list4 = []
    
    for j in range(len(i)):
        list4.append(i[j]["name"])
        
    credits.loc[index,"cast"] = str(list4)

    
    
credits["crew"] = credits["crew"].apply(json.loads)

for index,i in zip(credits.index,credits["crew"]):
    list4 = []
    director = "None"
    screenplay = "None"
    
    for j in range(len(i)):
        if(i[j]["job"] == "Director"):
            director = i[j]["name"]
        if(i[j]["job"] == "Screenplay"):
            screenplay = i[j]["name"]
        list4.append(i[j]["name"])
        
    credits.loc[index,"crew"] = str(list4)
    credits.loc[index,"director"] = director
    credits.loc[index,"screenplay"] = screenplay

In [33]:
credits[["cast","crew","director","screenplay"]]

Unnamed: 0,cast,crew,director,screenplay
0,"['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver', 'Stephen Lang', 'Mich...","['Stephen E. Rivkin', 'Rick Carter', 'Christopher Boyes', 'Christopher Boyes...",James Cameron,James Cameron
1,"['Johnny Depp', 'Orlando Bloom', 'Keira Knightley', 'Stellan Skarsgård', 'Ch...","['Dariusz Wolski', 'Gore Verbinski', 'Jerry Bruckheimer', 'Ted Elliott', 'Te...",Gore Verbinski,Terry Rossio
2,"['Daniel Craig', 'Christoph Waltz', 'Léa Seydoux', 'Ralph Fiennes', 'Monica ...","['Thomas Newman', 'Sam Mendes', 'Anna Pinnock', 'John Logan', 'John Logan', ...",Sam Mendes,Jez Butterworth
3,"['Christian Bale', 'Michael Caine', 'Gary Oldman', 'Anne Hathaway', 'Tom Har...","['Hans Zimmer', 'Charles Roven', 'Christopher Nolan', 'Christopher Nolan', '...",Christopher Nolan,Jonathan Nolan
4,"['Taylor Kitsch', 'Lynn Collins', 'Samantha Morton', 'Willem Dafoe', 'Thomas...","['Andrew Stanton', 'Andrew Stanton', 'John Lasseter', 'Colin Wilson', 'Gail ...",Andrew Stanton,Mark Andrews
...,...,...,...,...
4798,"['Carlos Gallardo', 'Jaime de Hoyos', 'Peter Marquardt', 'Reinol Martinez', ...","['Robert Rodriguez', 'Robert Rodriguez', 'Robert Rodriguez', 'Robert Rodrigu...",Robert Rodriguez,
4799,"['Edward Burns', 'Kerry Bishé', 'Marsha Dietlein', 'Caitlin Fitzgerald', 'Da...","['Edward Burns', 'Edward Burns', 'Edward Burns', 'William Rexer', 'William R...",Edward Burns,
4800,"['Eric Mabius', 'Kristin Booth', 'Crystal Lowe', 'Geoff Gustafson', 'Benjami...","['Carla Hetland', 'Harvey Kahn', 'Adam Sliwinski', 'Martha Williamson', 'Mar...",Scott Smith,
4801,"['Daniel Henney', 'Eliza Coupe', 'Bill Paxton', 'Alan Ruck', 'Zhu Shimao']","['Daniel Hsia', 'Daniel Hsia']",Daniel Hsia,


In [34]:
#Merging both csvs into one movies df with only required features
movies = movies.merge(credits,left_on = "id",right_on = "movie_id",how = "left")
movies = movies[['id', 'original_title', 'genres', 'cast', 'director','screenplay','keywords']]

In [35]:
movies

Unnamed: 0,id,original_title,genres,cast,director,screenplay,keywords
0,19995,Avatar,"['Action', 'Adventure', 'Fantasy', 'Science Fiction']","['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver', 'Stephen Lang', 'Mich...",James Cameron,James Cameron,"['culture clash', 'future', 'space war', 'space colony', 'society', 'space t..."
1,285,Pirates of the Caribbean: At World's End,"['Adventure', 'Fantasy', 'Action']","['Johnny Depp', 'Orlando Bloom', 'Keira Knightley', 'Stellan Skarsgård', 'Ch...",Gore Verbinski,Terry Rossio,"['ocean', 'drug abuse', 'exotic island', 'east india trading company', ""love..."
2,206647,Spectre,"['Action', 'Adventure', 'Crime']","['Daniel Craig', 'Christoph Waltz', 'Léa Seydoux', 'Ralph Fiennes', 'Monica ...",Sam Mendes,Jez Butterworth,"['spy', 'based on novel', 'secret agent', 'sequel', 'mi6', 'british secret s..."
3,49026,The Dark Knight Rises,"['Action', 'Crime', 'Drama', 'Thriller']","['Christian Bale', 'Michael Caine', 'Gary Oldman', 'Anne Hathaway', 'Tom Har...",Christopher Nolan,Jonathan Nolan,"['dc comics', 'crime fighter', 'terrorist', 'secret identity', 'burglar', 'h..."
4,49529,John Carter,"['Action', 'Adventure', 'Science Fiction']","['Taylor Kitsch', 'Lynn Collins', 'Samantha Morton', 'Willem Dafoe', 'Thomas...",Andrew Stanton,Mark Andrews,"['based on novel', 'mars', 'medallion', 'space travel', 'princess', 'alien',..."
...,...,...,...,...,...,...,...
4798,9367,El Mariachi,"['Action', 'Crime', 'Thriller']","['Carlos Gallardo', 'Jaime de Hoyos', 'Peter Marquardt', 'Reinol Martinez', ...",Robert Rodriguez,,"['united states–mexico barrier', 'legs', 'arms', 'paper knife', 'guitar case']"
4799,72766,Newlyweds,"['Comedy', 'Romance']","['Edward Burns', 'Kerry Bishé', 'Marsha Dietlein', 'Caitlin Fitzgerald', 'Da...",Edward Burns,,[]
4800,231617,"Signed, Sealed, Delivered","['Comedy', 'Drama', 'Romance', 'TV Movie']","['Eric Mabius', 'Kristin Booth', 'Crystal Lowe', 'Geoff Gustafson', 'Benjami...",Scott Smith,,"['date', 'love at first sight', 'narration', 'investigation', 'team', 'posta..."
4801,126186,Shanghai Calling,[],"['Daniel Henney', 'Eliza Coupe', 'Bill Paxton', 'Alan Ruck', 'Zhu Shimao']",Daniel Hsia,,[]


In [36]:
# Remove rows with missing values for director or screenplay
movies = movies[(movies["director"]!='None') & (movies["screenplay"]!='None')]
movies["director"] = movies["director"].apply(lambda x: [x])
movies["screenplay"] = movies["screenplay"].apply(lambda x: [x])
movies.reset_index(drop = True,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["director"] = movies["director"].apply(lambda x: [x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["screenplay"] = movies["screenplay"].apply(lambda x: [x])


In [37]:
#Cleaning up genres column
movies["genres"] = movies.loc[:, "genres"].str.strip("[]").str.replace(' ','').str.replace("'",'')
movies["genres"] = movies.loc[:, "genres"].str.split(",")

#Doing the same with cast but only taking the top 5 cast since others are not relevant
movies["cast"] = movies.loc[:, "cast"].str.strip("[]").str.replace(" ","").str.replace("'","")
movies["cast"] = movies.loc[:, "cast"].str.split(",")
movies["cast"] = movies.loc[:, "cast"].apply(lambda x: x[:5])

#Same with keywords
movies["keywords"] = movies.loc[:, "keywords"].str.strip("[]").str.replace(" ","").str.replace("'","").str.split(",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["genres"] = movies.loc[:, "genres"].str.strip("[]").str.replace(' ','').str.replace("'",'')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["genres"] = movies.loc[:, "genres"].str.split(",")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies["cast"] = movies.loc[:, "cast"].str.strip

In [38]:
# Combining hindi and english language dataframes into a single dataframe
movies = pd.concat([movies, movies_hindi[["id", "original_title", "genres", "cast", "director", "screenplay", "keywords"]]])
movies.reset_index(drop=True, inplace=True)

In [39]:
movies

Unnamed: 0,id,original_title,genres,cast,director,screenplay,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver, StephenLang, MichelleRodriguez]",[James Cameron],[James Cameron],"[cultureclash, future, spacewar, spacecolony, society, spacetravel, futurist..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley, StellanSkarsgård, ChowYun-fat]",[Gore Verbinski],[Terry Rossio],"[ocean, drugabuse, exoticisland, eastindiatradingcompany, ""loveofoneslife"", ..."
2,206647,Spectre,"[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux, RalphFiennes, MonicaBellucci]",[Sam Mendes],[Jez Butterworth],"[spy, basedonnovel, secretagent, sequel, mi6, britishsecretservice, unitedki..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[ChristianBale, MichaelCaine, GaryOldman, AnneHathaway, TomHardy]",[Christopher Nolan],[Jonathan Nolan],"[dccomics, crimefighter, terrorist, secretidentity, burglar, hostagedrama, t..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[TaylorKitsch, LynnCollins, SamanthaMorton, WillemDafoe, ThomasHadenChurch]",[Andrew Stanton],[Mark Andrews],"[basedonnovel, mars, medallion, spacetravel, princess, alien, steampunk, mar..."
...,...,...,...,...,...,...,...
6269,9495690,Pagalpanti,"[Action, Comedy]","[Kriti Kharbanda, John Abraham, Ileana D'Cruz, Anil Kapoor, Urvashi Rautela,...",[Anees Bazmee],"[Anees Bazmee, Shreya Dev Verma]","[tourist group, patriotic mission, india sets, end well, vacation, turn]"
6270,9496212,22 Yards,[Sport],"[Barun Sobti, Rajit Kapur, Panchhi Bora, Kartikey Tripathi, ]",[Mitali Ghoshal],[Samrat],"[fallen cricket, young cricketer, victorious tale, dramatic portrayal]"
6271,9558612,PM Narendra Modi,"[Biography, Drama]","[Boman Irani, Vivek Oberoi, Barkha Bisht, Darshan Kumaar, Zarina Wahab, Mano...",[Omung Kumar],"[Anirudh Chawla, Harsh Limbachiyaa, Vivek Oberoi, Sandip Ssingh]","[prime minister, india, biography]"
6272,9680136,Pati Patni Aur Woh,"[Comedy, Romance]","[Kartik Aaryan, Bhumi Pednekar, Ananya Panday, Aparshakti Khurana, ]",[Mudassar Aziz],"[Mudassar Aziz, B.R. Chopra, Jasmeet K. Reen]","[ordinary middle, chintu tyagi, another woman, wife, torn, finds]"


In [40]:
# #To get each column present in terms of binary info
# def to_binary(vals,all_vals):
#     binary_list = []
#     for i in all_vals:
#         if i in vals:
#             binary_list.append(1)
#         else:
#             binary_list.append(0)
            
#     return binary_list

# #To get all the unique genres
# genrelist = []
# for i in movies["genres"]:
#     for j in i:
#         if j not in genrelist:
#             genrelist.append(j)
            
# castlist = []
# for i in movies["cast"]:
#     for j in i:
#         if j not in castlist:
#             castlist.append(j)

# keywordslist = []
# for i in movies["keywords"]:
#     for j in i:
#         if j not in keywordslist:
#             keywordslist.append(j)

# directorsList = []
# for i in movies["director"]:
#     for j in i:
#         if j not in directorsList:
#             directorsList.append(j)

# screenplayList = []
# for i in movies["screenplay"]:
#     for j in i:
#         if j not in screenplayList:
#             screenplayList.append(j)

# movies["genres_bin"] = movies["genres"].apply(lambda x: to_binary(x,genrelist))
# movies["cast_bin"] = movies["cast"].apply(lambda x: to_binary(x,castlist))
# movies["director_bin"] = movies["director"].apply(lambda x: to_binary(x,directorsList))
# movies["screenplay_bin"] = movies["screenplay"].apply(lambda x: to_binary(x,screenplayList))
# movies["keywords_bin"] = movies["keywords"].apply(lambda x: to_binary(x,keywordslist))

# movies[["id", "original_title", "genres", "cast", "director", "screenplay", "keywords"]].to_csv("data/all_movies.csv", sep=',', index=False, encoding="utf-8")

# #Finding Similarity between two movie
# def Similarity(movieID1,movieID2):
#     a = movies.iloc[movieID1]
#     b = movies.iloc[movieID2]
    
#     genresA = a["genres_bin"]
#     genresB = b["genres_bin"]
    
#     genreDistance = spatial.distance.cosine(genresA,genresB)
    
#     castA = a["cast_bin"]
#     castB = b["cast_bin"]
    
#     castDistance = spatial.distance.cosine(castA,castB)
    
#     directorA = a["director_bin"]
#     directorB = b["director_bin"]
    
#     directorDistance = spatial.distance.cosine(directorA,directorB)
    
#     screenplayA = a["screenplay_bin"]
#     screenplayB = b["screenplay_bin"]
    
#     screenplayDistance = spatial.distance.cosine(screenplayA,screenplayB)
    
#     keywordsA = a["keywords_bin"]
#     keywordsB = b["keywords_bin"]
    
#     keywordsDistance = spatial.distance.cosine(keywordsA,keywordsB)
    
#     #The more the distance the less similar the two movies are
#     return genreDistance+castDistance+directorDistance+screenplayDistance+keywordsDistance

# #Gets K nearest Neighbours of baseMovie
# import operator
# def getNeighbours(k,basemovie):
#     basemovie = movies.iloc[movies[movies["original_title"].str.contains(basemovie)].original_title.str.len().sort_values().index[0]].to_frame().T
#     print("Selected Movie : "+basemovie["original_title"].values[0])
#     distances = []
#     for index,row in movies.iterrows():
#         if row["id"]!=basemovie["id"].values[0]:
#             distances.append((row["id"],Similarity(row["id"],basemovie["id"].values[0])))
#     distances.sort(key = operator.itemgetter(1))
    
#     neighbours = []
#     for i in range(k):
#         neighbours.append(distances[i][0])
    
#     return neighbours

# def printNeighbours(K,movie):
#     n = getNeighbours(K,movie)
#     print(K,"Similar Movies :-")
#     display(movies[movies.index.isin(n)][["original_title","director","screenplay"]])

# movie = input("Enter Movie to find Similar movies to: ")
# K = int(input("How many Simialr movies to be found : "))
# printNeighbours(K,movie)

In [76]:
movies = movies.groupby("original_title").first().reset_index()
movies["id"] = pd.Series(movies.index)

In [77]:
movies_binless = movies[["original_title", "genres", "cast", "director", "screenplay", "keywords"]]

In [78]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

In [79]:
X = movies_binless[["genres", "cast", "director", "screenplay", "keywords"]]


In [80]:
# Use MultiLabelBinarizer to create binary columns for features
genres_bin = pd.DataFrame(mlb.fit_transform(X["genres"]), columns=mlb.classes_, index=X.index)
cast_bin = pd.DataFrame(mlb.fit_transform(X["cast"]), columns=mlb.classes_, index=X.index)
directors_bin = pd.DataFrame(mlb.fit_transform(X["director"]), columns=mlb.classes_, index=X.index)
screenplay_bin = pd.DataFrame(mlb.fit_transform(X["screenplay"]), columns=mlb.classes_, index=X.index)
keywords_bin = pd.DataFrame(mlb.fit_transform(X["keywords"]), columns=mlb.classes_, index=X.index)

In [81]:
# Combine binary columns into a single dataframe
enc_data = pd.concat([genres_bin, cast_bin, directors_bin, screenplay_bin, keywords_bin], axis=1)

In [82]:
pca = PCA(n_components=600)
pca.fit(enc_data)
pca_data = pca.transform(enc_data)

In [83]:
pca_data = pd.DataFrame(pca_data)

In [84]:
pca_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
0,0.426064,0.246550,-1.064932,0.342178,-0.436595,-0.343878,-0.530514,0.034390,0.039250,-0.095853,...,0.010935,0.011855,-0.001166,-0.002503,-0.014752,-0.004380,0.008247,0.019577,-0.001362,-0.007800
1,-0.370034,0.710071,-0.414605,0.164016,-0.858390,-0.122976,0.239358,0.372545,-0.325607,-0.143536,...,0.165210,-0.050647,-0.105792,0.004852,-0.012734,0.049014,-0.001162,0.012825,0.152445,-0.000093
2,-0.764406,0.048483,-0.057528,-0.324501,0.180260,-0.707346,0.177105,0.267554,0.170064,-0.575870,...,0.027681,-0.059382,0.047834,0.123103,0.077849,0.062915,-0.065903,0.087231,0.117629,-0.046558
3,-0.039131,-1.050331,-0.319078,-0.031862,-0.061871,0.842358,-0.132423,0.141191,0.122928,0.027793,...,0.280723,-0.054822,-0.041447,0.023878,0.023287,0.017482,0.180779,-0.094460,0.063521,0.081478
4,-0.748609,-0.438105,0.366627,-0.282272,-0.324272,-0.157065,-0.415714,0.100244,-0.233835,0.028832,...,-0.156633,0.057680,-0.087387,-0.034824,-0.041072,-0.034323,0.146666,-0.049512,-0.077152,0.022322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6106,0.176087,0.455384,-0.388220,-0.579193,0.424840,0.455258,-0.282870,-0.382776,0.303249,0.139359,...,0.011535,-0.009521,0.013939,0.048523,0.002771,0.001513,-0.005873,-0.026883,0.017206,-0.010835
6107,1.013108,-0.105189,0.791359,-0.458990,0.329683,0.363851,-0.186733,0.083063,-0.272434,0.268533,...,-0.029999,-0.041131,-0.024478,0.069926,-0.012268,-0.010106,0.006390,0.007025,-0.019107,-0.072831
6108,0.290414,0.170602,-0.930620,0.046292,-0.272654,-0.071839,-0.421916,0.266245,0.134971,-0.012816,...,-0.002919,0.005212,0.001823,0.018231,-0.005365,0.002484,0.005285,-0.000191,0.000632,0.000539
6109,0.406676,0.515421,-0.290625,-0.719017,0.823924,0.431294,-0.177536,-0.012520,-0.306979,0.268349,...,0.021715,-0.007880,0.011313,0.015954,0.018875,-0.016444,-0.030089,-0.004132,-0.007858,0.015587


In [85]:
# Fit a KNN model using cosine similarity
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(n_neighbors=20, metric="cosine", n_jobs=-1)
model_knn.fit(pca_data)

In [86]:
# A function to clean the search string
def clean_string(input_string):
    # Remove characters that are not letters or numbers
    cleaned_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    # Remove whitespaces
    cleaned_string = cleaned_string.replace(' ', '')
    # Convert to lowercase
    cleaned_string = cleaned_string.lower()
    return cleaned_string

# Create a function to recommend top 10 movies
def recommend_movies(movie,nn_data,orig_data):
    basemovie = orig_data.iloc[orig_data[orig_data["original_title"].apply(lambda x: clean_string(x)).str.contains(clean_string(movie))].original_title.str.len().sort_values().index[0]].to_frame().T
    movie_index=nn_data[orig_data.original_title==basemovie.original_title.values[0]].index
    _, indices = model_knn.kneighbors(np.array(nn_data.iloc[movie_index]).reshape(1, -1),n_neighbors=15)

    out=orig_data.iloc[indices[0]].iloc[1:, :]
    return out

In [92]:
recommend_movies("spiderman", pca_data, movies)

Unnamed: 0,original_title,id,genres,cast,director,screenplay,keywords
4741,Spider-Man 3,4741,"[Fantasy, Action, Adventure]","[TobeyMaguire, KirstenDunst, JamesFranco, ThomasHadenChurch, TopherGrace]",[Sam Raimi],[Ivan Raimi],"[dualidentity, amnesia, sandstorm, ""loveofoneslife"", forgiveness, spider, wr..."
4740,Spider-Man 2,4740,"[Action, Adventure, Fantasy]","[TobeyMaguire, KirstenDunst, JamesFranco, AlfredMolina, RosemaryHarris]",[Sam Raimi],[Alvin Sargent],"[dualidentity, ""loveofoneslife"", pizzaboy, marvelcomic, sequel, superhero, d..."
1994,Ghost Rider: Spirit of Vengeance,1994,"[Action, Fantasy, Thriller]","[NicolasCage, CiaránHinds, ViolantePlacido, FergusRiordan, JohnnyWhitworth]",[Mark Neveldine],[Scott M. Gimple],"[monk, easterneurope, skeleton, biker, marvelcomic, superhero, motorcycle, d..."
2263,Highlander: Endgame,2263,"[Action, Fantasy, ScienceFiction]","[ChristopherLambert, BrucePayne, AdrianPaul, DonnieYen, LisaBarbuscia]",[Douglas Aarniokoski],[Joel Soisson],"[lossoflover, antiquary, lossofpowers, deathofafriend, immortality]"
2142,Hancock,2142,"[Fantasy, Action]","[WillSmith, CharlizeTheron, JasonBateman, JaeHead, EddieMarsan]",[Peter Berg],[Vince Gilligan],"[flying, alcohol, ""loveofoneslife"", forbiddenlove, lovers, affection, advert..."
687,Batman,687,"[Fantasy, Action]","[JackNicholson, MichaelKeaton, KimBasinger, MichaelGough, PatHingle]",[Tim Burton],[Sam Hamm],"[doublelife, dccomics, dualidentity, chemical, crimefighter, fictionalplace,..."
5604,Thor: The Dark World,5604,"[Action, Adventure, Fantasy]","[ChrisHemsworth, NataliePortman, TomHiddleston, AnthonyHopkins, ChristopherE...",[Alan Taylor],[Christopher Yost],"[marvelcomic, superhero, basedoncomicbook, hostiletakeover, norsemythology, ..."
5603,Thor,5603,"[Adventure, Fantasy, Action]","[ChrisHemsworth, NataliePortman, TomHiddleston, AnthonyHopkins, StellanSkars...",[Kenneth Branagh],[Zack Stentz],"[newmexico, banishment, shield, marvelcomic, hammer, superhero, basedoncomic..."
5566,The Wolverine,5566,"[Action, ScienceFiction, Adventure, Fantasy]","[HughJackman, HiroyukiSanada, FamkeJanssen, WillYunLee, TaoOkamoto]",[James Mangold],[Mark Bomback],"[japan, samurai, mutant, worldwari, marvelcomic, superhero, basedoncomicbook..."
5258,The Interview,5258,"[Action, Comedy]","[JamesFranco, SethRogen, LizzyCaplan, RandallPark, DianaBang]",[Evan Goldberg],[Dan Sterling],"[cia, ""coupdetat"", northkorea, assassinationattempt, evildictator]"


In [91]:
movies["id"] = pd.Series(movies.index)

In [93]:
# Save the KNN model using pickle
import pickle

with open('../model.pkl', 'wb') as f:
    pickle.dump(model_knn, f)

In [94]:
pca_data.to_csv("data/encoded_features.csv", sep=",", index=True, index_label="id")

In [95]:
movies["original_title"].to_csv("data/movie_titles.csv", index=True, index_label="id")