# Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import os

# Preparing data for vector embeddings

In [9]:
# Root of the project's data folder, relative to the notebook's working directory.
base_dir = os.path.join('..', 'data')
# Read the processed TMDB content table. The path is assembled with
# os.path.join (rather than string concatenation with a hard-coded '/') so it
# is built the same way as the output path used when saving results.
tmdb_df = pd.read_csv(os.path.join(base_dir, 'processed_data', 'tmdb_content_processed.csv'))

In [10]:
# Preview the first five rows of the loaded table
tmdb_df.head(n=5)

Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages,plot_summary,plot_synopsis,genres,cast,directors,averageRating,numVotes,release_year,release_month,release_day
0,Inception,2010-07-15,825532764,148,1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","Dom Cobb is a skilled thief, the absolute best...","Dominick ""Dom"" Cobb (Leonardo DiCaprio) and bu...","Action, Adventure, Sci-Fi, Thriller","Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",Christopher Nolan,8.8,2681459,2010,7,15
1,Interstellar,2014-11-05,701729206,169,816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English,In the near future around the American Midwest...,"In the future, crop blight has caused civiliza...","Adventure, Drama, Sci-Fi","Ellen Burstyn, Matthew McConaughey, Mackenzie ...",Christopher Nolan,8.7,2342692,2014,11,5
2,The Dark Knight,2008-07-16,1004558444,152,468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin",Set within a year after the events of Batman B...,A gang of 6 criminals rob a Gotham City mob ba...,"Action, Crime, Drama, Thriller","Christian Bale, Heath Ledger, Aaron Eckhart, M...",Christopher Nolan,9.0,3018672,2008,7,16
3,Avatar,2009-12-15,2923706026,162,499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","When his brother is killed in a robbery, parap...","In 2154, humans have depleted Earth's natural ...","Action, Adventure, Fantasy, Sci-Fi","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron,7.9,1430332,2009,12,15
4,The Avengers,2012-04-25,1518815515,143,848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian","Loki, the adopted brother of Thor, teams-up wi...",The Asgardian Loki (Tom Hiddleston) encounters...,"Action, Sci-Fi","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon,8.0,1507612,2012,4,25


# Data Analysis

In [11]:
# Tally the movies per original language; value_counts() returns the tallies
# ordered from most to least frequent.
language_counts = tmdb_df['original_language'].value_counts()

# The ten most frequent languages are the first ten entries of that ordering
top_10_languages = language_counts.iloc[:10]

# Show the heading and the counts
print("\nTop 10 languages by number of movies:", top_10_languages, sep="\n")



Top 10 languages by number of movies:
original_language
English     7904
Hindi        672
French       403
Japanese     234
Spanish      228
Tamil        163
Korean       162
Chinese      144
German       131
Telugu       102
Name: count, dtype: int64


# Creating the dataset for content-based recommendation

In [14]:
# Work on an independent copy holding only the identifier, title, date and the
# text fields that are combined into the word soup further down.
content_columns = ['imdb_id', 'title', 'release_date', 'plot_synopsis', 'genres', 'cast', 'directors']
df_temp = tmdb_df[content_columns].copy()

In [15]:
# Fuse every multi-word entry of the comma-separated list columns into a single
# token, e.g. "Christopher Nolan" -> "ChristopherNolan".
#
# All whitespace is removed EXCEPT the whitespace that directly follows a comma
# (the list separator). The previous pattern only removed whitespace that sat
# between two word characters, so names containing punctuation were only
# partially fused: "Samuel L. Jackson" became "SamuelL. Jackson" and
# "J.K. Simmons" was left untouched, and both later fell apart into two tokens.
# Missing values pass through the .str accessor unchanged (NaN stays NaN).
list_columns = ['genres', 'cast', 'directors']
df_temp[list_columns] = df_temp[list_columns].apply(
    lambda col: col.str.replace(r'(?<!,)\s+', '', regex=True)
)

In [16]:
# Combine plot synopsis, genres, cast (first 10 entries only) and directors
# into a single text column called 'wordsoup'.
#
# Missing values are treated as empty strings: previously a missing cast raised
# AttributeError inside the lambda (float has no .split), and a missing value
# in any other field turned the whole row's wordsoup into NaN.
soup_parts = df_temp[['plot_synopsis', 'genres', 'cast', 'directors']].fillna('')

# Keep at most the first 10 comma-separated cast entries (vectorised; produces
# the same text as splitting on ',' and re-joining with ', ' row by row).
top_cast = soup_parts['cast'].str.split(',').str[:10].str.join(', ')

df_temp['wordsoup'] = (
    soup_parts['plot_synopsis'] + ' '
    + soup_parts['genres'] + ' '
    + top_cast + ' '
    + soup_parts['directors']
)

In [17]:
# Preview the first five rows, including the new 'wordsoup' column
df_temp.head(n=5)

Unnamed: 0,imdb_id,title,release_date,plot_synopsis,genres,cast,directors,wordsoup
0,1375666,Inception,2010-07-15,"Dominick ""Dom"" Cobb (Leonardo DiCaprio) and bu...","Action, Adventure, Sci-Fi, Thriller","LeonardoDiCaprio, JosephGordon-Levitt, ElliotP...",ChristopherNolan,"Dominick ""Dom"" Cobb (Leonardo DiCaprio) and bu..."
1,816692,Interstellar,2014-11-05,"In the future, crop blight has caused civiliza...","Adventure, Drama, Sci-Fi","EllenBurstyn, MatthewMcConaughey, MackenzieFoy...",ChristopherNolan,"In the future, crop blight has caused civiliza..."
2,468569,The Dark Knight,2008-07-16,A gang of 6 criminals rob a Gotham City mob ba...,"Action, Crime, Drama, Thriller","ChristianBale, HeathLedger, AaronEckhart, Mich...",ChristopherNolan,A gang of 6 criminals rob a Gotham City mob ba...
3,499549,Avatar,2009-12-15,"In 2154, humans have depleted Earth's natural ...","Action, Adventure, Fantasy, Sci-Fi","SamWorthington, ZoeSaldaña, SigourneyWeaver, S...",JamesCameron,"In 2154, humans have depleted Earth's natural ..."
4,848228,The Avengers,2012-04-25,The Asgardian Loki (Tom Hiddleston) encounters...,"Action, Sci-Fi","RobertDowneyJr., ChrisEvans, MarkRuffalo, Chri...",JossWhedon,The Asgardian Loki (Tom Hiddleston) encounters...


In [18]:
# Normalise the combined text: lowercase it, trim leading/trailing whitespace,
# then squeeze every run of whitespace down to a single space.
df_temp['wordsoup'] = (
    df_temp['wordsoup']
    .str.lower()
    .str.strip()
    .replace(to_replace=r'\s+', value=' ', regex=True)
)

# Preview the normalised column
df_temp[['wordsoup']].head()

Unnamed: 0,wordsoup
0,"dominick ""dom"" cobb (leonardo dicaprio) and bu..."
1,"in the future, crop blight has caused civiliza..."
2,a gang of 6 criminals rob a gotham city mob ba...
3,"in 2154, humans have depleted earth's natural ..."
4,the asgardian loki (tom hiddleston) encounters...


In [19]:
# Keep only the identifier, the title and the combined text in an independent
# copy, then preview it.
final_df = df_temp.loc[:, ['imdb_id', 'title', 'wordsoup']].copy()
final_df.head(n=5)

Unnamed: 0,imdb_id,title,wordsoup
0,1375666,Inception,"dominick ""dom"" cobb (leonardo dicaprio) and bu..."
1,816692,Interstellar,"in the future, crop blight has caused civiliza..."
2,468569,The Dark Knight,a gang of 6 criminals rob a gotham city mob ba...
3,499549,Avatar,"in 2154, humans have depleted earth's natural ..."
4,848228,The Avengers,the asgardian loki (tom hiddleston) encounters...


In [20]:
# Remove commas, punctuation and other special characters from 'wordsoup'.
#
# The character class is Unicode-aware: \w keeps letters and digits of any
# script, so accented names survive ("saldaña" stays "saldaña"). The previous
# ASCII-only class [^A-Za-z0-9\s] silently deleted such letters and produced
# corrupted tokens like "saldaa". Underscores are part of \w, so they are
# removed explicitly to keep stripping them as before.
final_df['wordsoup'] = final_df['wordsoup'].str.replace(r'[^\w\s]|_', '', regex=True)

# Display the updated DataFrame
final_df[['wordsoup']].head()

Unnamed: 0,wordsoup
0,dominick dom cobb leonardo dicaprio and busine...
1,in the future crop blight has caused civilizat...
2,a gang of 6 criminals rob a gotham city mob ba...
3,in 2154 humans have depleted earths natural re...
4,the asgardian loki tom hiddleston encounters t...


In [21]:
# Print the complete word soup of the row labelled 0 as a sanity check
print(final_df.loc[0, 'wordsoup'])

dominick dom cobb leonardo dicaprio and business partner arthur joseph gordonlevitt are extractors people who perform corporate espionage using an experimental military technology to infiltrate the subconscious of their targets and extract information while experiencing shared dreaming their latest target is japanese businessman saito ken watanabe the extraction from saito fails when sabotaged by a memory of cobbs deceased wife mal marion cotillard after cobbs and arthurs associate sells them out saito reveals that he was actually auditioning the team to perform the difficult act of inception planting an idea in a persons subconsciousin order to break up the energy conglomerate of ailing competitor maurice fischer pete postlethwaite saito wants cobb to plant the idea of dissolving the company into the mind of fischers heir son robert fischer cillian murphy should cobb succeed saito tells cobb he will use his influence to clear cobb of a murder charge which will allow cobb to return hom

In [22]:
# Write the final DataFrame to the processed-data folder as CSV, leaving the
# index out of the file.
processed_data_dir = os.path.join(base_dir, 'processed_data')
output_csv_path = os.path.join(processed_data_dir, 'final_processed_content_data.csv')
final_df.to_csv(output_csv_path, index=False)