# Importing Libraries

In [2]:
# Standard library
import os

# Third-party
# NOTE: `plt` is bound to matplotlib.pyplot (the plotting interface), not the
# top-level matplotlib package, which has no plot()/subplots() functions.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Loading the MovieLens Data

In [2]:
# The raw MovieLens CSVs are read from ../data/raw_data/ml-latest-small,
# a path relative to the notebook's working directory.
base_dir = os.path.join('..', 'data', 'raw_data', 'ml-latest-small')
links_path, movies_path = (
    os.path.join(base_dir, csv_name) for csv_name in ('links.csv', 'movies.csv')
)

# Read both tables into DataFrames: links first, then movies.
links_df = pd.read_csv(filepath_or_buffer=links_path)
movies_df = pd.read_csv(filepath_or_buffer=movies_path)

In [3]:
# Preview the first five rows of the links table
links_df.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
# Preview the first five rows of the movies table
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach the imdbId/tmdbId columns from links_df to each movie row.
# how='left' keeps every row of movies_df even when links_df has no match.
movielens_df = pd.merge(left=movies_df, right=links_df, how='left', on='movieId')

# Report the merged table's dimensions, then show its first five rows
print("Shape of final merged dataset:", movielens_df.shape)
movielens_df.head(n=5)


Shape of final merged dataset: (9742, 5)


Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [6]:
# Missing tmdbId values are replaced with the sentinel -1 so that the column
# can then be cast to a plain integer dtype.
tmdb_filled = movielens_df['tmdbId'].fillna(-1)
movielens_df['tmdbId'] = tmdb_filled.astype(int)


In [7]:
# Write the merged MovieLens table to ../data/processed_data,
# creating the directory if it does not exist yet.
processed_dir = os.path.join('..', 'data', 'processed_data')
processed_path = os.path.join(processed_dir, 'movielens_content_processed.csv')

os.makedirs(processed_dir, exist_ok=True)
# index=False: the row index is not written as a column
movielens_df.to_csv(path_or_buf=processed_path, index=False)

# Movie Content Data

In [3]:
# Read the movie content table from ../data/raw_data/movies_data_final.csv.
# NOTE: this rebinds `movies_df`, replacing the MovieLens movies table that
# was loaded under the same name earlier in the notebook.
base_dir = os.path.join('..', 'data', 'raw_data')
path = os.path.join(base_dir, 'movies_data_final.csv')
movies_df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first five rows of the movie content table
movies_df.head(n=5)

Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages,plot_summary,plot_synopsis,genres,cast,directors,averageRating,numVotes,release_year,release_month,release_day
0,Inception,2010-07-15,825532764,148,tt1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili","Dom Cobb is a skilled thief, the absolute best...","Dominick ""Dom"" Cobb (Leonardo DiCaprio) and bu...","Action, Adventure, Sci-Fi, Thriller","Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",Christopher Nolan,8.8,2681459,2010,7,15
1,Interstellar,2014-11-05,701729206,169,tt0816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English,In the near future around the American Midwest...,"In the future, crop blight has caused civiliza...","Adventure, Drama, Sci-Fi","Ellen Burstyn, Matthew McConaughey, Mackenzie ...",Christopher Nolan,8.7,2342692,2014,11,5
2,The Dark Knight,2008-07-16,1004558444,152,tt0468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin",Set within a year after the events of Batman B...,A gang of 6 criminals rob a Gotham City mob ba...,"Action, Crime, Drama, Thriller","Christian Bale, Heath Ledger, Aaron Eckhart, M...",Christopher Nolan,9.0,3018672,2008,7,16
3,Avatar,2009-12-15,2923706026,162,tt0499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish","When his brother is killed in a robbery, parap...","In 2154, humans have depleted Earth's natural ...","Action, Adventure, Fantasy, Sci-Fi","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron,7.9,1430332,2009,12,15
4,The Avengers,2012-04-25,1518815515,143,tt0848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian","Loki, the adopted brother of Thor, teams-up wi...",The Asgardian Loki (Tom Hiddleston) encounters...,"Action, Sci-Fi","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon,8.0,1507612,2012,4,25


In [5]:
# Show the dtype pandas inferred for each column of movies_df
movies_df.dtypes

title                    object
release_date             object
revenue                   int64
runtime                   int64
imdb_id                  object
original_language        object
overview                 object
production_companies     object
spoken_languages         object
plot_summary             object
plot_synopsis            object
genres                   object
cast                     object
directors                object
averageRating           float64
numVotes                  int64
release_year              int64
release_month             int64
release_day               int64
dtype: object

In [6]:
# Strip the leading "tt" prefix from imdb_id. The pattern is anchored with ^ so
# only a prefix is removed (never a "tt" elsewhere in the value), and regex=True
# is passed explicitly so the result does not depend on the pandas version's
# default for `regex`. The column stays string-typed, so leading zeros are kept.
movies_df['imdb_id'] = movies_df['imdb_id'].str.replace(r'^tt', '', regex=True)

# Parse release_date into datetimes. errors='coerce' turns any unparseable value
# into NaT instead of raising; NaT rows compare False in the date filters below.
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')

In [7]:
# (rows, columns) of movies_df at this point in the notebook
movies_df.shape

(18506, 19)

In [8]:
# Flag every row whose imdb_id occurs more than once; keep=False marks all
# copies of a repeated id, not just the later ones.
dup_mask = movies_df.duplicated(subset='imdb_id', keep=False)
duplicate_imdb_ids = movies_df.loc[dup_mask]
print(f"Number of duplicate imdb_id entries: {len(duplicate_imdb_ids)}")

Number of duplicate imdb_id entries: 0


In [9]:
# Select the rows whose release_date is strictly earlier than 1 January 2000
is_pre_2000 = movies_df['release_date'] < '2000-01-01'
movies_before_2000 = movies_df.loc[is_pre_2000]

# Report how many rows were selected
print(f"Number of movies released before 2000: {len(movies_before_2000)}")

Number of movies released before 2000: 6161


In [10]:
# Keep only movies released on or after 1 January 2000.
print("Total rows before filtering by release date:", len(movies_df))
# `>=` rather than `>`: the "before 2000" count uses `< '2000-01-01'`, so a
# strict `>` here would silently drop any movie dated exactly 2000-01-01
# (it would be in neither group). Rows with a NaT release_date are still
# excluded, because comparisons against NaT evaluate to False.
movies_df = movies_df[movies_df['release_date'] >= '2000-01-01']
print("Total rows after filtering by release date:", len(movies_df))

Total rows before filtering by release date: 18506
Total rows after filtering by release date: 12344


In [11]:
# Count the missing values in each column
movies_df.isna().sum(axis=0)

title                      0
release_date               0
revenue                    0
runtime                    0
imdb_id                    0
original_language          0
overview                   0
production_companies     362
spoken_languages          64
plot_summary               0
plot_synopsis           1149
genres                     0
cast                      20
directors                 47
averageRating              0
numVotes                   0
release_year               0
release_month              0
release_day                0
dtype: int64

In [12]:
# Discard every row that has a missing value in at least one column
movies_df = movies_df.dropna(axis=0, how='any')

In [13]:
# Re-count missing values per column to confirm none remain
movies_df.isna().sum(axis=0)

title                   0
release_date            0
revenue                 0
runtime                 0
imdb_id                 0
original_language       0
overview                0
production_companies    0
spoken_languages        0
plot_summary            0
plot_synopsis           0
genres                  0
cast                    0
directors               0
averageRating           0
numVotes                0
release_year            0
release_month           0
release_day             0
dtype: int64

In [14]:
# (rows, columns) of movies_df after the rows with missing values were dropped
movies_df.shape

(10922, 19)

In [15]:
# Write the cleaned movie content table to ../data/processed_data,
# creating the directory if it does not exist yet.
processed_dir = os.path.join('..', 'data', 'processed_data')
processed_path = os.path.join(processed_dir, 'tmdb_content_processed.csv')

os.makedirs(processed_dir, exist_ok=True)
# index=False: the row index is not written as a column
movies_df.to_csv(path_or_buf=processed_path, index=False)