In [None]:
# Load the Netflix dataset (requires the Google Colab runtime)
import pandas as pd
from google.colab import files

# Prompt for a file upload; the read below expects the file to be available
# as "netflix_titles.csv" in the working directory.
uploaded = files.upload()

netflix_df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
netflix_df.head(5)

In [None]:
# Load the IMDb dataset: prompt for an upload, then read the CSV by its fixed
# name from the working directory.
# Relies on `files` and `pd` having been imported by an earlier cell.
uploaded = files.upload()  # rebinds `uploaded`; the value is not read in this cell
imdb_df = pd.read_csv(filepath_or_buffer="imdb_top_1000.csv")
imdb_df.head(5)

In [None]:
# Rename Netflix's `release_year` to `Released_Year` (presumably to line up
# with the IMDb column of that name, which the merge step treats as its
# duplicate). A frame without `release_year` passes through unchanged.
netflix_df = netflix_df.rename(columns={'release_year': 'Released_Year'})

In [None]:
# Build the join key: lower-cased, whitespace-trimmed title stored in a
# `title_clean` column on both frames (Netflix `title`, IMDb `Series_Title`).
netflix_df = netflix_df.assign(
    title_clean=netflix_df['title'].str.lower().str.strip()
)
imdb_df = imdb_df.assign(
    title_clean=imdb_df['Series_Title'].str.lower().str.strip()
)

In [None]:
# Merge datasets on cleaned title
# Inner join: only titles present in both the Netflix and IMDb frames survive.
# The Netflix year column is dropped *before* merging, under either of its
# names, so the IMDb `Released_Year` arrives without an `_x`/`_y` suffix.
# This removes the KeyError the suffix-based cleanup raised whenever the
# Netflix column had not been renamed first; the result is unchanged when it
# had been.
# NOTE(review): rows are matched on title text alone. Different works that
# share a title would be paired, and duplicate titles on either side multiply
# rows — confirm this is acceptable for the analysis.
merged_df = pd.merge(
    netflix_df.drop(columns=['Released_Year', 'release_year'], errors='ignore'),
    imdb_df,
    on="title_clean",
    how="inner",
)

# Keep the IMDb release year, exposed under the name `release_year`
merged_df = merged_df.rename(columns={'Released_Year': 'release_year'})


In [None]:
# If the merge produced suffixed Genre columns, discard the left-hand
# (`_x`) one and expose the right-hand (`_y`) one as plain `Genre`.
# Both steps are no-ops when the suffixed columns do not exist.
merged_df = (
    merged_df
    .drop(columns=['Genre_x'], errors='ignore')
    .rename(columns={'Genre_y': 'Genre'})
)

In [None]:
# Reshape to one row per (title, genre): split the comma-separated Genre
# string into a list, explode the lists into rows, then normalise each label.
# Splitting on ',' alone (whitespace is stripped afterwards) also handles
# entries that have no space after the comma.
# ignore_index=True gives the exploded frame a fresh 0..n-1 index instead of
# repeating the source row's label on every genre row.
merged_df = (
    merged_df
    .assign(Genre=merged_df['Genre'].str.split(','))
    .explode('Genre', ignore_index=True)
)
merged_df['Genre'] = merged_df['Genre'].str.strip().str.lower()

In [None]:
# Lift the column display limit (a session-wide pandas option) so no column
# is elided, then show the first five rows of the merged frame.
pd.options.display.max_columns = None
merged_df.head(5)

In [None]:
# Check merge size
# The frame has already been exploded to one row per genre, so the raw row
# count overstates how many titles matched; report the distinct-title count
# alongside it.
print("Merged rows:", merged_df.shape[0])
print("Distinct matched titles:", merged_df['title_clean'].nunique())

In [None]:
# Create binary 'hit' label: 1 when the IMDb rating is ≥ 8.0, otherwise 0
# (a missing rating compares False and is therefore labelled 0).
merged_df['hit'] = merged_df['IMDB_Rating'].ge(8.0).astype(int)

In [None]:
# Count missing values per column
merged_df.isna().sum()

In [None]:
# Write the final frame (genre-exploded, including the 'hit' label) to a CSV
# in the working directory, without the index column.
merged_df.to_csv(path_or_buf="netflix_merged.csv", index=False)