# 📓Data Cleaning & Enrichment

## ✅**Import Libraries**

In [None]:
# Import libraries
import pandas as pd
from fuzzywuzzy import process


## 📌**Load your datasets**

In [None]:
# Relative locations of the two raw input files
history_csv = '../data/raw/NetflixViewingHistory.csv'
catalog_csv = '../data/raw/netflix_titles.csv'

# Personal Netflix viewing history export
my_history = pd.read_csv(history_csv)

# Netflix titles catalogue (Kaggle dataset)
netflix_df = pd.read_csv(catalog_csv)

# Preview the first rows of both frames
my_history.head(), netflix_df.head()


## 🔎**Inspect**

### 📝*Inspect and simplify your titles*

In [None]:
# Simplify your personal viewing history titles: keep only the text before the
# first ':' (presumably the show name in "Show: Season: Episode" entries --
# confirm against your export), trimmed and lower-cased.
# The vectorized .str accessor propagates a missing Title as NaN, whereas a
# per-row `x.split(...)` raises AttributeError on the first non-string value.
my_history['Simplified Title'] = (
    my_history['Title']
    .str.split(':', n=1)
    .str[0]
    .str.strip()
    .str.lower()
)

# Check the simplified data
my_history.head()


### ⚙️*Select and normalize Kaggle dataset columns*

In [None]:
# Keep only the Kaggle columns used for enrichment and lower-case the titles so
# they are comparable with the lower-cased 'Simplified Title' values.
# `.assign` builds a new frame, so no value is written into a slice of the
# original DataFrame (which would trigger pandas' SettingWithCopyWarning), and
# re-running this cell yields the same result.
netflix_df = (
    netflix_df[['title', 'listed_in', 'duration', 'rating', 'type', 'release_year']]
    .assign(title=lambda catalog: catalog['title'].str.lower())
)

netflix_df.head()


## 🗂️**Match and merge**

### 🔗*Match titles using Fuzzy Matching*

In [None]:
# Function to match titles
def match_titles(my_title, netflix_titles, threshold=85):
    """Return the best fuzzy match for ``my_title`` among ``netflix_titles``.

    Parameters
    ----------
    my_title : str
        Title to look up. Non-string or blank values (e.g. NaN) yield ``None``.
    netflix_titles : sequence of str
        Candidate titles to search.
    threshold : int or float, default 85
        The best candidate is accepted only if its score is strictly greater
        than this value.

    Returns
    -------
    str or None
        The best-scoring candidate, or ``None`` when nothing scores above
        ``threshold`` or there is nothing to compare.
    """
    # A blank or missing query cannot produce a meaningful match.
    if not isinstance(my_title, str) or not my_title.strip():
        return None

    best = process.extractOne(my_title, netflix_titles)
    # extractOne yields None when there are no candidates; unpacking it
    # directly would raise TypeError.
    if best is None:
        return None

    # Index instead of unpacking: dict-like choices (such as a pandas Series)
    # produce (match, score, key) rather than (match, score).
    match, score = best[0], best[1]
    return match if score > threshold else None

# Build the candidate list once instead of on every row; missing and repeated
# catalogue titles add nothing to the search.
netflix_titles_list = netflix_df['title'].dropna().unique().tolist()

# Fuzzy matching is the expensive step, and the same simplified title can occur
# on many history rows, so match each distinct title once and reuse the result.
title_lookup = {
    title: match_titles(title, netflix_titles_list)
    for title in my_history['Simplified Title'].unique()
}

# Create matched title column (None where no candidate was accepted)
my_history['Matched_Title'] = my_history['Simplified Title'].map(title_lookup.get)

# Inspect matched titles
my_history.head(10)


### 📊*Merge your enriched data*

In [None]:
# Use one catalogue row per title. Repeated titles on the right side would
# duplicate viewing-history rows in the merge, and pandas matches null keys
# against each other, so rows without a title are dropped as well.
netflix_lookup = (
    netflix_df
    .dropna(subset=['title'])
    .drop_duplicates(subset='title', keep='first')
)

# Merge datasets on matched titles; `validate` makes pandas raise if the
# catalogue side is ever not unique per title.
merged_df = pd.merge(
    my_history,
    netflix_lookup,
    left_on='Matched_Title',
    right_on='title',
    how='left',
    validate='many_to_one',
)

# Keep only relevant columns
merged_df = merged_df[['Date Watched', 'Title', 'Matched_Title', 'type', 'listed_in', 'duration', 'rating', 'release_year']]

# A left merge against unique keys must keep exactly one row per history entry.
assert len(merged_df) == len(my_history), "merge changed the number of history rows"

# Check merged data
merged_df.head()


## 🚧**Handle Missing Data**

In [None]:
# Flag history rows for which no catalogue title was matched
unmatched_mask = merged_df['Matched_Title'].isna()

# Report how many records are unmatched
missing_data = merged_df.loc[unmatched_mask]
print(f"Missing matched data: {len(missing_data)} records")

# Keep only the rows that have a matched title
merged_df_clean = merged_df.loc[~unmatched_mask]


## 📁**Export your Cleaned & Enriched Dataset**

In [None]:
# Save enriched dataset
# Recomputed here so the export does not rely on an earlier cell's variable.
merged_df_clean = merged_df.dropna(subset=['Matched_Title'])

# Write the cleaned frame, not `merged_df`, so rows without a matched title
# are excluded from the exported file.
merged_df_clean.to_csv('../data/processed/enriched_netflix_history.csv', index=False)
