In [4]:
# ✅ DATA CLEANING AND PREPROCESSING FOR NETFLIX DATASET
import pandas as pd

# Step 1: Load the dataset from the specified path
df = pd.read_csv('/content/netflix_titles.csv')

# Step 2: Initial overview of the dataset
print("🔍 Original Dataset Info:")
print(df.info())

# ============================
# ✅ 1. Handle Missing Values
# ============================
print("\n🔍 Missing values per column:\n", df.isnull().sum())

# Drop rows where title is missing
df = df.dropna(subset=['title'])

# Fill other missing values with 'unknown'
df.fillna('unknown', inplace=True)

# ===============================
# ✅ 2. Remove Duplicate Rows
# ===============================
df = df.drop_duplicates()

# ===========================================
# ✅ 3. Standardize Text Values (like country)
# ===========================================
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    df[col] = df[col].str.lower().str.strip()

# ==================================================
# ✅ 4. Convert Date Format to Consistent (dd-mm-yyyy)
# ==================================================
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', dayfirst=True)

# =====================================================
# ✅ 5. Rename Columns (lowercase, underscores instead of spaces)
# =====================================================
df.columns = df.columns.str.lower().str.replace(' ', '_')

# =============================================
# ✅ 6. Check & Fix Data Types (e.g., year as int)
# =============================================
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').astype('Int64')

# Step 7: Save the cleaned dataset
cleaned_file = '/content/cleaned_netflix_titles.csv'
df.to_csv(cleaned_file, index=False)
print(f"\n✅ Cleaned dataset saved as: {cleaned_file}")

# Final Summary
print("\n🧾 Cleaned Dataset Info:")
print(df.info())
print("\n📌 Sample Data:")
print(df.head())


🔍 Original Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None

🔍 Missing values per column:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
du

  df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce', dayfirst=True)



✅ Cleaned dataset saved as: /content/cleaned_netflix_titles.csv

🧾 Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      8807 non-null   object        
 4   cast          8807 non-null   object        
 5   country       8807 non-null   object        
 6   date_added    8797 non-null   datetime64[ns]
 7   release_year  8807 non-null   Int64         
 8   rating        8807 non-null   object        
 9   duration      8807 non-null   object        
 10  listed_in     8807 non-null   object        
 11  description   8807 non-null   object        
dtypes: Int64(1), datetime64[ns](1), object(10)
memory usage: 834.4+ KB
None

📌 Sample Data:
  show_id 