# Netflix / Media Dataset Exploratory Data Analysis (EDA)

This notebook performs an Exploratory Data Analysis (EDA) on the Netflix dataset to understand content trends, distribution, and characteristics.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide configuration.
# Every warning category is silenced to keep cell output uncluttered; note
# that this also hides deprecation warnings raised by pandas and seaborn.
warnings.simplefilter('ignore')

# Plot defaults shared by every figure below: white grid theme, 12x6 inches.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data
Loading the dataset from `netflix_titles.csv`.

In [None]:
# Load the raw dataset into `df`, the frame every later cell works on.
filepath = 'netflix_titles.csv'
try:
    df = pd.read_csv(filepath)
except FileNotFoundError:
    # Report the missing path, then re-raise: without `df` every following
    # cell would fail with an unrelated NameError, so stop here instead.
    print(f"File not found: {filepath}")
    raise
else:
    # Only reached when the read succeeded.
    print("Dataset loaded successfully.")
    print(f"Shape: {df.shape}")
    display(df.head())

## 2. Data Cleaning
Checking for missing values and handling them.
- Fill missing 'director', 'cast', 'country' with 'Unknown'.
- Drop rows with missing 'date_added' or 'rating'.
- Convert 'date_added' to datetime objects.
- Extract 'year_added' and 'month_added' from 'date_added'.

In [None]:
# Clean `df`: fill text gaps, parse dates, drop unusable rows, derive
# year/month columns. Written so the cell can be re-run safely.
print("Missing values before cleaning:")
print(df.isnull().sum())

# Fill missing values.
# The filled columns are assigned back to the frame rather than calling
# fillna(inplace=True) on df['col']: that chained in-place form is deprecated
# in pandas 2.x and does not update `df` when Copy-on-Write is enabled.
text_cols = ['director', 'cast', 'country']
df[text_cols] = df[text_cols].fillna('Unknown')

# Convert date_added.
# Skipped when the column is already datetime (i.e. the cell was run before),
# because the .str accessor raises on non-string data.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(
        df['date_added'].str.strip(), format='mixed', errors='coerce'
    )

# Drop rows with missing essential dates/ratings.
# Done after the conversion so that dates coerced to NaT are removed as well.
df.dropna(subset=['date_added', 'rating'], inplace=True)

# Extract year and month
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month_name()

print("\nMissing values after cleaning:")
print(df.isnull().sum())

## 3. Analysis

### 3.1 Movies vs TV Shows

In [None]:
# Bar chart with one bar per value of the 'type' column.
# NOTE(review): seaborn 0.13+ deprecates `palette` without `hue`; confirm
# against the installed seaborn version before changing the call.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='type', palette='viridis', ax=ax)
ax.set_title('Distribution of Movies vs TV Shows')
plt.show()

### 3.2 Content Growth Over Time

In [None]:
# Number of rows per 'year_added' value, as a two-column frame
# (year_added, count).
rows_per_year = df.groupby('year_added').size()
data_by_year = rows_per_year.reset_index(name='count')

# Line chart of the yearly counts, one marker per year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=data_by_year, x='year_added', y='count', marker='o', ax=ax)
ax.set_title('Content Added to Netflix Over the Years')
ax.set_xlabel('Year Added')
ax.set_ylabel('Number of Titles')
plt.show()

### 3.3 Top Genres

In [None]:
# 'listed_in' holds several genres per row separated by ', '. Split each
# entry, put one genre per row, and count how often each genre occurs.
genres = df['listed_in'].str.split(', ').explode().value_counts()
top_genres = genres.head(10)

# Horizontal bar chart of the ten most frequent genres.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=top_genres.values, y=top_genres.index, palette='mako', ax=ax)
ax.set_title('Top 10 Genres on Netflix')
ax.set_xlabel('Count')
ax.set_ylabel('Genre')
plt.show()

### 3.4 Runtime Distribution (Movies)

In [None]:
# Keep only rows typed 'Movie' and add a numeric 'duration_min' column:
# the ' min' suffix is removed from 'duration' and the remainder parsed as
# a number (values that cannot be parsed become NaN).
movies = df[df['type'] == 'Movie'].assign(
    duration_min=lambda frame: pd.to_numeric(
        frame['duration'].str.replace(' min', '', regex=False),
        errors='coerce',
    )
)

# Histogram with a KDE overlay of the parsed durations; NaN values excluded.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(movies['duration_min'].dropna(), kde=True, color='red', ax=ax)
ax.set_title('Distribution of Movie Duration')
ax.set_xlabel('Duration (minutes)')
plt.show()

### 3.5 Top Release Years

In [None]:
# The ten most frequent 'release_year' values, most frequent first; this
# order is also the left-to-right order of the bars.
top_release_years = df['release_year'].value_counts().index[:10]

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    x='release_year',
    order=top_release_years,
    palette='rocket',
    ax=ax,
)
ax.set_title('Top 10 Release Years')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
plt.show()

## 4. Conclusion
The analysis reveals patterns in content type, growth over time, popular genres, movie runtimes, and release years on Netflix.