# In [None]:
# =========================================================
#  1. IMPORT LIBRARIES
# =========================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# Set global plot style: apply seaborn's "whitegrid" theme once, up front,
# before any of the figures below are drawn.
sns.set_theme(style="whitegrid")


# =========================================================
#  2. LOAD DATASET
# =========================================================
# Read the catalogue from a CSV located relative to the working directory.
df = pd.read_csv("netflix_titles.csv")
print("Dataset Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the final expression of a
# notebook cell; print it explicitly so the preview is not silently discarded.
print(df.head())


# =========================================================
#  3. DATA OVERVIEW
# =========================================================
# info() writes its per-column summary straight to stdout.
df.info()
# describe() *returns* its summary table instead of printing it, so the
# result must be printed or it is thrown away when this is not the last
# expression of a notebook cell.
print(df.describe(include="all"))


# =========================================================
#  4. DATA CLEANING
# =========================================================

# Per-column tally of missing cells before any cleaning is applied
missing_before = df.isna().sum()
print("Missing Values Before Cleaning:\n", missing_before)

# Text columns: keep the row and substitute a placeholder label for gaps
for text_column in ("director", "cast", "country"):
    df[text_column] = df[text_column].fillna("Unknown")

# Rows lacking any of these fields are removed outright
required_columns = ["date_added", "rating", "duration"]
df.dropna(subset=required_columns, inplace=True)

# Discard rows that exactly repeat an earlier row
df.drop_duplicates(inplace=True)

# Same tally again, to show the effect of the steps above
missing_after = df.isna().sum()
print("Missing Values After Cleaning:\n", missing_after)


# =========================================================
#  5. FEATURE ENGINEERING
# =========================================================

# Parse the textual dates once; format="mixed" has pandas infer the format
# of each entry individually instead of assuming one shared layout.
added_on = pd.to_datetime(df["date_added"], format="mixed")
df["date_added"] = added_on

# Calendar year in which each title was added, used for the trend plots
df["year_added"] = added_on.dt.year


# =========================================================
#  6. UNIVARIATE ANALYSIS
# =========================================================

# Distribution of Movies vs TV Shows.
# `hue` mirrors `x` because seaborn 0.13 deprecates passing `palette` without
# `hue` (it emits a FutureWarning and is slated for removal in 0.14). This
# follows the same hue+palette pattern the country plot in this file uses.
sns.countplot(x="type", hue="type", data=df, palette="Set2")
plt.title("Distribution of Movies vs TV Shows")
plt.show()

# Content added over years (histogram with a KDE curve overlaid)
sns.histplot(df["year_added"], bins=15, kde=True)
plt.title("Content Added Over Years")
plt.show()

# Ratings distribution, bars sorted from most to least frequent rating
plt.figure(figsize=(10, 5))
rating_order = df["rating"].value_counts().index
sns.countplot(y="rating", data=df, order=rating_order)
plt.title("Content Rating Distribution")
plt.show()


# =========================================================
#  7. GENRE ANALYSIS
# =========================================================

# Split each title's comma-separated genre string into a list of labels.
# Rows with no genre text are skipped: they have nothing to count, and a NaN
# entry would otherwise break the flattening step below.
genres = df["listed_in"].dropna().str.split(", ")

# Flatten with explode(), which is a single linear pass. sum(genres, [])
# re-copies the accumulated list for every row, which is quadratic.
all_genres = genres.explode().tolist()
genre_counts = Counter(all_genres)

# most_common(10) returns the ten most frequent (genre, count) pairs,
# already ordered from highest to lowest count.
genre_df = pd.DataFrame(genre_counts.most_common(10), columns=["Genre", "Count"])

# `hue` mirrors `y` because seaborn 0.13 deprecates `palette` without `hue`.
sns.barplot(x="Count", y="Genre", hue="Genre", data=genre_df, palette="viridis")
plt.title("Top 10 Genres on Netflix")
plt.show()


# =========================================================
#  8. BIVARIATE ANALYSIS
# =========================================================

# Movies vs TV Shows added each year
sns.countplot(x="year_added", hue="type", data=df, palette="Set1")
plt.xticks(rotation=45)
plt.title("Movies vs TV Shows Added Each Year")
plt.show()

# Top content producing countries.
# "Unknown" is the placeholder written into missing `country` cells during
# cleaning; it is not a country, so it is excluded before ranking.
# NOTE(review): values are counted as stored, so a multi-country string is
# treated as its own category — confirm whether it should be split instead.
known_countries = df.loc[df["country"] != "Unknown", "country"]
top_countries = known_countries.value_counts().head(10).index
# `order` / `hue_order` draw the bars (and assign colours) from most to least
# frequent instead of in order of first appearance in the data.
sns.countplot(
    y="country",
    data=df[df["country"].isin(top_countries)],
    hue="country",
    order=top_countries,
    hue_order=top_countries,
    palette="magma",
)
plt.title("Top 10 Content Producing Countries")
plt.show()

# Rating vs Type
sns.countplot(x="rating", hue="type", data=df, palette="pastel")
plt.xticks(rotation=45)
plt.title("Ratings Distribution by Content Type")
plt.show()


# =========================================================
#  9. CORRELATION ANALYSIS
# =========================================================

# Pairwise correlation is computed over the numeric columns only
numeric_columns = df.select_dtypes(include="number")
corr = numeric_columns.corr()

# Annotated heatmap of the correlation matrix
heatmap_ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


# =========================================================
#  10. CONTENT GROWTH TREND
# =========================================================

# Number of titles (non-null `type` entries) for each year of addition
titles_per_year = df.groupby("year_added")["type"].count()

# Line plot of the yearly totals, labelled through the returned Axes
trend_ax = titles_per_year.plot(color="purple")
trend_ax.set_title("Total Content Added Over Years")
trend_ax.set_xlabel("Year")
trend_ax.set_ylabel("Number of Titles")
plt.show()


# =========================================================
#  11. OUTLIER DETECTION — MOVIE DURATION
# =========================================================

# Work on an independent copy of the movie rows so `df` itself is untouched
is_movie = df["type"] == "Movie"
movie_df = df.loc[is_movie].copy()

# Strip the " min" unit from the duration text (e.g., "90 min"), then cast
# what remains to integer minutes
minutes_text = movie_df["duration"].str.replace(" min", "")
movie_df["duration"] = minutes_text.astype(int)

print("Movies Duration Boxplot")
sns.boxplot(data=movie_df, x="duration", color="skyblue")
plt.title("Outliers in Movie Duration")
plt.show()


# =========================================================
#  12. STATISTICAL INSIGHT
# =========================================================

# Mean of the `release_year` column, computed through NumPy
release_years = df["release_year"]
avg_year = np.mean(release_years)
print("Average Release Year of Content:", avg_year)