# Netflix Titles Exploratory Data Analysis


This notebook explores the Netflix Movies and TV Shows dataset using Python.
It involves data loading, cleaning, transformation, visualization, and extracting key insights.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")


## Load the Dataset

In [None]:
# Read the raw CSV (the path is relative, one directory above the working
# directory) into the notebook-wide frame `df`, then preview the first rows.
DATA_PATH = "../data/netflix_titles.csv"
df = pd.read_csv(DATA_PATH)
df.head()

## Dataset Overview

In [None]:
# Quick structural summary of the frame: its dimensions, its column labels,
# and finally pandas' own per-column report from `DataFrame.info()`.
frame_shape = df.shape
column_labels = df.columns.tolist()

print("Shape of the dataset:", frame_shape)
print("\nColumn names:", column_labels, sep="\n")
print("\nInfo:")
df.info()

## Handle Missing Values

In [None]:
# Drop the rows that lack a value in 'duration', 'rating' or 'date_added'.
# Later cells parse 'date_added' and extract integers from 'duration', so
# these columns must be populated.
df = df.dropna(subset=['duration', 'rating', 'date_added'])

# Replace the remaining gaps in the descriptive text columns with explicit
# placeholders.
#
# This is done with one DataFrame-level `fillna` instead of
# `df[col].fillna(..., inplace=True)`: the latter mutates a temporary Series
# obtained through chained indexing, which emits a FutureWarning on
# pandas 2.x and leaves `df` untouched once Copy-on-Write is enabled.
df = df.fillna({
    'country': 'Unknown',
    'cast': 'Not Provided',
    'director': 'Not Provided',
})


## Convert and Extract Date Info

In [None]:
# Trim surrounding whitespace from the textual dates, then parse them.
# With errors='coerce', values that cannot be parsed become NaT instead of
# raising.
trimmed_dates = df['date_added'].str.strip()
df['date_added'] = pd.to_datetime(trimmed_dates, errors='coerce')

# Derive the calendar year and month from the parsed timestamps.
added_on = df['date_added'].dt
df['year_added'], df['month_added'] = added_on.year, added_on.month

# Preview the three date-related columns.
date_columns = ['date_added', 'year_added', 'month_added']
df[date_columns].head()

## Column Types and Unique Values

In [None]:
# Report each column's dtype, followed by the number of distinct values per
# column ordered from most to least varied.
column_types = df.dtypes
distinct_counts = df.nunique().sort_values(ascending=False)

print("Data Types:\n", column_types)
print("\nUnique Values per Column:\n", distinct_counts, sep="\n")

## Content Type Distribution

In [None]:
# Bar chart of the number of titles for each value of the 'type' column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="type", palette="pastel", ax=ax)
ax.set_title("Count of Movies vs TV Shows on Netflix")
ax.set_xlabel("Content Type")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

## Titles Added Over Time

In [None]:
# Horizontal bar chart of how many titles were added in each year.
# Years are listed in ascending order; missing years are left out of the
# ordering.
years_ascending = sorted(df['year_added'].dropna().unique())

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df,
    y="year_added",
    order=years_ascending,
    palette="pastel",
    ax=ax,
)
ax.set_title("Number of Titles Added to Netflix Each Year")
ax.set_xlabel("Count")
ax.set_ylabel("Year Added")
fig.tight_layout()
plt.show()

## Content Ratings

In [None]:
# Horizontal bar chart of titles per rating, most frequent rating first.
ratings_by_frequency = df['rating'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df,
    y="rating",
    order=ratings_by_frequency,
    palette="pastel",
    ax=ax,
)
ax.set_title("Distribution of Content Ratings on Netflix")
ax.set_xlabel("Count")
ax.set_ylabel("Rating")
fig.tight_layout()
plt.show()

## Top Countries by Content Count

In [None]:
# In the Netflix titles dataset a co-production lists several countries in
# one comma-separated string (the same layout 'listed_in' uses for genres).
# Counting the raw strings would treat "United States, India" as a country
# of its own, so split the lists and credit every listed country instead.
country_names = df['country'].str.split(',').explode().str.strip()

# Discard blanks left behind by stray commas and the 'Unknown' placeholder
# used for missing values: neither of them is a real country.
is_real_country = ~country_names.isin(['', 'Unknown'])
top_countries = country_names[is_real_country].value_counts().head(10)

# Horizontal bar chart of the ten countries credited on the most titles.
plt.figure(figsize=(10, 5))
sns.barplot(x=top_countries.values, y=top_countries.index, palette="pastel")
plt.title("Top 10 Countries by Content Count on Netflix")
plt.xlabel("Number of Titles")
plt.ylabel("Country")
plt.tight_layout()
plt.show()

## Genre Frequency Analysis

In [None]:
# Split each title's ', '-separated 'listed_in' value and flatten the result
# to one genre label per row.
all_genres = df['listed_in'].str.split(', ').explode()

# Tally the labels and rank them from most to least frequent.
genre_counts = Counter(all_genres)
genre_df = (
    pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart of the fifteen most frequent genres.
top_genres = genre_df.head(15)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_genres, x='Count', y='Genre', palette="pastel", ax=ax)
ax.set_title("Top 15 Most Common Netflix Genres")
ax.set_xlabel("Number of Titles")
ax.set_ylabel("Genre")
fig.tight_layout()
plt.show()

## Duration Analysis

In [None]:
# Separate the catalogue by content type. The two types are plotted
# separately below: movie durations as minutes, TV show durations as a
# number of seasons.
movies_df = df[df['type'] == 'Movie'].copy()
tv_df = df[df['type'] == 'TV Show'].copy()

# Pull the first run of digits out of each 'duration' string.
# - The pattern is a raw string: '\d' inside a normal string literal is an
#   invalid escape sequence (a SyntaxWarning on Python 3.12+).
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame, which is what a single-column assignment expects.
# astype(int) will raise if any 'duration' has no digits or is missing.
DIGITS_PATTERN = r'(\d+)'
for frame in (movies_df, tv_df):
    frame['duration_int'] = (
        frame['duration'].str.extract(DIGITS_PATTERN, expand=False).astype(int)
    )

# Histogram (with a KDE overlay) of movie durations.
plt.figure(figsize=(10, 5))
sns.histplot(movies_df['duration_int'], bins=30, kde=True, color='skyblue')
plt.title("Distribution of Movie Durations on Netflix")
plt.xlabel("Duration (minutes)")
plt.ylabel("Number of Movies")
plt.tight_layout()
plt.show()

# Bar chart of TV shows per season count, in ascending season order.
season_counts_sorted = sorted(tv_df['duration_int'].unique())
plt.figure(figsize=(8, 5))
sns.countplot(data=tv_df, x='duration_int', order=season_counts_sorted, palette='pastel')
plt.title("Number of Seasons in Netflix TV Shows")
plt.xlabel("Number of Seasons")
plt.ylabel("Number of TV Shows")
plt.tight_layout()
plt.show()