In [1]:
# ======================
# 1. SETUP & DATA LOADING
# ======================
!pip install pandas seaborn matplotlib plotly wordcloud
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter



In [None]:
# Fetch the Netflix Movies and TV Shows catalogue (CSV) straight from GitHub
# and read it into the working DataFrame used by every later cell.
url = (
    "https://raw.githubusercontent.com/krishnaik06/"
    "netflix-data-analysis/main/netflix_titles.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# ======================
# 2. DATA CLEANING
# ======================
# Handle missing values: swap NaN for explicit placeholders so later string
# operations (split / explode / join) never have to deal with NaN.
df['country'] = df['country'].fillna('Unknown')
df['cast'] = df['cast'].fillna('No Cast')
df['director'] = df['director'].fillna('No Director')

# Convert date_added to datetime. Values are stripped first to guard against
# stray surrounding whitespace; anything unparseable becomes NaT (coerce).
# The dtype guard keeps this cell safe to re-run: once the column is already
# datetime64 the `.str` accessor would raise AttributeError.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')

# Extract year added (float dtype whenever at least one date is NaT)
df['year_added'] = df['date_added'].dt.year


In [None]:
# ======================
# 3. CONTENT OVERVIEW
# ======================
# One bar per content type, showing how many titles fall under each value
# of the `type` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df,
    x='type',
    palette=['#E50914', '#221F1F'],
    ax=ax,
)
ax.set_title('Movies vs TV Shows')
plt.show()


In [None]:
# ======================
# 4. RELEASE TRENDS
# ======================
# Number of titles per release year, ordered chronologically.
releases_per_year = df['release_year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(14, 6))
releases_per_year.plot(kind='line', color='#E50914', ax=ax)
ax.set_title('Content Release Over Time')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
ax.grid()
plt.show()

In [None]:
# ======================
# 5. COUNTRY ANALYSIS
# ======================
# Top 10 countries
# A single `country` value can list several comma-separated countries, so
# expand to one row per country before counting. Splitting on ',' and then
# stripping (instead of splitting on the exact string ', ') also normalises
# entries with irregular spacing or stray leading/trailing commas.
countries = (
    df['country']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
# Drop empty fragments and the 'Unknown' placeholder used for missing
# values: neither is a real country and must not be ranked.
countries = countries[(countries != '') & (countries != 'Unknown')]
country_counts = countries.value_counts().head(10)

plt.figure(figsize=(12,6))
sns.barplot(x=country_counts.values, y=country_counts.index, palette='Reds_r')
plt.title('Top 10 Countries by Content Production')
plt.xlabel('Number of Titles')
plt.ylabel('Country')
plt.show()

In [None]:
# ======================
# 6. GENRE ANALYSIS
# ======================
# Word cloud for genres
# `listed_in` holds comma-separated genre labels. Count whole labels and size
# the cloud from those frequencies: generating from the space-joined raw text
# would tokenise labels into single words and fuse the last genre of one title
# with the first genre of the next.
genre_counts = Counter(
    genre.strip()
    for listing in df['listed_in'].dropna()
    for genre in listing.split(',')
    if genre.strip()  # skip empty fragments from stray commas
)
wordcloud = WordCloud(
    width=800, height=400, background_color='black'
).generate_from_frequencies(genre_counts)

plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Genres on Netflix')
plt.show()


In [None]:
# ======================
# 7. DURATION ANALYSIS
# ======================
# Movies duration distribution
# .copy() makes `movies` an independent frame, so the column assignment below
# does not write into a slice of df (pandas SettingWithCopyWarning).
movies = df[df['type'] == 'Movie'].copy()
# Keep only the first run of digits from the duration text (presumably values
# such as "90 min" - confirm against the data); rows without digits become NaN.
# The pattern is a raw string because '\d' is an invalid escape sequence in a
# normal literal, and expand=False returns a Series rather than a DataFrame.
movies['duration'] = movies['duration'].str.extract(r'(\d+)', expand=False).astype(float)

plt.figure(figsize=(12,6))
sns.histplot(data=movies, x='duration', bins=30, kde=True, color='#E50914')
plt.title('Distribution of Movie Durations (minutes)')
plt.xlabel('Duration (minutes)')
plt.ylabel('Number of Movies')
plt.show()

# TV Show seasons
tv_shows = df[df['type'] == 'TV Show'].copy()
# Same digit extraction as above; kept as float so missing values stay NaN.
tv_shows['seasons'] = tv_shows['duration'].str.extract(r'(\d+)', expand=False).astype(float)

plt.figure(figsize=(12,6))
# Plot integer season counts so the axis reads "1, 2, 3" instead of "1.0, 2.0".
sns.countplot(x=tv_shows['seasons'].dropna().astype(int), palette='Reds')
plt.title('Number of TV Show Seasons')
plt.xlabel('Seasons')
plt.ylabel('Number of TV Shows')
plt.show()

In [None]:
# ======================
# 8. ADDITION TRENDS
# ======================
# Content added by year
# year_added is float whenever some dates could not be parsed (NaN forces a
# float dtype), which would label the bars "2016.0". Drop the missing years
# and cast to int before counting.
added_per_year = df['year_added'].dropna().astype(int).value_counts().sort_index()

plt.figure(figsize=(14,6))
added_per_year.plot(kind='bar', color='#E50914')
plt.title('Content Added to Netflix by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.show()

# Monthly addition pattern
# Rows with a missing date get NaN as month and are left out by groupby.
df['month_added'] = df['date_added'].dt.month
monthly = df.groupby('month_added')['show_id'].count()

plt.figure(figsize=(12,6))
sns.lineplot(x=monthly.index, y=monthly.values, color='#E50914', marker='o')
plt.title('Monthly Content Addition Pattern')
plt.xlabel('Month')
plt.ylabel('Count')
plt.xticks(range(1,13), ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
plt.grid()
plt.show()

In [None]:
# ======================
# 9. RATING ANALYSIS
# ======================
# Horizontal bars, one per rating label, sorted from most to least frequent
# (value_counts already returns its index in descending-count order).
rating_order = df['rating'].value_counts().index

fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(
    data=df,
    y='rating',
    order=rating_order,
    palette='Reds_r',
    ax=ax,
)
ax.set_title('Content Ratings Distribution')
plt.show()

In [None]:
# ======================
# 10. CAST ANALYSIS
# ======================
# Top 20 actors
# Each `cast` value is a comma-separated list of names. Expand to one row per
# name and count whole names; joining everything into one string and
# splitting on whitespace would count individual first/last-name tokens.
actor_names = (
    df['cast']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
# Exclude empty fragments and the 'No Cast' placeholder used for missing
# values, so the placeholder cannot show up as the most frequent "actor".
actor_names = actor_names[(actor_names != '') & (actor_names != 'No Cast')]
top_actors = actor_names.value_counts().head(20)

plt.figure(figsize=(12,8))
sns.barplot(x=top_actors.values, y=top_actors.index, palette='Reds_r')
plt.title('Top 20 Actors on Netflix')
plt.xlabel('Number of Titles')
plt.ylabel('Actor')
plt.show()