<a href="https://colab.research.google.com/github/LasyaDevulapalli/11239A022_DST_Lab/blob/main/11239A022_MINI_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
# NOTE: the leading '!' is an IPython/Colab shell escape that hands the rest of
# the line to the system shell; it is not valid syntax in a plain .py script.
!pip install pandas matplotlib seaborn

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Netflix dataset
from google.colab import files
import zipfile  # Import zipfile for extraction

# files.upload() opens Colab's upload widget. The code below treats its result
# as a mapping keyed by uploaded file name and opens the first entry by name
# from the current working directory.
uploaded = files.upload()  # upload 'netflix_titles.csv (1).zip'
if not uploaded:
    # Without this guard a cancelled/empty upload surfaces as a bare
    # StopIteration from next(iter(...)), which does not explain the problem.
    raise ValueError(
        "No file was uploaded; expected a zip archive containing "
        "'netflix_titles.csv'."
    )

# Extract the uploaded zip file (only the first uploaded file is used)
archive_name = next(iter(uploaded))
with zipfile.ZipFile(archive_name, 'r') as zip_ref:
    zip_ref.extractall('.')  # Extract to the current directory

# Read CSV file, assuming 'netflix_titles.csv' is now extracted
df = pd.read_csv('netflix_titles.csv')

# Display basic info: (rows, columns) followed by the first five rows
print(df.shape)
print(df.head())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Fill missing 'rating' values with a placeholder label.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['rating']: the in-place form operates on an intermediate Series, which
# pandas 2.x flags with a FutureWarning and which does not update df once
# Copy-on-Write is active.
df['rating'] = df['rating'].fillna('Not Rated')

# Parse 'date_added'; values that cannot be parsed become NaT (errors='coerce').
# Surrounding whitespace is stripped first so that padded entries are not
# silently coerced to NaT when pandas infers a single format for the column.
# NOTE(review): assumes 'date_added' was read as text — confirm against the CSV.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce')

# Year component of the parsed date (NaN where the date is NaT)
df['year_added'] = df['date_added'].dt.year

# ================================
# 1️⃣ Most common genres
# ================================
# Split each 'listed_in' entry on ', ' into a list (this overwrites the column
# in df), then explode so every title/genre pair gets its own row.
df['listed_in'] = df['listed_in'].str.split(', ')
all_genres = df.explode('listed_in')
# value_counts() sorts by frequency, so head(10) is the ten most frequent genres
top_genres = all_genres['listed_in'].value_counts().head(10)

plt.figure(figsize=(10,5))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned for
# 0.14). Mapping hue to the same categorical variable keeps the per-bar colors,
# and legend=False suppresses the redundant legend. Requires seaborn >= 0.13.
sns.barplot(
    x=top_genres.values,
    y=top_genres.index,
    hue=top_genres.index,
    palette='viridis',
    legend=False,
)
plt.title('Top 10 Genres on Netflix')
plt.xlabel('Number of Titles')
# Explicit label, consistent with the other charts, instead of the raw column name
plt.ylabel('Genre')
plt.show()

# ================================
# 2️⃣ Number of shows per release year
# ================================
# Tally titles per 'release_year', then order the tallies by year (the index)
# so the line is drawn chronologically.
year_count = df['release_year'].value_counts().sort_index()

plt.figure(figsize=(12,5))
# lineplot draws on the current figure and returns its Axes, which is then
# labelled directly.
ax = sns.lineplot(x=year_count.index, y=year_count.values)
ax.set_title('Number of Netflix Titles by Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
plt.show()

# ================================
# 3️⃣ Distribution of ratings
# ================================
# Number of titles per rating label, most frequent first
rating_count = df['rating'].value_counts()

plt.figure(figsize=(10,5))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned for
# 0.14). Mapping hue to the same categorical variable keeps the per-bar colors,
# and legend=False suppresses the redundant legend. Requires seaborn >= 0.13.
sns.barplot(
    x=rating_count.index,
    y=rating_count.values,
    hue=rating_count.index,
    palette='magma',
    legend=False,
)
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
# Tilt the tick labels so neighbouring rating names do not overlap
plt.xticks(rotation=45)
plt.show()

# ================================
# BONUS: Type distribution (Movie vs TV Show)
# ================================
# Number of titles per value of the 'type' column
type_count = df['type'].value_counts()

plt.figure(figsize=(6,4))
# seaborn 0.13 deprecated passing `palette` without `hue` (removal planned for
# 0.14). Mapping hue to the same categorical variable keeps the per-bar colors,
# and legend=False suppresses the redundant legend. Requires seaborn >= 0.13.
sns.barplot(
    x=type_count.index,
    y=type_count.values,
    hue=type_count.index,
    palette='Set2',
    legend=False,
)
plt.title('Movies vs TV Shows on Netflix')
plt.xlabel('Type')
plt.ylabel('Count')
plt.show()

