In [2]:
# =====================================================
# 1. IMPORT LIBRARIES, SET PLOT STYLE AND LOAD DATA
# =====================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" look to every figure drawn later on.
sns.set_style(style="whitegrid")

# Read the catalogue from the working directory and preview the first 5 rows.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# =====================================================
# 2. BASIC DATA UNDERSTANDING
# =====================================================
# A notebook cell renders only its LAST bare expression, so each intermediate
# summary is passed to display() (available as a builtin inside
# IPython/Jupyter kernels) instead of being computed and thrown away.

display(df.shape)   # (rows, columns)
df.info()           # prints column dtypes and non-null counts by itself

# The plain df.describe() call was dropped: with no `include` argument it
# summarises the numeric columns only, i.e. the same table as include="number".
display(df.describe(include="number"))
display(df.describe(include="object"))
display(df.describe(include="all"))

df.columns




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [None]:
# =====================================================
# 3. DUPLICATES CHECK
# =====================================================
# Report how many rows are exact duplicates before dropping them. As a bare
# mid-cell expression the count was computed but never shown.

duplicate_rows = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_rows}")

# Keep the first occurrence of each fully duplicated row.
df = df.drop_duplicates()


In [None]:
# =====================================================
# 4. MISSING VALUES ANALYSIS
# =====================================================
# Show the absolute and relative amount of missing data side by side.
# Previously the count table was discarded because only the last expression
# of a cell is rendered.

is_missing = df.isnull()
missing_summary = pd.DataFrame(
    {
        "missing_count": is_missing.sum(),
        "missing_pct": is_missing.mean() * 100,   # percentage missing
    }
)
missing_summary


In [None]:
# =====================================================
# 5. MISSING VALUE HANDLING
# =====================================================
# Fill the categorical gaps with explicit placeholders.
#
# `df[col].fillna(..., inplace=True)` is chained in-place assignment: it
# mutates a temporary Series, emits a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active. A single frame-level fillna
# with a per-column mapping is reliable and keeps the cell idempotent.
placeholders = {
    "director": "Unknown",
    "cast": "Unknown",
    "country": "Unknown",
    "rating": "Not Rated",
}
df = df.fillna(value=placeholders)

# Parse the "Month D, YYYY" strings into datetimes; missing dates become NaT.
# The dtype guard lets the cell be re-run after the column is already parsed.
# NOTE(review): stray leading/trailing whitespace in the raw strings makes
# format-inferring parsers fail on pandas 2.x; stripping is harmless if there
# is none. Confirm against the CSV.
if not pd.api.types.is_datetime64_any_dtype(df["date_added"]):
    df["date_added"] = pd.to_datetime(df["date_added"].str.strip())

# Remaining nulls per column (date_added and duration are not imputed here).
df.isnull().sum()


In [None]:
# =====================================================
# 6. UNIQUE VALUE ANALYSIS
# =====================================================
# Print every result with a label. As bare expressions only the last one
# (the country count) was ever rendered.

print("type values:", df["type"].unique())
print("rating values:", df["rating"].unique())
print("distinct country entries:", df["country"].nunique())



In [None]:
# =====================================================
# 7. VALUE COUNTS (TABULAR EDA)
# =====================================================
# display() (an IPython/Jupyter builtin) renders the first two tables, which
# would otherwise be discarded. The final bare expression is shown by the
# notebook itself.

display(df["type"].value_counts())
display(df["rating"].value_counts())

# Ten most frequent entries of the country column.
df["country"].value_counts().head(10)



In [None]:
# =====================================================
# 8. FILTERING (MOVIES AFTER 2015)
# =====================================================
# Build the two row masks separately, then keep rows satisfying both:
# type is "Movie" and release_year is strictly greater than 2015.

is_movie = df["type"].eq("Movie")
released_after_2015 = df["release_year"].gt(2015)

movies_after_2015 = df.loc[is_movie & released_after_2015]

movies_after_2015.head()


In [None]:
# =====================================================
# 9. TV SHOWS ONLY DATASET
# =====================================================
# Subset of the catalogue whose type is exactly "TV Show".

is_tv_show = df["type"].eq("TV Show")
tv_shows = df.loc[is_tv_show]

tv_shows.head()


In [None]:
# =====================================================
# 10. FEATURE ENGINEERING
# =====================================================
# Derive calendar features from date_added through the .dt accessor, which
# requires the column to have been converted to datetimes beforehand.

added_on = df["date_added"].dt
df["year_added"] = added_on.year
df["month_added"] = added_on.month

df.head()


In [None]:
# =====================================================
# 11. COUNT OF MOVIES VS TV SHOWS
# =====================================================
# One bar per value of the type column, drawn on an explicit Axes.

fig, ax = plt.subplots()
sns.countplot(data=df, x="type", ax=ax)
ax.set_title("Count of Movies vs TV Shows")
plt.show()


In [None]:
# =====================================================
# 12. RELEASE YEAR DISTRIBUTION
# =====================================================
# Histogram of release_year split into 20 equal-width bins.

fig, ax = plt.subplots()
sns.histplot(data=df, x="release_year", bins=20, ax=ax)
ax.set_title("Distribution of Release Year")
plt.show()


In [None]:
# =====================================================
# 13. CONTENT TREND OVER YEARS
# =====================================================
# Number of catalogue rows per release_year, ordered chronologically.

year_counts = df["release_year"].value_counts().sort_index()

fig, ax = plt.subplots()
sns.lineplot(x=year_counts.index, y=year_counts.values, ax=ax)
ax.set_title("Netflix Content Trend Over Years")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Shows")
plt.show()


In [None]:
# =====================================================
# 14. TOP 10 COUNTRIES
# =====================================================
# Rank countries by number of titles. Rows whose country was missing were
# filled with the placeholder "Unknown" during missing-value handling; leave
# that placeholder out so it is not ranked as if it were a real country.
# NOTE(review): every distinct string in `country` is counted as one entry;
# verify whether values listing several countries need to be split first.

known_countries = df.loc[df["country"] != "Unknown", "country"]
top_countries = known_countries.value_counts().head(10)

sns.countplot(
    data=df[df["country"].isin(top_countries.index)],
    y="country",
    order=top_countries.index   # most frequent first
)

plt.title("Top 10 Countries")
plt.show()


In [None]:
# =====================================================
# 15. RATING DISTRIBUTION
# =====================================================
# Horizontal bars, one per rating, ordered from most to least frequent.

rating_order = df["rating"].value_counts().index

fig, ax = plt.subplots()
sns.countplot(data=df, y="rating", order=rating_order, ax=ax)
ax.set_title("Rating Distribution")
plt.show()


In [None]:
# =====================================================
# 16. BOXPLOT: RELEASE YEAR BY TYPE
# =====================================================
# Compare the spread of release_year between the values of the type column.

fig, ax = plt.subplots()
sns.boxplot(data=df, x="type", y="release_year", ax=ax)
ax.set_title("Release Year Distribution by Type")
plt.show()


In [None]:
# =====================================================
# 17. CORRELATION ANALYSIS
# =====================================================
# Pairwise correlation between the numeric columns of the frame
# (release_year plus whatever numeric features were added earlier).

numeric_df = df.select_dtypes(include="number")
corr = numeric_df.corr()

fig, ax = plt.subplots()
sns.heatmap(data=corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()
