# Netflix Data Analysis Project
This notebook analyzes the Netflix catalogue dataset (`Netflix Dataset.csv`) to uncover content trends by type, genre, country, and year, and to derive strategic recommendations from them.

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select matplotlib's built-in 'default' style sheet for the plots in this notebook.
plt.style.use('default')


In [None]:

# Read the raw catalogue from the CSV file (path is relative to the working directory).
dataset_path = "Netflix Dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the leading rows; as the cell's last expression this is rendered as a table.
df.head()


In [None]:

# Print a per-column summary of the frame (df.info()), then show the raw
# column labels as the cell's output so the rename step can be checked against them.
df.info()
df.columns


In [None]:

# Map the raw CSV headers onto the short lowercase names used by the rest of the notebook.
# Note: the raw 'Category' column becomes 'type', while the raw 'Type' column becomes 'listed_in'.
column_aliases = {
    'Category': 'type',
    'Title': 'title',
    'Country': 'country',
    'Release_Date': 'release_date',
    'Type': 'listed_in',
}
df.rename(columns=column_aliases, inplace=True)

# Extract release_year from release_date
import re
def extract_year(x):
    """Return the first 19xx/20xx four-digit year found in ``x``, or None.

    ``x`` is converted with ``str()`` and searched for the first run of four
    digits starting with ``19`` or ``20``. The match is not anchored, so it may
    occur anywhere in the text.

    Parameters
    ----------
    x : any
        Value to inspect (e.g. a date string, a number, NaN or None).

    Returns
    -------
    int or None
        The matched year as an int, or None when no such year is present
        (this includes NaN/None, whose string forms contain no digits) or
        when ``x`` cannot be converted to a string.
    """
    try:
        s = str(x)
    except Exception:
        # Best-effort parsing: an object that cannot be stringified has no year.
        # Only Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        return None
    m = re.search(r"(19|20)\d{2}", s)
    return int(m.group(0)) if m else None

# Derive an integer year for every row; rows where no year could be extracted
# receive the placeholder value 0 so the column can be stored as plain ints.
parsed_years = df['release_date'].apply(extract_year)
df['release_year'] = parsed_years
df['release_year'] = parsed_years.fillna(0).astype(int)

# Replace missing values in the categorical columns with an explicit "Unknown" label.
for col in ('country', 'listed_in', 'type'):
    df[col] = df[col].fillna("Unknown")

df.head()


## Movies vs TV Shows

In [None]:

# Number of titles per content type.
type_counts = df['type'].value_counts()
print(type_counts)

# Pie chart of the same counts; the y-axis label is set to an empty string.
type_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    figsize=(6, 6),
    ylabel='',
    title="Movies vs TV Shows",
)
plt.show()


## Content Added per Year by Type

In [None]:

# release_year == 0 is the placeholder assigned to rows whose release_date had no
# recognisable year. Keeping those rows would add a bogus point at "year 0" and
# stretch the x-axis across two millennia, so they are excluded from the trend.
rows_with_year = df[df['release_year'] > 0]

# One row per year, one column per content type, cells = number of titles.
by_year_type = (
    rows_with_year
    .groupby(['release_year', 'type'])
    .size()
    .unstack(fill_value=0)
    .sort_index()
)

ax = by_year_type.plot(kind='line', figsize=(10, 5), marker='o',
                       title="Content Added per Year by Type")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
plt.show()


## Top Genres

In [None]:

# 'listed_in' holds comma-separated genre labels: split them, give every label
# its own row, trim surrounding whitespace and discard missing entries.
genre_tokens = df['listed_in'].str.split(',').explode()
genres_series = genre_tokens.str.strip().dropna()

# Ten most frequent genre labels.
top_genres = genres_series.value_counts().head(10)
print(top_genres)

# Horizontal bars: count on x, genre label on y.
genre_ax = sns.barplot(x=top_genres.values, y=top_genres.index)
genre_ax.set_title("Top 10 Genres")
genre_ax.set_xlabel("Count")
genre_ax.set_ylabel("Genre")
plt.show()


## Top Countries

In [None]:

# 'country' may list several comma-separated countries per title: split them,
# explode to one country per row, trim whitespace and discard missing entries.
country_tokens = df['country'].str.split(',').explode()
countries_series = country_tokens.str.strip().dropna()

# Ten most frequent country labels.
top_countries = countries_series.value_counts().head(10)
print(top_countries)

# Horizontal bars: count on x, country label on y.
country_ax = sns.barplot(x=top_countries.values, y=top_countries.index)
country_ax.set_title("Top 10 Countries by Content Count")
country_ax.set_xlabel("Count")
country_ax.set_ylabel("Country")
plt.show()


## Total Contents Added per Year

In [None]:

# release_year == 0 is the placeholder for rows whose release_date had no
# recognisable year; it is not a real year, so it is left out of the series
# to avoid a spurious point at x = 0 that would flatten the actual trend.
known_years = df.loc[df['release_year'] > 0, 'release_year']

# Titles per year, ordered chronologically.
yearly_counts = known_years.value_counts().sort_index()

plt.figure(figsize=(10, 4))
plt.plot(yearly_counts.index, yearly_counts.values, marker='o')
plt.title("Total Contents Added per Year")
plt.xlabel("Year")
plt.ylabel("Total Content Added")
plt.show()


## Key Findings

In [None]:

# release_year == 0 is the placeholder for rows with no parsable year; including
# it would report the covered range as starting at 0, so the range is computed
# over rows with a real year only.
years_present = df.loc[df['release_year'] > 0, 'release_year']

print("Total records:", len(df))
print("Unique titles:", df['title'].nunique())
print("Years covered:", years_present.min(), "-", years_present.max())
# type_counts, top_genres and top_countries come from the earlier analysis cells.
print("Most common type:", type_counts.idxmax())
print("Top genre:", top_genres.index[0])
print("Top country:", top_countries.index[0])



## Strategic Recommendations
- Invest more in popular genres and local-language originals for high-contribution countries.  
- Fill gaps by commissioning content in underrepresented genres and regions.  
- Use yearly trend analysis to align release schedules and marketing.  
