In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style globally so every figure below shares it.
sns.set(style="whitegrid")

In [None]:
# Read the titles CSV (decoded as ISO-8859-1) and preview the first rows.
csv_path = "./netflix_titles.csv"
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
df.head()

In [None]:
# Drop every column whose name contains 'Unnamed', then discard rows where
# 'country' or 'rating' is missing. The result is rebound to `df` instead of
# mutating the frame in place, so the whole step is one readable expression.
unnamed_cols = [col for col in df.columns if 'Unnamed' in col]
df = (
    df.drop(columns=unnamed_cols)
      .dropna(subset=['country', 'rating'])
)

In [None]:
# Movies vs. TV Shows: number of titles per content type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='type', ax=ax)
ax.set_title('Distribution of Netflix Content Types')
ax.set_xlabel('Content Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Distribution of release years, grouped into 10 bins.
fig, ax = plt.subplots()
ax.hist(df['release_year'], bins=10, edgecolor='black')
ax.set_title('Analyzing Distribution of Release Years for Titles on Netflix')
ax.set_xlabel('Release Year')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)  # tilt year labels for readability
fig.tight_layout()
plt.show()



In [None]:
# Top 10 countries by number of titles. A title listing several countries
# (separated by ', ') is counted once for each of them.
country_lists = df['country'].str.split(', ')
top_countries = country_lists.explode().value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_countries.plot(kind='barh', ax=ax)
ax.set_title('Top 10 Countries Contributing to Netflix Content')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Country')
ax.invert_yaxis()  # largest bar at the top
plt.show()


In [None]:
def extract_minutes(duration_str):
    """Return the leading integer of a duration string such as ``"90 min"``.

    Parameters
    ----------
    duration_str : str or missing value
        Raw duration cell. Anything that is not a string (float NaN, ``None``,
        ...) is treated as missing.

    Returns
    -------
    int or None
        The integer parsed from the first whitespace-separated token;
        ``None`` when the input is not a string;
        ``0`` when the input is a string without a leading integer token
        (including empty or whitespace-only strings).
    """
    # Non-string input (e.g. the float NaN pandas uses for missing cells) has
    # nothing to parse; return None explicitly rather than by falling through.
    if not isinstance(duration_str, str):
        return None

    tokens = duration_str.split()
    if not tokens:
        # Empty/blank string: same fallback as any other malformed value.
        return 0

    try:
        return int(tokens[0])
    except ValueError:
        return 0  # sentinel for malformed durations

# Runtime distribution for movies.
# .copy() makes movies_df an independent frame, so adding a column does not
# write into a slice of df (which triggers pandas' SettingWithCopyWarning).
movies_df = df[df['type'] == 'Movie'].copy()
movies_df['duration_minutes'] = movies_df['duration'].apply(extract_minutes)

# Movies with no readable duration end up as missing values in the new column;
# leave them out, because NaN breaks both the histogram range and the int()
# conversion used for the tick positions.
runtimes = movies_df['duration_minutes'].dropna()

plt.hist(runtimes, bins=30, edgecolor='black')
if not runtimes.empty:
    # One tick every 20 minutes between the shortest and longest runtime.
    plt.xticks(range(int(runtimes.min()), int(runtimes.max()) + 1, 20), rotation=45)
plt.title('Distribution of Movie Runtimes on Netflix (in minutes)')
plt.xlabel('Runtime (minutes)')
plt.ylabel('Number of Movies')
plt.tight_layout()
plt.show()


In [None]:
def clean_and_explode(data, col):
    """Count individual values in a column of ', '-separated strings.

    Each cell of ``data[col]`` is split on ``', '``, the pieces are flattened
    into one Series, and the occurrences of each piece are counted.

    Returns a Series of counts indexed by value, most frequent first.
    """
    pieces_per_row = data[col].str.split(', ')
    flattened = pieces_per_row.explode()
    return flattened.value_counts()

# Top 10 genres for movies and for TV shows.
# The two rankings contain different genre names, so each gets its own axes:
# pandas places bars at integer positions and labels the ticks from the
# Series index, so drawing both on one axes would leave only the labels of
# the Series plotted last and mislabel the other bars.
movies_genres = clean_and_explode(df[df['type'] == 'Movie'], 'listed_in')
tv_genres = clean_and_explode(df[df['type'] == 'TV Show'], 'listed_in')

fig, (ax_movies, ax_tv) = plt.subplots(1, 2, figsize=(14, 8), sharey=True)
movies_genres.head(10).plot(kind='bar', ax=ax_movies, color='blue', alpha=0.7, label='Movies')
tv_genres.head(10).plot(kind='bar', ax=ax_tv, color='green', alpha=0.7, label='TV Shows')

for ax in (ax_movies, ax_tv):
    ax.set_xlabel('Genre')
    ax.set_ylabel('Frequency')
    ax.legend()
    # Right-align the rotated labels so each one ends under its own bar.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.suptitle('Top 10 Popular Genres for Movies and TV Shows')
fig.tight_layout()
plt.show()

In [None]:
# Movies vs. TV shows for the countries with the most movies.
# 'country' can list several countries separated by ', '. Split and explode it
# first so a co-produced title is counted for each of its countries rather
# than as a separate combined "country".
titles_per_country = (
    df.assign(country=df['country'].str.split(', '))
      .explode('country')
)
content_by_country = (
    titles_per_country.groupby(['country', 'type']).size().unstack(fill_value=0)
)

# Keep the countries with the largest number of movies (ranking uses the
# 'Movie' column only).
top_n_countries = 10
content_by_country = content_by_country.nlargest(top_n_countries, 'Movie')

fig, ax = plt.subplots(figsize=(14, 7))
content_by_country.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Content Types by Top Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()
