In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from google.colab import files
import shutil
import os

#loading the data
df=pd.read_csv("netflix_titles.csv")
print(df.info())

#cleaning the data
df = df.dropna(subset=['type','release_year','rating','duration','country'])

#To know about Number Of Movies VS TV Shows on Netflix
type_counts = df['type'].value_counts()
plt.figure(figsize=(6,4))
plt.bar(type_counts.index, type_counts, color=['skyblue','orange'])
plt.title("Number Of Movies VS TV Shows on Netflix")
plt.xlabel('Type')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('Movies_vs_tvshows.png')
# plt.show()

#What is the % of each content(PG, R,TV-MA)
rating_counts = df['rating'].value_counts()
# content_label = df['rating'].unique()
plt.figure(figsize=(8,6))
plt.pie(rating_counts, labels = rating_counts.index, autopct="%1.1f%%")
plt.title("The Percentage(%) of content rating(PG, R,TV-MA)")
plt.tight_layout()
plt.savefig('Content_rating.png')
# plt.show()

#How has the no.of releases changed over the years?
year_counts = df['release_year'].value_counts()
plt.figure(figsize=(6,4))
plt.plot(year_counts.index, year_counts, linestyle='--', marker='o',label='No.of content released')
plt.xlim(2000,2025)
plt.grid(color='grey', linestyle=':')
plt.legend()
plt.xlabel("Release Year")
plt.ylabel("Number of content released")
plt.title("Change in the no.of content releases over the years")
plt.tight_layout()
plt.savefig('Content_release_year.png')
#plt.show()

# What is the distribution of movie duration?
movie_df = df[df['type']=='Movie'].copy()
movie_df['duration_int'] = movie_df['duration'].str.replace('min','',regex=False).astype(int)
# print(movie_df['duration_int'])
plt.figure(figsize=(8,6))
plt.hist(movie_df['duration_int'], bins=30, color="coral", edgecolor='black')
plt.title("Distribution of movie duration")
plt.xlabel("Movie Duration")
plt.ylabel("Number of movies")
plt.tight_layout()
plt.savefig('Movie_Duration_Histogram.png')
# plt.show()

# Relo btw release year & no.of shows?
release_counts = df['release_year'].value_counts().sort_index()
plt.figure(figsize=(8,6))
plt.scatter(release_counts.index, release_counts.values, color="blue", marker='o', label='No.of Shows')
plt.xlabel("Release Year")
plt.ylabel("Number of Shows")
plt.title("Release year V/S no.of shows")
plt.legend()
plt.grid(color='grey', linestyle=':')
plt.tight_layout()
plt.savefig('year_vs_shows.png')
#plt.show()

# Top 10 countries with highest no.of shows?
country_df = df['country'].value_counts().head(10)
plt.figure(figsize=(8,6))
plt.barh(country_df.index, country_df.values, color='skyblue', edgecolor='teal')
plt.xlabel('Number of Shows')
plt.ylabel('Country')
plt.title('Top 10 countries with highest no.of shows')
plt.tight_layout()
plt.savefig('Top10_Country.png')
plt.show()

# Compare multiple plots together (eg: movies vs tv shows by year)
content = df.groupby(['release_year','type']).size().unstack().fillna(0).astype(int)
fig, ax= plt.subplots(1, 2, figsize=(12,5))
#movies per year
ax[0].plot(content.index, content['Movie'], color='blue')
ax[0].set_title("Movies")
ax[0].set_xlabel('Year')
ax[0].set_ylabel('Number of Movies')
#shows per year
ax[1].plot(content.index, content['TV Show'], color='orange')
ax[1].set_title("TV Shows")
ax[1].set_xlabel('Year')
ax[1].set_ylabel('Number of Shows')
#entire plot customization
fig.suptitle("Movies V/S TV Shows By Year")
plt.tight_layout()
plt.savefig('Movies_VS_TVshows_Subplot.png')
# plt.show()

image_list = [
    'Movies_vs_tvshows.png',
    'Content_rating.png',
    'Content_release_year.png',
    'Movie_Duration_Histogram.png',
    'year_vs_shows.png',
    'Top10_Country.png',
    'Movies_VS_TVshows_Subplot.png'
]

images = [Image.open(img).convert('RGB') for img in image_list]

if images:
    images[0].save("Netflix_Data_Visualizations.pdf", save_all=True, append_images=images[1:])

# Create a folder to bundle everything
os.makedirs("netflix_outputs", exist_ok=True)

# Move all files into the folder
for file in image_list + ["Netflix_Data_Visualizations.pdf", "netflix_titles.csv"]:
    shutil.copy(file, "netflix_outputs")

# Zip the folder
shutil.make_archive("netflix_outputs", 'zip', "netflix_outputs")

# Download the zip to your laptop
files.download("netflix_outputs.zip")

FileNotFoundError: [Errno 2] No such file or directory: 'netflix_titles.csv'