# Importing Libraries

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, year, explode, split

# Load Data of Netflix Content

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("Netflix EDA").getOrCreate()

# Read the CSV exactly once. The header row supplies the column names and
# inferSchema asks Spark to pick column types instead of leaving every column
# as a string.
netflix_data = spark.read.csv("netflix_data.csv", header=True, inferSchema=True)

# Preview the first 5 rows.
netflix_data.show(5)
print("\n")

# Print the schema and the total number of rows.
netflix_data.printSchema()
print("\nRow Count:", netflix_data.count())

# Pandas copy used by the matplotlib cells. It is derived from the same
# DataFrame as the Spark queries so both views come from a single parse of
# the file rather than from two reads with different options.
netflix_pandas = netflix_data.toPandas()

# Distribution of Content Type

In [None]:
# Tally how many titles fall under each content type (Movie, TV Show, etc.)
netflix_data.groupBy("type").count().show()

In [None]:
# Pie chart of each content type's share of the titles.
colors = ['#ff7744', '#4477ff']
content_counts = netflix_pandas['type'].value_counts()
content_counts.plot.pie(
    autopct='%1.1f%%',
    figsize=(8, 8),
    colors=colors,
    startangle=90,
)

plt.title('Distribution of Content Type')
plt.show()

In [None]:
# The same content-type distribution, drawn as a bar chart.
netflix_pandas['type'].value_counts().plot.bar(title='Distribution of Content Type')
plt.show()

# Distribution of Genres

In [None]:
# Split the comma-separated "listed_in" column into one row per genre
# (Dramas, Comedies, Musicals, etc.), then rank the genres by row count.
genres_data = netflix_data.withColumn("genre", explode(split(col("listed_in"), ", ")))
(
    genres_data
    .groupBy("genre")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [None]:
# Create a bar chart for the distribution of genres.
# Each "listed_in" value is a comma-separated list of genres. Split it into
# one entry per genre and strip the whitespace left after each comma, so that
# "Dramas" and " Dramas" are tallied as the same genre. explode() flattens the
# per-title lists in one pass (summing lists re-copies them at every step and
# yields 0 instead of a list when the column is empty).
content_genres = (
    netflix_pandas['listed_in']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .dropna()
)
# Discard empty fragments, e.g. the one produced by a trailing comma.
content_genres = content_genres[content_genres != ''].tolist()
genre_frequencies = Counter(content_genres)

# Extract top 10 most frequent genres.
top_genres = pd.DataFrame(genre_frequencies.most_common(10), columns=['Genre', 'Frequency'])
top_genres.set_index('Genre').plot(kind='bar', figsize=(12, 6), color='green', edgecolor='black')

plt.title('Top Content Genres on Netflix', fontsize=16)
plt.xlabel('Genre of Content', fontsize=12)
plt.ylabel('Amount of Titles', fontsize=12)
plt.show()

# Other Exploratory Analysis

In [None]:
# Split the comma-separated "cast" column into one row per actor, then show
# the 10 actors with the most rows.
actors_data = netflix_data.withColumn("actor", explode(split(col("cast"), ", ")))
(
    actors_data
    .groupBy("actor")
    .count()
    .orderBy(col("count").desc())
    .show(10)
)

# Counts the number of occurrences of each director and shows the top 10.
# Rows without a director are filtered out first; otherwise NULL is grouped
# as if it were a director and can take a slot in the ranking. A "director"
# value is counted exactly as written (it is not split on commas).
(
    netflix_data
    .filter(col("director").isNotNull())
    .groupBy("director")
    .agg(count("*").alias("count"))
    .orderBy("count", ascending=False)
    .show(10)
)

In [None]:
# For every column, count how many rows hold NULL.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin. isNull() is cast to 0/1 so that summing
# it yields the number of NULL rows.
null_counts = netflix_data.select(*[
    sum(col(name).isNull().cast("int")).alias(name)
    for name in netflix_data.columns
])

null_counts.show()

# Other Graphs

In [None]:
# Create a bar chart for the distribution of release years.
# Values that cannot be parsed as numbers become NaN (errors='coerce').
netflix_pandas['release_year'] = pd.to_numeric(netflix_pandas['release_year'], errors='coerce')

# A NaN makes the column float, which would label the bars "2019.0" etc.
# Drop the NaN rows (value_counts skips them anyway) and cast to int so the
# x-axis shows whole years. Only this plotting copy is converted; the stored
# column is left as produced by to_numeric.
release_years = netflix_pandas['release_year'].dropna().astype(int)
release_years.value_counts().sort_index().plot(kind='bar', figsize=(15, 5), title='Distribution of Release Year')
plt.show()

In [None]:
# Bar chart of how many titles carry each content rating.
rating_counts = netflix_pandas['rating'].value_counts()
rating_counts.plot.bar(
    figsize=(10, 6),
    color='green',
    edgecolor='black',
)

# Label the chart, then render it.
plt.ylabel('Amount of Titles', fontsize=12)
plt.xlabel('Rating', fontsize=12)
plt.title('Distribution of Content Ratings', fontsize=16)
plt.show()
