# Nauman Asif | 2021510
### Cloud and Distributed Computing - CE408
### Assignment # 3

Exploratory Data Analysis (EDA) on Netflix TV Shows and Movies Dataset

### Initializing Spark

In [8]:
import findspark
findspark.init()  # Initialize findspark to locate Spark installation

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan, avg

# Create a Spark session
spark = SparkSession.builder.appName("Netflix EDA").getOrCreate()

### Loading the Dataset

In [9]:
file_path = "titles.csv"  # Dataset's path
netflix_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the schema
netflix_df.printSchema()

# Show a sample of the data
netflix_df.show(5)

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- seasons: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- imdb_score: string (nullable = true)
 |-- imdb_votes: string (nullable = true)
 |-- tmdb_popularity: string (nullable = true)
 |-- tmdb_score: string (nullable = true)

+--------+--------------------+-----+--------------------+------------+-----------------+-------+--------------------+--------------------+-------+---------+----------+----------+---------------+----------+
|      id|               title| type|         description|release_year|age_certification|runtime|              genres|production_countries|seasons|  imdb_id|imdb_sc

### Counting the Missing Values

In [10]:
missing_values = netflix_df.select([
    count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in netflix_df.columns
])
missing_values.show()

+---+-----+----+-----------+------------+-----------------+-------+------+--------------------+-------+-------+----------+----------+---------------+----------+
| id|title|type|description|release_year|age_certification|runtime|genres|production_countries|seasons|imdb_id|imdb_score|imdb_votes|tmdb_popularity|tmdb_score|
+---+-----+----+-----------+------------+-----------------+-------+------+--------------------+-------+-------+----------+----------+---------------+----------+
|  0|    3|  18|         40|          70|             2594|    121|    97|                  96|   3684|    541|       575|       600|            223|       418|
+---+-----+----+-----------+------------+-----------------+-------+------+--------------------+-------+-------+----------+----------+---------------+----------+



### Summary Statistics

In [11]:
netflix_df.describe().show()

+-------+--------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|summary|                  id|             title|              type|         description|        release_year|   age_certification|             runtime|              genres|production_countries|             seasons|             imdb_id|       imdb_score|          imdb_votes|     tmdb_popularity|        tmdb_score|
+-------+--------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------------+
|  count|                5929|              5926|   

### Distribution Analysis

In [12]:
# Count unique values in categorical columns
for column in ["type", "imdb_score", "release_year"]:
    netflix_df.groupBy(column).count().orderBy(col("count").desc()).show(5)

+--------------------+-----+
|                type|count|
+--------------------+-----+
|               MOVIE| 3744|
|                SHOW| 2106|
|                NULL|   18|
|               TV-14|    5|
| every Saturday a...|    3|
+--------------------+-----+
only showing top 5 rows

+----------+-----+
|imdb_score|count|
+----------+-----+
|      NULL|  575|
|       7.1|  197|
|       6.5|  194|
|       7.4|  191|
|       6.7|  188|
+----------+-----+
only showing top 5 rows

+------------+-----+
|release_year|count|
+------------+-----+
|        2019|  807|
|        2020|  792|
|        2021|  761|
|        2018|  740|
|        2017|  541|
+------------+-----+
only showing top 5 rows



### Filtering Movies and TV Shows

In [13]:
movies_df = netflix_df.filter(col("type") == "MOVIE")
tv_shows_df = netflix_df.filter(col("type") == "SHOW")

print(f"Total Movies: {movies_df.count()}")
print(f"Total TV Shows: {tv_shows_df.count()}")

Total Movies: 3744
Total TV Shows: 2106


### Top 5 genres

In [14]:
print("\nTop 5 Movie Genres:")
movies_df.groupBy("genres").count().orderBy(col("count").desc()).show(5)

print("\nTop 5 TV Shows Genres:")
tv_shows_df.groupBy("genres").count().orderBy(col("count").desc()).show(5)


Top 5 Movie Genres:
+--------------------+-----+
|              genres|count|
+--------------------+-----+
|          ['comedy']|  356|
|   ['documentation']|  220|
|           ['drama']|  216|
|['comedy', 'docum...|   87|
| ['comedy', 'drama']|   82|
+--------------------+-----+
only showing top 5 rows


Top 5 TV Shows Genres:
+-------------------+-----+
|             genres|count|
+-------------------+-----+
|        ['reality']|  107|
|          ['drama']|   96|
|  ['documentation']|   96|
|         ['comedy']|   93|
|['comedy', 'drama']|   50|
+-------------------+-----+
only showing top 5 rows



### Stopping Spark Session

In [15]:
spark.stop()