In [7]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("Netflix Data Analysis").getOrCreate()


In [8]:
# Load Dataset
file_location = 'netflix_titles.csv'
netflix_data = spark.read.csv(file_location, header=True, inferSchema=True)


In [9]:
# Dataset Overview
print("1. Dataset Overview:")
netflix_data.printSchema()
print(f"\nTotal Records: {netflix_data.count()}")


1. Dataset Overview:
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)


Total Records: 8809


In [10]:
from pyspark.sql.functions import col, count, desc

# Most Prolific Directors
print("\n2. Most Prolific Directors:")
prolific_directors = netflix_data.groupBy("director") \
    .agg(count("*").alias("total_titles")) \
    .orderBy(desc("total_titles")) \
    .limit(10)
prolific_directors.show()



2. Most Prolific Directors:
+--------------------+------------+
|            director|total_titles|
+--------------------+------------+
|                NULL|        2636|
|       Rajiv Chilaka|          19|
|Raúl Campos, Jan ...|          18|
|        Marcus Raboy|          16|
|         Suhas Kadav|          16|
|           Jay Karas|          14|
| Cathy Garcia-Molina|          13|
|     Youssef Chahine|          12|
|     Martin Scorsese|          12|
|         Jay Chapman|          12|
+--------------------+------------+



In [11]:
from pyspark.sql.functions import avg

# Average Release Year by Content Type
print("\n3. Average Release Year by Content Type:")
avg_year_by_type = netflix_data.groupBy("type") \
    .agg(avg("release_year").alias("average_release_year")) \
    .orderBy("type")
avg_year_by_type.show()



3. Average Release Year by Content Type:
+-------------+--------------------+
|         type|average_release_year|
+-------------+--------------------+
|         NULL|                NULL|
|        Movie|  2013.1326463853452|
|      TV Show|  2016.6071829405162|
|William Wyler|                NULL|
+-------------+--------------------+



In [12]:
from pyspark.sql.functions import split, max, min

# Content Duration Analysis
print("\n4. Content Duration Analysis:")
duration_analysis = netflix_data \
    .withColumn("duration_value", split(col("duration"), " ")[0].cast("int")) \
    .groupBy("type") \
    .agg(avg("duration_value").alias("average_duration"), 
         max("duration_value").alias("maximum_duration"), 
         min("duration_value").alias("minimum_duration"))
duration_analysis.show()



4. Content Duration Analysis:
+-------------+------------------+----------------+----------------+
|         type|  average_duration|maximum_duration|minimum_duration|
+-------------+------------------+----------------+----------------+
|         NULL|              NULL|            NULL|            NULL|
|      TV Show|1.7654320987654322|              17|               1|
|        Movie| 99.88907068062828|            1994|               3|
|William Wyler|              NULL|            NULL|            NULL|
+-------------+------------------+----------------+----------------+



In [13]:
# Countries with Diverse Genres
print("\n5. Countries with Diverse Genres:")
diverse_genres = netflix_data.groupBy("country") \
    .agg(count("listed_in").alias("genres_count")) \
    .orderBy(desc("genres_count")) \
    .limit(10)
diverse_genres.show()



5. Countries with Diverse Genres:
+--------------+------------+
|       country|genres_count|
+--------------+------------+
| United States|        2805|
|         India|         972|
|          NULL|         830|
|United Kingdom|         419|
|         Japan|         245|
|   South Korea|         199|
|        Canada|         181|
|         Spain|         145|
|        France|         123|
|        Mexico|         110|
+--------------+------------+



In [14]:
from pyspark.sql.functions import length

# Longest Titles in Terms of Word Count
print("\n6. Longest Titles in Terms of Word Count:")
longest_titles = netflix_data \
    .withColumn("title_length", length(col("title"))) \
    .orderBy(desc("title_length")) \
    .select("title", "title_length") \
    .limit(10)
longest_titles.show()



6. Longest Titles in Terms of Word Count:
+--------------------+------------+
|               title|title_length|
+--------------------+------------+
|Jim & Andy: The G...|         104|
|Ken Burns Present...|          93|
|Mike Birbiglia: W...|          88|
|The Power of Gray...|          88|
|Steve Martin and ...|          83|
|Cultivating the S...|          79|
|Power Rangers Sam...|          78|
|Willy and the Gua...|          77|
|Ya no estoy aquí:...|          76|
|The Road to El Ca...|          75|
+--------------------+------------+



In [15]:
# Content Rating Distribution
print("\n7. Content Rating Distribution:")
ratings_distribution = netflix_data.groupBy("rating") \
    .agg(count("*").alias("count")) \
    .orderBy(desc("count"))
ratings_distribution.show()



7. Content Rating Distribution:
+-----------------+-----+
|           rating|count|
+-----------------+-----+
|            TV-MA| 3195|
|            TV-14| 2158|
|            TV-PG|  862|
|                R|  796|
|            PG-13|  489|
|            TV-Y7|  334|
|             TV-Y|  307|
|               PG|  286|
|             TV-G|  220|
|               NR|   80|
|                G|   41|
|             NULL|    6|
|         TV-Y7-FV|    6|
|               UR|    3|
|            NC-17|    3|
|             2021|    2|
| November 1, 2020|    1|
| Shavidee Trotter|    1|
|    Adriane Lenox|    1|
|    Maury Chaykin|    1|
+-----------------+-----+
only showing top 20 rows



In [16]:
# Stop Spark Session
spark.stop()
