In [12]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, avg, max, min, split, length, collect_list

In [13]:
# Initialize Spark session
def initialize_spark():
    return SparkSession.builder \
        .appName("Netflix Data Insights") \
        .getOrCreate()

In [14]:
# Load dataset function
def load_data(session, file_path):
    return session.read.csv(file_path, header=True, inferSchema=True)

In [15]:
# Perform basic exploration
def explore_data(dataframe):
    print("Dataset Overview:\n")
    dataframe.printSchema()
    print(f"Total Records: {dataframe.count()}\n")
    dataframe.show(5, truncate=False)

In [16]:
# Display data summary
def data_summary(dataframe):
    print("Data Summary:\n")
    dataframe.describe().show()

In [17]:
# Count records by category
def count_by_category(dataframe, column_name):
    print(f"Record Count by {column_name}:\n")
    dataframe.groupBy(column_name).count().orderBy(desc("count")).show()

In [19]:
# Main function
if __name__ == "__main__":
    spark = initialize_spark()
    data_file = "netflix_titles.csv"
    netflix_df = load_data(spark, data_file)
    
    # Perform initial data exploration
    explore_data(netflix_df)
    
    # Show summary statistics
    data_summary(netflix_df)
    
    # Count shows by type
    count_by_category(netflix_df, "type")

Dataset Overview:

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

Total Records: 8809

+-------+-------+---------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------+------------+------+---------+---------------------------------------------------------