In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import json
from pyspark.sql.functions import explode, split, col
import os


In [2]:
# Build a (category_id, word) table from the YouTube category JSON files in
# ./data: keep assignable categories only, split each title into words, and
# write the combined result out as a single CSV part file.

# Initialize Spark Session
spark = SparkSession.builder.appName("YouTubeDataCleaning").getOrCreate()

data_dir = "./data"

# Sorted so files are processed (and rows unioned) in a reproducible order;
# os.listdir returns directory entries in arbitrary order.
json_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".json"))

dfs = []

# Loop through each JSON file
for json_file in json_files:
    file_path = os.path.join(data_dir, json_file)

    # Load JSON Data. The encoding is explicit so non-ASCII titles are decoded
    # the same way regardless of the platform's default locale encoding.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error decoding JSON in file {json_file}: {e}")
        continue  # Skip to the next file on error
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        continue
    except OSError as e:
        # e.g. permission denied, or a directory whose name ends in ".json"
        print(f"Could not read file {file_path}: {e}")
        continue

    # Extract only the 'items' array; tolerate files whose top level is not
    # a JSON object or whose 'items' entry is missing / null / not a list.
    items = data.get("items") if isinstance(data, dict) else None
    if not isinstance(items, list) or not items:
        # createDataFrame cannot infer a schema from an empty list, so skip
        # the file instead of letting the whole run fail.
        print(f"No items found in file {json_file}; skipping.")
        continue

    # Convert to DataFrame (schema is inferred from the item dicts)
    df = spark.createDataFrame(items)

    # Keep assignable categories, then project the relevant fields.
    # Filtering first keeps the 'snippet' column in scope for the predicate.
    # (Column comparisons must use `==`, not `is`.)
    df_cleaned = df.where(col("snippet.assignable") == True).select(
        col("id").alias("category_id"),
        col("snippet.title").alias("category_title"),
    )

    # Split category_title into individual words: one output row per word
    df_exploded = df_cleaned.withColumn(
        "category_title", explode(split(col("category_title"), " "))
    )

    # Remove the '&' connector and any empty tokens produced by repeated spaces
    df_exploded = df_exploded.filter(
        (col("category_title") != "&") & (col("category_title") != "")
    )

    # Append the processed DataFrame to the list
    dfs.append(df_exploded)

# Union all DataFrames
if dfs:
    final_df = dfs[0]
    for part_df in dfs[1:]:
        final_df = final_df.union(part_df)

    # Show the final DataFrame (optional)
    final_df.show()

    # NOTE(review): the output is CSV (Spark writes a directory of part files
    # at this path) even though the name ends in ".json". The path is left
    # unchanged in case something downstream reads it; consider renaming.
    final_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("./youtube_categories.json")
    print("Data saved to ./youtube_categories.json")
else:
    print("No JSON files processed.")

spark.stop()


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
SLF4J: Failed to load class "org.slf4j.impl.StaticMDCBinder".
SLF4J: Defaulting to no-operation MDCAdapter implementation.
SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.
                                                                                

+-----------+--------------+
|category_id|category_title|
+-----------+--------------+
|          1|          Film|
|          1|     Animation|
|          2|         Autos|
|          2|      Vehicles|
|         10|         Music|
|         15|          Pets|
|         15|       Animals|
|         17|        Sports|
|         19|        Travel|
|         19|        Events|
|         20|        Gaming|
|         22|        People|
|         22|         Blogs|
|         23|        Comedy|
|         24| Entertainment|
|         25|          News|
|         25|      Politics|
|         26|         Howto|
|         26|         Style|
|         27|     Education|
+-----------+--------------+
only showing top 20 rows



                                                                                

Data saved to ./youtube_categories.json
