In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, udf, struct
from pyspark.sql.types import BooleanType, IntegerType, StringType

# Initialize Spark Session
spark = SparkSession.builder.appName("YouTubeDataCleaning").getOrCreate()

# Directory containing the CSV files
data_dir = "./data"

# Get a list of all CSV files in the directory
csv_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]

# Function to check if a string is mostly English
def is_mostly_english(text):
    if not text:  # Handle empty strings
        return False
    try:
        text.encode('utf-8').decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

# Register the UDF
is_mostly_english_udf = udf(is_mostly_english, BooleanType())

# Function to check for rows with mostly empty fields
def is_mostly_empty(row):
    non_empty_count = sum(1 for value in row if value)
    return non_empty_count < 3  # Adjust threshold as needed

# Initialize an empty DataFrame for `all_data`
all_data = None

# Loop through each CSV file
for csv_file in csv_files:
    file_path = os.path.join(data_dir, csv_file)

    # Read the CSV file into a DataFrame
    try:
        df = spark.read.csv(file_path, header=True, inferSchema=True)
    except Exception as e:
        print(f"Error reading file {csv_file}: {e}")
        continue  # Skip to the next file if there's an error

    # Remove non-English rows (Apply the UDF to relevant columns)
    if "tags" in df.columns:
        df = df.filter(is_mostly_english_udf(col("tags")))

    # Remove mostly empty rows
    row_values = [df[field] for field in df.columns]
    df = df.filter(~udf(is_mostly_empty, BooleanType())(struct(*row_values)))

    # Explode the tags column
    if "tags" in df.columns:
        df = df.withColumn("tag", explode(split(col("tags"), r"\\|")))

    # Drop unnecessary columns
    columns_to_drop = ['video_id', 'thumbnail_link', 'description', 'title', 'channel_title', 'tags']
    df = df.drop(*[col_name for col_name in columns_to_drop if col_name in df.columns])

    # Ensure consistent column data types
    df = df.withColumn("comment_count", col("comment_count").cast(IntegerType()))
    df = df.withColumn("comments_disabled", col("comments_disabled").cast(BooleanType()))
    df = df.withColumn("ratings_disabled", col("ratings_disabled").cast(BooleanType()))
    df = df.withColumn("video_error_or_removed", col("video_error_or_removed").cast(BooleanType()))

    # Merge into `all_data`
    if all_data is None:
        all_data = df
    else:
        all_data = all_data.unionByName(df, allowMissingColumns=True)  # Ensure matching column names

# If no data was processed, print a message and exit
if all_data is None:
    print("No CSV files were processed.")
else:
    all_data.show(truncate=False)
    # Save the final DataFrame to a single CSV file
    all_data.coalesce(1).write.mode("overwrite").option("header", "true").csv("cleaned_youtube_data.csv")
    print("Data saved to cleaned_youtube_data.csv")

# Stop the Spark session
spark.stop()


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
SLF4J: Failed to load class "org.slf4j.impl.StaticMDCBinder".
SLF4J: Defaulting to no-operation MDCAdapter implementation.
SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.
                                                                                

+-------------+-----------+------------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+---+
|trending_date|category_id|publish_time            |views  |likes |dislikes|comment_count|comments_disabled|ratings_disabled|video_error_or_removed|tag|
+-------------+-----------+------------------------+-------+------+--------+-------------+-----------------+----------------+----------------------+---+
|18.07.02     |1          |2018-02-06T04:01:56.000Z|90929  |442   |88      |174          |false            |false           |false                 |[  |
|18.07.02     |1          |2018-02-06T04:01:56.000Z|90929  |442   |88      |174          |false            |false           |false                 |n  |
|18.07.02     |1          |2018-02-06T04:01:56.000Z|90929  |442   |88      |174          |false            |false           |false                 |o  |
|18.07.02     |1          |2018-02-06T04:01:56.000Z|90929  |442   |88      |174   

                                                                                

Data saved to cleaned_youtube_data.csv
