In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, udf, struct
from pyspark.sql.types import BooleanType

# Initialize Spark Session
spark = SparkSession.builder.appName("YouTubeDataCleaning").getOrCreate()

# Directory containing the CSV files
data_dir = "./data"

# Get a list of all CSV files in the directory.
# os.listdir() returns names in arbitrary order, so sort them to make the
# processing order (and therefore the row order of the combined output)
# reproducible from run to run.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))

# Function to check if a string is mostly English
def is_mostly_english(text):
    """Return True when *text* is a non-empty, pure-ASCII string.

    Despite the name this is an all-or-nothing test: a single non-ASCII
    character is enough to make the result False. ASCII-only content is
    used as a cheap stand-in for "English".

    Args:
        text: The string to test. ``None`` and ``""`` are treated as not
            English.

    Returns:
        bool: True if every character of *text* is ASCII, False otherwise.
    """
    if not text:  # Handle None and empty strings
        return False
    try:
        # Encoding straight to ASCII fails on the first non-ASCII character.
        text.encode('ascii')
    except UnicodeError:
        # UnicodeError also covers lone surrogates, which would otherwise
        # escape as an unhandled UnicodeEncodeError and abort the UDF.
        return False
    return True

# Wrap the ASCII check as a Spark UDF that yields a boolean column
is_mostly_english_udf = udf(f=is_mostly_english, returnType=BooleanType())

# Function to check for rows with mostly empty fields
def is_mostly_empty(row, min_non_empty=3):
    """Return True when *row* holds fewer than *min_non_empty* real values.

    A field counts as empty only when it is ``None`` or the empty string.
    Legitimate falsy values such as ``0``, ``0.0`` and ``False`` are data
    (e.g. a video with zero likes) and count as non-empty.

    Args:
        row: An iterable of field values (for example a Spark ``Row``).
        min_non_empty: Minimum number of non-empty fields a row needs in
            order to be kept. Defaults to 3.

    Returns:
        bool: True if the row has fewer than *min_non_empty* non-empty
        fields, False otherwise.
    """
    non_empty_count = sum(
        1 for value in row if value is not None and value != ""
    )
    return non_empty_count < min_non_empty

# Build the per-row emptiness UDF once instead of once per file
is_mostly_empty_udf = udf(is_mostly_empty, BooleanType())

# Columns whose content must be ASCII-only for a row to be kept
columns_to_check = ['tags']  # Only check 'tags' for English

# Columns that are not wanted in the cleaned output
columns_to_drop = ['video_id', 'thumbnail_link', 'description', 'title', 'channel_title', 'tags']

# Accumulates the cleaned rows of every file. Starting from None (rather than
# probing locals()) keeps a re-run of this cell from picking up a stale
# DataFrame left behind by an earlier run.
all_data = None

try:
    # Loop through each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(data_dir, csv_file)

        # Read the CSV file into a DataFrame
        try:
            df = spark.read.csv(file_path, header=True, inferSchema=True)
        except Exception as e:
            print(f"Error reading file {csv_file}: {e}")
            continue  # Skip to the next file if there's an error

        # The explode step below needs 'tags'; skip files that lack it
        # instead of failing the whole run on an unresolved column.
        if "tags" not in df.columns:
            print(f"Skipping file {csv_file}: no 'tags' column")
            continue

        # Remove non-English rows (apply the UDF to the relevant columns)
        for column in columns_to_check:
            if column in df.columns:
                df = df.filter(is_mostly_english_udf(col(column)))

        # Remove mostly empty rows: pack every column into one struct so the
        # UDF receives the whole row as a single argument.
        row_values = [df[field] for field in df.columns]
        df = df.filter(~is_mostly_empty_udf(struct(*row_values)))

        # Explode the tags column into one row per tag. The pattern is a raw
        # string so the backslash reaches the regex engine unchanged and "|"
        # is matched literally rather than as alternation.
        df = df.withColumn("tag", explode(split(col("tags"), r"\|")))

        # Drop the unwanted columns that are actually present
        present_to_drop = [col_name for col_name in columns_to_drop if col_name in df.columns]
        df = df.drop(*present_to_drop)

        # Append to the running result with the DataFrame API. SQL can only
        # see registered views, not Python variables, so a query naming
        # `all_data` cannot resolve it. union() is positional, like UNION ALL.
        all_data = df if all_data is None else all_data.union(df)

    if all_data is not None:
        all_data.show()
        # Save the final DataFrame. coalesce(1) yields a single part file;
        # Spark writes it inside a directory named cleaned_youtube_data.csv.
        all_data.coalesce(1).write.mode("overwrite").option("header", "true").csv(
            "cleaned_youtube_data.csv")
        print("Data saved to cleaned_youtube_data.csv")
    else:
        print("No CSV files were processed.")
finally:
    # Stop the Spark session even if processing failed part-way through
    spark.stop()


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
SLF4J: Failed to load class "org.slf4j.impl.StaticMDCBinder".
SLF4J: Defaulting to no-operation MDCAdapter implementation.
SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.
                                                                                

+-------------+-----------+--------------------+--------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+
|trending_date|category_id|        publish_time|   views| likes|dislikes|comment_count|comments_disabled|ratings_disabled|video_error_or_removed|                 tag|
+-------------+-----------+--------------------+--------+------+--------+-------------+-----------------+----------------+----------------------+--------------------+
|     17.14.11|         10|2017-11-10T17:00:...|17158579|787425|   43420|       125882|            False|           False|                 False|            "Eminem"|
|     17.14.11|         10|2017-11-10T17:00:...|17158579|787425|   43420|       125882|            False|           False|                 False|              "Walk"|
|     17.14.11|         10|2017-11-10T17:00:...|17158579|787425|   43420|       125882|            False|           False|                 False|                "On"

                                                                                

Data saved to cleaned_youtube_data.csv
