In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("SaveRDDAsDataFrameCSV") \
    .getOrCreate()

# Load the CSV file as an RDD
filepath = "project/users-details-2023.csv"
anime_rdd = spark.sparkContext.textFile(filepath)

# Remove header
header = anime_rdd.first()
data_rdd = anime_rdd.filter(lambda line: line != header)

def fill_empty_fields(line):
        pattern = r',(?=(?:[^"]*"[^"]*")*[^"]*$)'
        fields = re.split(pattern, line.strip())
        # Clean up quotes and replace empty fields with "unknown"
        fields = [
                field.strip().replace('"', '') if field.strip() != "" else "unknown" 
                for field in fields
        ]
        return Row(*fields)

# Apply the function to each line and convert to Rows
filled_rdd = data_rdd.map(fill_empty_fields)


# Define the schema for the DataFrame based on your CSV header
schema = ["Mal ID", "Username", "Gender", "Birthday", "Location", "Joined", 
          "Days Watched", "Mean Score", "Watching", "Completed", 
          "On Hold", "Dropped", "Plan to Watch", "Total Entries", 
          "Rewatched", "Episodes Watched"]

# Create a DataFrame from the filled RDD
anime_df = spark.createDataFrame(filled_rdd, schema=schema)
anime_df.show(truncate= False)

# Define the output path (as a directory, no .csv extension)
output_path = "project/users-details-2023-filled"

# Save the DataFrame as a CSV file with overwrite mode
anime_df.write.mode("overwrite").csv(output_path, header=True)


spark.stop()




                                                                                

+------+---------------+-------+-------------------------+--------------------------------+-------------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|Mal ID|Username       |Gender |Birthday                 |Location                        |Joined                   |Days Watched|Mean Score|Watching|Completed|On Hold|Dropped|Plan to Watch|Total Entries|Rewatched|Episodes Watched|
+------+---------------+-------+-------------------------+--------------------------------+-------------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|1     |Xinil          |Male   |1985-03-04T00:00:00+00:00|California                      |2004-11-05T00:00:00+00:00|142.3       |7.37      |1.0     |233.0    |8.0    |93.0   |64.0         |399.0        |60.0     |8458.0          |
|3     |Aokaado        |Male   |unknown                  |Oslo, Norway  

                                                                                

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SaveRDDAsDataFrameCSV").getOrCreate()

filepath = "project/users-details-2023-filled"

df = spark.read.csv(filepath, header = True)

df.show(truncate=False)

spark.stop()

                                                                                

+------+--------------+-------+-------------------------+-----------------------+-------------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|Mal ID|Username      |Gender |Birthday                 |Location               |Joined                   |Days Watched|Mean Score|Watching|Completed|On Hold|Dropped|Plan to Watch|Total Entries|Rewatched|Episodes Watched|
+------+--------------+-------+-------------------------+-----------------------+-------------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|411534|RazielD       |Male   |unknown                  |unknown                |2010-11-23T00:00:00+00:00|19.0        |7.5       |2.0     |12.0     |0.0    |0.0    |0.0          |14.0         |10.0     |1117.0          |
|411535|FVAnime       |unknown|unknown                  |unknown                |2010-11-23T00:00:00+00:00|10.7 

In [5]:
if SparkContext._active_spark_context:
    SparkContext._active_spark_context.stop()

NameError: name 'SparkContext' is not defined