In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import re

# Create a Spark session
spark = SparkSession.builder \
    .appName("Anime Dataset Cleaning") \
    .getOrCreate()

# Define the file path
filepath = "project/anime-dataset-2023.csv"

# Read the CSV file as an RDD of strings
rdd = spark.sparkContext.textFile(filepath)

header = rdd.first()  # Get the header to understand column names
data_rdd = rdd.filter(lambda line: line != header)  # Filter out the header row

def clean_and_parse(lines):
    cleaned_lines = []
    current_line = ""

    for line in lines:
        # Check if the line starts with a digit (indicating a new record)
        if re.match(r'^\d+,.+', line):
            # If there's a current line, save it to the list before starting a new record
            if current_line:
                cleaned_lines.append(current_line)
            current_line = line  # Start a new record
        else:
            # This line is a continuation of the previous record
            current_line += " " + line.strip()  # Merge with a space

    #Add the last processed line
    if current_line:
        cleaned_lines.append(current_line)

    return cleaned_lines

# Clean and parse the RDD lines
cleaned_rdd = data_rdd.mapPartitions(clean_and_parse)

# Function to parse each line
def parse_line(line):
    # Split the line by commas, handling quotes
    fields = re.split(r',(?=(?:[^"]|"[^"]*")*[^"]*$)', line.strip())
    
    # Handle cases with missing fields by padding with None
    while len(fields) < 24:
        fields.append(None)
    
    return Row(*fields)

# Parse each cleaned line into a Row object
parsed_rdd = cleaned_rdd.map(parse_line)

# Create DataFrame from parsed RDD
columns = header.split(",")  # Split the header to get column names
anime_df = spark.createDataFrame(parsed_rdd, schema=columns)

# Show the schema of the DataFrame
#anime_df.printSchema()

# Display a sample of the DataFrame
anime_df.show(30, truncate=False)  # Show first 10 rows without truncation

# Define the output path in Hadoop
output_path = "project/anime_dataset_cleaned.csv"

# Save the DataFrame as a CSV to Hadoop
anime_df.write.option("quote", '"') \
    .option("escape", '"') \
    .csv(output_path, header=True, mode='overwrite')

# Stop the Spark session
spark.stop()


                                                                                

+--------+--------------------------------------------------------------------+----------------------------------------------+--------------------------------------------------+-----+-------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [14]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Read Anime Dataset from Hadoop") \
    .getOrCreate()

# Define the path to the CSV file in HDFS
filepath = "project/anime_dataset_cleaned.csv"
# filepath = "project/anime-filtered.csv"

# Read the CSV file into a DataFrame
anime_df = spark.read.csv(filepath, header=True, inferSchema=True)

# Show the schema of the DataFrame
#anime_df.printSchema()

# Display a sample of the DataFrame
anime_df.show(21, truncate=False)  # Show first 10 rows without truncation

# Stop the Spark session
spark.stop()


                                                                                

+--------+-------------------------------+-----------------------+-------------------------------------------+-----+------------+-------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------

In [15]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Anime Dataset Cleaning") \
    .getOrCreate()

# Define the file path
filepath = "project/anime_dataset_cleaned.csv"

# Read the CSV file as an RDD of strings
rdd = spark.sparkContext.textFile(filepath)

header = rdd.first()  # Get the header to understand column names
data_rdd = rdd.filter(lambda line: line != header)  # Filter out the header row

# Collect and print the first 10 lines of data_rdd
first_10_lines = data_rdd.take(21)
for line in first_10_lines:
    print(line)

# Stop the Spark session
spark.stop()

                                                                                

1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"""Action, Award Winning, Sci-Fi""","""Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.  Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.  While developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past. As a rival's maniacal plot continues to unravel, 

In [30]:
from pyspark.sql import SparkSession
from io import StringIO
import csv
import re

# Create a Spark session
spark = SparkSession.builder \
    .appName("Anime Dataset Trimming") \
    .getOrCreate()


# Define the file path
filepath = "project/anime_dataset_cleaned.csv"
# Read the CSV file as an RDD of strings
rdd = spark.sparkContext.textFile(filepath)



header = rdd.first()

def process_line(line):

    reader = csv.reader(StringIO(line))
    fields = next(reader)
    # Strip triple quotes from each field
    fields = [field.strip('"""') for field in fields]
    return fields
   
# Filter out the header and process the lines
processed_rdd = rdd.filter(lambda line: line != header) \
    .map(process_line) \
    .map(lambda row: (
        row[0],                  
        row[4],                  
        row[5].replace(',', ';'),                  
        row[7].replace(',', ';'),                  
        row[8],                  
        row[14].replace(',', ';'),                 
        row[18],                 
        row[19],                    
        row[20],                    
        row[21],                    
        row[22]                     
    ))

first_10_lines = processed_rdd.take(21)
for line in first_10_lines:
    print(line)

# Define schema for the selected columns

schema = ["anime_id", "score", "genre", "type", "episodes", "studios",  "rank", "popularity", "favorites", "scored_by", "members"]

# Create a DataFrame from the processed RDD
anime_df = spark.createDataFrame(processed_rdd, schema=schema)

# Show the resulting DataFrame
anime_df.show(truncate=False)

# Define the output path (as a directory, no .csv extension)
output_path = "project/anime_dataset_trimmed.csv"

# Save the DataFrame as a CSV file with overwrite mode
anime_df.write.csv(output_path, header=True, mode= 'overwrite')

spark.stop()


                                                                                

('1', '8.75', 'Action; Award Winning; Sci-Fi', 'TV', '26.0', 'Sunrise', '41.0', '43', '78525', '914193.0', '1771505')
('5', '8.38', 'Action; Sci-Fi', 'Movie', '1.0', 'Bones', '189.0', '602', '1448', '206248.0', '360978')
('6', '8.22', 'Action; Adventure; Sci-Fi', 'TV', '26.0', 'Madhouse', '328.0', '246', '15035', '356739.0', '727252')
('7', '7.25', 'Action; Drama; Mystery; Supernatural', 'TV', '26.0', 'Sunrise', '2764.0', '1795', '613', '42829.0', '111931')
('8', '6.94', 'Adventure; Fantasy; Supernatural', 'TV', '52.0', 'Toei Animation', '4240.0', '5126', '14', '6413.0', '15001')
('15', '7.92', 'Sports', 'TV', '145.0', 'Gallop', '688.0', '1252', '1997', '86524.0', '177688')
('16', '8.0', 'Comedy; Drama; Romance', 'TV', '24.0', 'J.C.Staff', '589.0', '862', '4136', '81747.0', '260166')
('17', '7.55', 'Comedy; Slice of Life; Sports', 'TV', '52.0', 'Nippon Animation', '1551.0', '4212', '237', '12960.0', '24172')
('18', '8.16', 'Action; Drama', 'TV', '24.0', 'A.C.G.T.', '393.0', '1273', '12

                                                                                

+--------+-----+----------------------------------------+-----+--------+----------------+------+----------+---------+---------+-------+
|anime_id|score|genre                                   |type |episodes|studios         |rank  |popularity|favorites|scored_by|members|
+--------+-----+----------------------------------------+-----+--------+----------------+------+----------+---------+---------+-------+
|1       |8.75 |Action; Award Winning; Sci-Fi           |TV   |26.0    |Sunrise         |41.0  |43        |78525    |914193.0 |1771505|
|5       |8.38 |Action; Sci-Fi                          |Movie|1.0     |Bones           |189.0 |602       |1448     |206248.0 |360978 |
|6       |8.22 |Action; Adventure; Sci-Fi               |TV   |26.0    |Madhouse        |328.0 |246       |15035    |356739.0 |727252 |
|7       |7.25 |Action; Drama; Mystery; Supernatural    |TV   |26.0    |Sunrise         |2764.0|1795      |613      |42829.0  |111931 |
|8       |6.94 |Adventure; Fantasy; Supernatural

                                                                                

In [7]:
spark = SparkSession.builder \
    .appName("Read Trimmed Anime Dataset") \
    .getOrCreate()

filepath = "project/anime_dataset_trimmed.csv"

anime_df = spark.read.csv(filepath, header=True, inferSchema=True)

# Show the schema of the DataFrame
#anime_df.printSchema()

# Display a sample of the DataFrame
anime_df.show(15, truncate=False)  # Show first 10 rows without truncation

# Stop the Spark session
spark.stop()


                                                                                

+--------+-------+----------------------+-------+--------+------------------------+-------+----------+---------+---------+-------+
|anime_id|score  |genre                 |type   |episodes|studios                 |rank   |popularity|favorites|scored_by|members|
+--------+-------+----------------------+-------+--------+------------------------+-------+----------+---------+---------+-------+
|30228   |UNKNOWN|Action; Sci-Fi        |Movie  |1.0     |UNKNOWN                 |15180.0|17728     |0        |UNKNOWN  |266    |
|30230   |8.3    |Sports                |TV     |51.0    |Production I.G, Madhouse|241.0  |1570      |1054     |73353.0  |132275 |
|30232   |UNKNOWN|Comedy; Fantasy       |TV     |26.0    |OLM                     |12788.0|14428     |0        |UNKNOWN  |573    |
|30234   |5.51   |UNKNOWN               |Music  |1.0     |UNKNOWN                 |UNKNOWN|11407     |0        |913.0    |1520   |
|30235   |UNKNOWN|Comedy                |OVA    |1.0     |UNKNOWN                 |

In [35]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("Read Trimmed Anime Dataset from Hadoop") \
    .getOrCreate()

# Define the path to the CSV file in HDFS
filepath = "project/anime_dataset_trimmed.csv"
# filepath = "project/anime-filtered.csv"
rdd = spark.sparkContext.textFile(filepath)

# Read the CSV file into a DataFrame
first_10_lines = rdd.take(10)
for line in first_10_lines:
    print(line)
# Stop the Spark session
spark.stop()

                                                                                

anime_id,score,genre,type,episodes,studios,rank,popularity,favorites,scored_by,members
1,8.75,Action; Award Winning; Sci-Fi,TV,26.0,Sunrise,41.0,43,78525,914193.0,1771505
5,8.38,Action; Sci-Fi,Movie,1.0,Bones,189.0,602,1448,206248.0,360978
6,8.22,Action; Adventure; Sci-Fi,TV,26.0,Madhouse,328.0,246,15035,356739.0,727252
7,7.25,Action; Drama; Mystery; Supernatural,TV,26.0,Sunrise,2764.0,1795,613,42829.0,111931
8,6.94,Adventure; Fantasy; Supernatural,TV,52.0,Toei Animation,4240.0,5126,14,6413.0,15001
15,7.92,Sports,TV,145.0,Gallop,688.0,1252,1997,86524.0,177688
16,8.0,Comedy; Drama; Romance,TV,24.0,J.C.Staff,589.0,862,4136,81747.0,260166
17,7.55,Comedy; Slice of Life; Sports,TV,52.0,Nippon Animation,1551.0,4212,237,12960.0,24172
18,8.16,Action; Drama,TV,24.0,A.C.G.T.,393.0,1273,1237,97878.0,173710


In [36]:
if 'spark' in locals() or 'spark' in globals():
    print("Stopping existing Spark session...")
    spark.stop()  # Stop the existing Spark session

Stopping existing Spark session...


In [10]:
from pyspark.sql import SparkSession
from io import StringIO
import csv
import re

# Create a Spark session
spark = SparkSession.builder \
    .appName("Anime Dataset Trimming") \
    .getOrCreate()


# Define the file path
filepath = "project/anime_dataset_cleaned.csv"
# Read the CSV file as an RDD of strings
rdd = spark.sparkContext.textFile(filepath)



header = rdd.first()

def process_line(line):
    try:
        reader = csv.reader(StringIO(line))
        fields = next(reader)

        # Check if the row has the expected number of columns
        if len(fields) != len(header.split(",")):
            # Attempt to fix misaligned rows by joining extra columns or filling missing ones
            fields = fields[:len(header.split(","))] + [''] * max(0, len(header.split(",")) - len(fields))

        # Strip triple quotes and clean fields
        fields = [field.strip('"""') for field in fields]

        return fields
    except Exception as e:
        print(f"Error processing line: {line} -> {e}")
        return None  # Skip problematic rows

   
# Filter out the header and process the lines
processed_rdd = rdd.filter(lambda line: line != header) \
    .map(process_line) \
    .filter(lambda row: row is not None and len(row) == len(header.split(","))) \
    .map(lambda row: (
        row[0],                  
        row[4],                  
        row[5].replace(',', ';'),                  
        row[7],                  
        row[8],                  
        row[14],                 
        row[18],                 
        row[19],                    
        row[20],                    
        row[21],                    
        row[22]                     
    ))

# first_10_lines = processed_rdd.take(10)
# for line in first_10_lines:
#     print(line)



# Define schema for the selected columns

schema = ["anime_id", "score", "genre", "type", "episodes", "studios",  "rank", "popularity", "favorites", "scored_by", "members"]

# Create a DataFrame from the processed RDD
anime_df = spark.createDataFrame(processed_rdd, schema=schema)

# Show the resulting DataFrame
anime_df.show(truncate=False)

# Define the output path (as a directory, no .csv extension)
output_path = "project/anime_dataset_trimmed.csv"

# # Save the DataFrame as a CSV file with overwrite mode
anime_df.write.csv(output_path, header=True, mode= 'overwrite')

spark.stop()


                                                                                

+--------+-----+----------------------------------------+-----+--------+----------------+------+----------+---------+---------+-------+
|anime_id|score|genre                                   |type |episodes|studios         |rank  |popularity|favorites|scored_by|members|
+--------+-----+----------------------------------------+-----+--------+----------------+------+----------+---------+---------+-------+
|1       |8.75 |Action; Award Winning; Sci-Fi           |TV   |26.0    |Sunrise         |41.0  |43        |78525    |914193.0 |1771505|
|5       |8.38 |Action; Sci-Fi                          |Movie|1.0     |Bones           |189.0 |602       |1448     |206248.0 |360978 |
|6       |8.22 |Action; Adventure; Sci-Fi               |TV   |26.0    |Madhouse        |328.0 |246       |15035    |356739.0 |727252 |
|7       |7.25 |Action; Drama; Mystery; Supernatural    |TV   |26.0    |Sunrise         |2764.0|1795      |613      |42829.0  |111931 |
|8       |6.94 |Adventure; Fantasy; Supernatural

                                                                                