In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Create a Spark session
spark = SparkSession.builder \
    .appName("Userscore Dataset Cleaning") \
    .getOrCreate()

# Define the file path
filepath = "project/users-score-2023.csv"

# Read the CSV file as an RDD of strings
rdd = spark.sparkContext.textFile(filepath)

header = rdd.first()


# Filter out the header and map the columns
selected_columns_rdd = rdd.filter(lambda line: line != header) \
    .map(lambda line: line.split(',')) \
    .map(lambda row: (row[0], row[2], row[4]))

schema = ["user_id","anime_id","rating"]

# Create a DataFrame from the filled RDD
users_df = spark.createDataFrame(selected_columns_rdd, schema=schema)
users_df.show(truncate= False)



                                                                                

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|1      |21      |9     |
|1      |48      |7     |
|1      |320     |5     |
|1      |49      |8     |
|1      |304     |8     |
|1      |306     |8     |
|1      |53      |7     |
|1      |47      |5     |
|1      |591     |6     |
|1      |54      |7     |
|1      |55      |5     |
|1      |56      |6     |
|1      |57      |9     |
|1      |368     |5     |
|1      |68      |7     |
|1      |889     |9     |
|1      |1519    |7     |
|1      |58      |8     |
|1      |1222    |7     |
|1      |458     |4     |
+-------+--------+------+
only showing top 20 rows



In [None]:
output_path = "project/users-score-2023-trimmed"

# Save the DataFrame as a CSV file with overwrite mode
users_df.write.mode("overwrite").csv(output_path, header=True)

In [91]:

spark = SparkSession.builder \
    .appName("Userscore Dataset Cleaning") \
    .getOrCreate()

filepath = "project/anime_dataset_trimmed.csv"
rdd = spark.sparkContext.textFile(filepath)

for line in rdd.take(30):
    print(line)

anime_id,score,genre,type,episodes,studios,rank,popularity,favorites,scored_by,members
1,8.75,Action; Award Winning; Sci-Fi,TV,26.0,Sunrise,41.0,43,78525,914193.0,1771505
5,8.38,Action; Sci-Fi,Movie,1.0,Bones,189.0,602,1448,206248.0,360978
6,8.22,Action; Adventure; Sci-Fi,TV,26.0,Madhouse,328.0,246,15035,356739.0,727252
7,7.25,Action; Drama; Mystery; Supernatural,TV,26.0,Sunrise,2764.0,1795,613,42829.0,111931
8,6.94,Adventure; Fantasy; Supernatural,TV,52.0,Toei Animation,4240.0,5126,14,6413.0,15001
15,7.92,Sports,TV,145.0,Gallop,688.0,1252,1997,86524.0,177688
16,8.0,Comedy; Drama; Romance,TV,24.0,J.C.Staff,589.0,862,4136,81747.0,260166
17,7.55,Comedy; Slice of Life; Sports,TV,52.0,Nippon Animation,1551.0,4212,237,12960.0,24172
18,8.16,Action; Drama,TV,24.0,A.C.G.T.,393.0,1273,1237,97878.0,173710
19,8.87,Drama; Mystery; Suspense,TV,74.0,Madhouse,26.0,142,47235,368569.0,1013100
20,7.99,Action; Adventure; Fantasy,TV,220.0,Pierrot,599.0,8,76343,1883772.0,2717330
21,8.69,Action; Adventure; 

                                                                                

In [31]:
%reset

In [146]:
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import Vectors

spark = SparkSession.builder \
    .appName("MLlib with RDDs") \
    .getOrCreate()

# Load CSV file as RDD
filepath = "project/anime_dataset_trimmed.csv"
rdd = spark.sparkContext.textFile(filepath)


# Extract the header
header = rdd.first()

# remove unknown and empty space ""
filtered_rdd = rdd.filter(lambda line: 'UNKNOWN' not in line and '""' not in line) 

data_rdd = filtered_rdd.filter(lambda line: line != header)

def parse_line(line):
    fields = line.split(',')
    
    label = float(fields[1])  # score
    features = [
        #fields[4],   # episodes
        float(fields[6]),   # rank
        float(fields[7]),   # popularity
        float(fields[8]),   # favorites
        float(fields[9]),  # scored_by
        float(fields[10])   # members
    ]
    #return (label, features)
    return LabeledPoint(label, Vectors.dense(features))

labeled_rdd = data_rdd.map(parse_line)

for line in labeled_rdd.take(10):
    print(line)

(8.75,[41.0,43.0,78525.0,914193.0,1771505.0])
(8.38,[189.0,602.0,1448.0,206248.0,360978.0])
(8.22,[328.0,246.0,15035.0,356739.0,727252.0])
(7.25,[2764.0,1795.0,613.0,42829.0,111931.0])
(6.94,[4240.0,5126.0,14.0,6413.0,15001.0])
(7.92,[688.0,1252.0,1997.0,86524.0,177688.0])
(8.0,[589.0,862.0,4136.0,81747.0,260166.0])
(7.55,[1551.0,4212.0,237.0,12960.0,24172.0])
(8.16,[393.0,1273.0,1237.0,97878.0,173710.0])
(8.87,[26.0,142.0,47235.0,368569.0,1013100.0])


In [None]:
# Split data into training and testing sets
train_rdd, test_rdd = labeled_rdd.randomSplit([0.8, 0.2], seed=42)

# Train the Linear Regression model using Stochastic Gradient Descent (SGD)
model = LinearRegressionWithSGD.train(train_rdd, iterations=10, step=0.01)

In [152]:
predictions_and_labels = test_rdd.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Evaluate the model using regression metrics
metrics = RegressionMetrics(predictions_and_labels)

# Print evaluation metrics
print(f"Root Mean Squared Error (RMSE): {metrics.rootMeanSquaredError}")
print(f"Mean Absolute Error (MAE): {metrics.meanAbsoluteError}")
print(f"R-squared: {metrics.r2}")


for prediction, label in predictions_and_labels.take(10):
    print(f"Predicted: {prediction}, Actual: {label}")

Root Mean Squared Error (RMSE): 2.825127872611107e+76
Mean Absolute Error (MAE): 9.840332169946151e+75
R-squared: -1.0825370967508432e+153
Predicted: -1.009508824372925e+77, Actual: 8.87
Predicted: -1.8588729614212607e+76, Actual: 7.86
Predicted: -2.2278001046772726e+76, Actual: 7.76
Predicted: -1.9068245315761994e+77, Actual: 8.35
Predicted: -1.0056574854483762e+77, Actual: 8.55
Predicted: -2.8526894436305605e+76, Actual: 8.71
Predicted: -8.951238885727611e+76, Actual: 8.16
Predicted: -5.593975568605258e+75, Actual: 7.29
Predicted: -2.099670804757293e+76, Actual: 7.41
Predicted: -3.202981859509633e+76, Actual: 8.03


In [16]:
from pyspark.sql import SparkSession
import os

# Print Python environment on the driver
print(f"Driver Python executable: {os.environ.get('PYSPARK_DRIVER_PYTHON', 'Default Python')}")
print(f"Executor Python executable: {os.environ.get('PYSPARK_PYTHON', 'Default Python')}")

# Create a Spark session
spark = SparkSession.builder.appName("Check Python Path").getOrCreate()

# Print the Python environment used on workers
rdd = spark.sparkContext.parallelize([1])
def print_worker_python(_):
    import sys
    return sys.executable

worker_python_path = rdd.map(print_worker_python).collect()
print(f"Worker Python executable: {worker_python_path}")

spark.stop()


Driver Python executable: Default Python
Executor Python executable: Default Python
Worker Python executable: ['/usr/bin/python3']
