In [3]:
from pyspark.sql import SparkSession

In [4]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Supermarket Sales Analysis with Training")
    .getOrCreate()
)

# Load the dataset; make sure the CSV sits in the same location as this notebook.
# header=True takes column names from the first row and inferSchema=True lets
# Spark infer the column types.
file_path = "supermarket_sales - Sheet1.csv"
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Inspect the inferred schema and preview the first five rows.
df.printSchema()
df.show(5)

root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Tax 5%: double (nullable = true)
 |-- Total: double (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: double (nullable = true)
 |-- gross margin percentage: double (nullable = true)
 |-- gross income: double (nullable = true)
 |-- Rating: double (nullable = true)

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+---------+-------------------+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|   

In [5]:
# Keep only the relevant columns: the four predictors plus the "Rating" label.
selected_columns = ["Unit price", "Quantity", "Total", "Tax 5%", "Rating"]
df_selected = df.select(*selected_columns)

# Discard every row that has a null in any of the kept columns.
df_cleaned = df_selected.na.drop()

# Preview the data after cleaning.
df_cleaned.show(5)

+----------+--------+--------+-------+------+
|Unit price|Quantity|   Total| Tax 5%|Rating|
+----------+--------+--------+-------+------+
|     74.69|       7|548.9715|26.1415|   9.1|
|     15.28|       5|   80.22|   3.82|   9.6|
|     46.33|       7|340.5255|16.2155|   7.4|
|     58.22|       8| 489.048| 23.288|   8.4|
|     86.31|       7|634.3785|30.2085|   5.3|
+----------+--------+--------+-------+------+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import VectorAssembler

# Columns that get packed into a single vector column for the regression model.
# NOTE(review): in the sample rows printed by this notebook, "Total" looks like
# exactly 21 x "Tax 5%", so these two inputs appear to be collinear -- confirm
# whether both are really wanted as features.
feature_columns = ["Unit price", "Quantity", "Total", "Tax 5%"]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

# Append the assembled "features" vector column to the cleaned DataFrame.
df_features = assembler.transform(df_cleaned)

# Preview the feature vector next to the label.
df_features.select("features", "Rating").show(5)

+--------------------+------+
|            features|Rating|
+--------------------+------+
|[74.69,7.0,548.97...|   9.1|
|[15.28,5.0,80.22,...|   9.6|
|[46.33,7.0,340.52...|   7.4|
|[58.22,8.0,489.04...|   8.4|
|[86.31,7.0,634.37...|   5.3|
+--------------------+------+
only showing top 5 rows



In [7]:
# Split the data 80/20 into training and test sets; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = df_features.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each split (each count() triggers a Spark job).
n_train = train_data.count()
print(f"Jumlah data latih: {n_train}")
n_test = test_data.count()
print(f"Jumlah data uji: {n_test}")

Jumlah data latih: 838
Jumlah data uji: 162


In [8]:
from pyspark.ml.regression import LinearRegression

# Configure a linear regression that reads the "features" vector, learns the
# "Rating" label and writes its output to a "prediction" column.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Rating",
    predictionCol="prediction",
)

# Fit the model on the training split.
lr_model = lr.fit(train_data)

# Show the learned coefficients (one per feature column) and the intercept.
print(f"Koefisien: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Koefisien: [0.007178226344261361,0.06323490118032725,-0.0006097756155870004,-0.0128052879273355]
Intercept: 6.6194781648684575


In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out test rows with the fitted model.
predictions = lr_model.transform(test_data)

# Preview actual ratings next to the predicted ones.
predictions.select("features", "Rating", "prediction").show(5)

# Compute the evaluation metric: root mean squared error on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

+--------------------+------+------------------+
|            features|Rating|        prediction|
+--------------------+------+------------------+
|[10.16,5.0,53.34,...|   4.1|6.9435325877569465|
|[10.56,8.0,88.704...|   7.6| 7.092980372096381|
|[10.69,5.0,56.122...|   7.6| 6.943943646418662|
|[11.43,6.0,72.009...|   7.7| 6.993116034459691|
|[12.05,5.0,63.262...|   5.5| 6.944998438456272|
+--------------------+------+------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE): 1.6902525632142653


In [10]:
# Shut down the Spark session created at the top of the notebook; cells that use
# `spark` or its DataFrames must not be run after this point without recreating it.
spark.stop()