In [6]:
pip install pyspark



In [7]:
pip install findspark



In [9]:
import os
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType
from pyspark.sql.functions import split, count, when, isnan, col, regexp_replace
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
# Start (or reuse) the Spark session for this notebook and report its version.
session_builder = SparkSession.builder.appName('First Session')
spark = session_builder.getOrCreate()
print('Spark Version: {}'.format(spark.version))

Spark Version: 3.5.0


In [28]:
# Explicit schema for the listings file. 'Harga' holds currency text such as
# 'Rp 8.700.000', which cannot be parsed as an integer, so it is declared as a
# string here; converting it to a number is a separate cleaning step.
schema = StructType([StructField('Motor', StringType(), nullable = True),
                     StructField('Tahun', IntegerType(), nullable = True),
                     StructField('Harga', StringType(), nullable = True),
                     StructField('Alamat', StringType(), nullable = True)])

file_path = 'olx-motorbekas.csv'
# Read with the declared schema rather than inferSchema: the column types are
# then fixed by the notebook instead of by whatever the file happens to contain,
# and Spark does not need an extra pass over the data to guess them.
df = spark.read.csv(file_path,
                    header = True,
                    schema = schema,
                    nanValue = '?')
df.show(5)

+--------------------+-----+-------------+--------------------+
|               Motor|Tahun|        Harga|              Alamat|
+--------------------+-----+-------------+--------------------+
|MIO SOUL GT 125 G...| 2018| Rp 8.700.000|Bekasi Utara, Bek...|
|Honda Beat ECO 20...| 2019| Rp 8.900.000|Bekasi Utara, Bek...|
|Sepeda Motor List...| 2023|Rp 10.000.000|Medan Satria, Bek...|
|Dijual Yamaha Fin...| 2019| Rp 7.600.000|Bekasi Utara, Bek...|
|Murah !!! Dijual ...| 2023| Rp 8.500.000|Medan Satria, Bek...|
+--------------------+-----+-------------+--------------------+
only showing top 5 rows



In [29]:
def check_missing(dataframe):
    """Print, for every column, how many of its values are missing.

    A value counts as missing when it is NULL or, for float/double columns
    only, when it is NaN. The NaN test is deliberately skipped for every other
    type: applying ``isnan`` to a string column forces a string-to-double
    cast, which is meaningless for free text and is rejected when Spark runs
    in ANSI SQL mode.

    Args:
        dataframe: Spark DataFrame to inspect.

    Returns:
        None. The one-row table of per-column counts is printed via ``show()``.
    """
    nan_capable_types = ('float', 'double')

    def _is_missing(name, dtype):
        # NULL check applies to every type; NaN only exists for floating point.
        missing = col(name).isNull()
        if dtype in nan_capable_types:
            missing = missing | isnan(col(name))
        return missing

    # when(...) without otherwise() yields NULL for non-missing rows, and
    # count() ignores NULLs, so each aggregate is the number of missing values.
    return dataframe.select([count(when(_is_missing(name, dtype), name)).alias(name)
                             for name, dtype in dataframe.dtypes]).show()

check_missing(df)

+-----+-----+-----+------+
|Motor|Tahun|Harga|Alamat|
+-----+-----+-----+------+
|    0|    0|    0|     0|
+-----+-----+-----+------+



In [30]:
# Discard rows containing any NULL, then make sure 'Harga' is typed as a string.
df = (
    df.dropna()
      .withColumn("Harga", col("Harga").cast(StringType()))
)

df.show(5)

+--------------------+-----+-------------+--------------------+
|               Motor|Tahun|        Harga|              Alamat|
+--------------------+-----+-------------+--------------------+
|MIO SOUL GT 125 G...| 2018| Rp 8.700.000|Bekasi Utara, Bek...|
|Honda Beat ECO 20...| 2019| Rp 8.900.000|Bekasi Utara, Bek...|
|Sepeda Motor List...| 2023|Rp 10.000.000|Medan Satria, Bek...|
|Dijual Yamaha Fin...| 2019| Rp 7.600.000|Bekasi Utara, Bek...|
|Murah !!! Dijual ...| 2023| Rp 8.500.000|Medan Satria, Bek...|
+--------------------+-----+-------------+--------------------+
only showing top 5 rows



In [32]:
# List the column names of the DataFrame.
df.columns

['Motor', 'Tahun', 'Harga', 'Alamat']

In [33]:
# NOTE(review): hard-coded list literal that is only displayed, not assigned;
# presumably a leftover copy of the column names - confirm whether it is needed.
['Motor',
 'Tahun',
 'Harga',
 'Alamat']

['Motor', 'Tahun', 'Harga', 'Alamat']

In [34]:
# Preview five rows as a pandas DataFrame. The limit is applied in Spark so
# only those rows are collected to the driver, instead of converting the whole
# dataset to pandas and then discarding all but the head.
df.limit(5).toPandas()

Unnamed: 0,Motor,Tahun,Harga,Alamat
0,MIO SOUL GT 125 GRESS,2018,Rp 8.700.000,"Bekasi Utara, Bekasi Kota"
1,Honda Beat ECO 2019 ss lengkap mesin bagus pla...,2019,Rp 8.900.000,"Bekasi Utara, Bekasi Kota"
2,Sepeda Motor Listrik Uwinfly T5 Super,2023,Rp 10.000.000,"Medan Satria, Bekasi Kota"
3,Dijual Yamaha Fino grande 2019 dof,2019,Rp 7.600.000,"Bekasi Utara, Bekasi Kota"
4,Murah !!! Dijual Motor Listrik Mirip Vespa Matic,2023,Rp 8.500.000,"Medan Satria, Bekasi Kota"


In [35]:
# Print each column's data type and nullability.
df.printSchema()

root
 |-- Motor: string (nullable = true)
 |-- Tahun: integer (nullable = true)
 |-- Harga: string (nullable = true)
 |-- Alamat: string (nullable = true)



In [36]:
# Peek at the first four records as raw Row objects, one per paragraph.
sample_rows = df.take(4)
for motor in sample_rows:
    print(motor, '\n')

Row(Motor='MIO SOUL GT 125 GRESS', Tahun=2018, Harga='Rp 8.700.000', Alamat='Bekasi Utara, Bekasi Kota') 

Row(Motor='Honda Beat ECO 2019 ss lengkap mesin bagus plat B Tangerang', Tahun=2019, Harga='Rp 8.900.000', Alamat='Bekasi Utara, Bekasi Kota') 

Row(Motor='Sepeda Motor Listrik Uwinfly T5 Super', Tahun=2023, Harga='Rp 10.000.000', Alamat='Medan Satria, Bekasi Kota') 

Row(Motor='Dijual Yamaha Fino grande 2019 dof', Tahun=2019, Harga='Rp 7.600.000', Alamat='Bekasi Utara, Bekasi Kota') 



In [37]:
# Summary statistics for every column. For the string columns (Motor, Harga,
# Alamat) mean/stddev come back NULL and min/max are compared as text, so the
# 'Harga' min/max shown here are not the lowest/highest prices.
df.describe().show()

+-------+--------------------+------------------+-------------+--------------------+
|summary|               Motor|             Tahun|        Harga|              Alamat|
+-------+--------------------+------------------+-------------+--------------------+
|  count|                1389|              1389|         1389|                1389|
|   mean|                NULL|2021.4046076313896|         NULL|                NULL|
| stddev|                NULL|1.9071778778100124|         NULL|                NULL|
|    min|!! YAMAHA FAZZIO ...|              2018|Rp 10.000.000|Babelan, Bekasi Kab.|
|    max|•Honda Beat stree...|              2023| Rp 9.999.000|Tebet, Jakarta Se...|
+-------+--------------------+------------------+-------------+--------------------+



In [38]:
# Summary statistics restricted to the listing title and price columns.
df.describe('Motor', 'Harga').show()

+-------+--------------------+-------------+
|summary|               Motor|        Harga|
+-------+--------------------+-------------+
|  count|                1389|         1389|
|   mean|                NULL|         NULL|
| stddev|                NULL|         NULL|
|    min|!! YAMAHA FAZZIO ...|Rp 10.000.000|
|    max|•Honda Beat stree...| Rp 9.999.000|
+-------+--------------------+-------------+



In [39]:
def get_num_cols(dataframe, numeric_types = ('double', 'int')):
    """Return the names of the columns whose type counts as numeric.

    The types are read once from ``dataframe.dtypes`` rather than by running a
    separate ``select`` per column, so no column name has to be resolved as an
    expression.

    Args:
        dataframe: object exposing ``dtypes`` as a list of
            ``(column_name, type_name)`` pairs, e.g. a Spark DataFrame.
        numeric_types: type names treated as numeric. The default keeps the
            original behaviour (only 'double' and 'int'); pass e.g.
            ``('double', 'float', 'int', 'bigint')`` to widen the selection.

    Returns:
        List of matching column names, in the DataFrame's column order.
    """
    return [name for name, dtype in dataframe.dtypes if dtype in numeric_types]

# Summarise only the numeric columns.
num_cols = get_num_cols(df)

numeric_summary = df.describe(num_cols)
numeric_summary.show()

+-------+------------------+
|summary|             Tahun|
+-------+------------------+
|  count|              1389|
|   mean|2021.4046076313896|
| stddev|1.9071778778100124|
|    min|              2018|
|    max|              2023|
+-------+------------------+



In [42]:
# Preview listings whose model year is later than 2022.
recent_listings = df.where(col('Tahun') > 2022)
recent_listings.show(5)

+--------------------+-----+-------------+--------------------+
|               Motor|Tahun|        Harga|              Alamat|
+--------------------+-----+-------------+--------------------+
|Sepeda Motor List...| 2023|Rp 10.000.000|Medan Satria, Bek...|
|Murah !!! Dijual ...| 2023| Rp 8.500.000|Medan Satria, Bek...|
|Motor Listrik Uwi...| 2023| Rp 9.900.000|Bekasi Barat, Bek...|
|2023 Uwinfly T3s ...| 2023| Rp 6.800.000|Bekasi Selatan, B...|
|Motor Listrik Uwi...| 2023| Rp 9.900.000|Bekasi Barat, Bek...|
+--------------------+-----+-------------+--------------------+
only showing top 5 rows

