In [None]:
import kagglehub
from pyspark.sql import SparkSession

# Fetch the Otodom flat-price dataset through kagglehub; `path` is the value it returns.
path = kagglehub.dataset_download(
    'amrahhasanov23/otodom-pl-flat-prices-in-poland'
)

# Obtain the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('lab_10')
    .getOrCreate()
)

In [None]:
# Load the downloaded CSV data: first line is the header, column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

In [None]:
# Preview the first rows and the inferred schema.
# Both calls print and return None, so they are separate statements:
# joining them with a comma builds a tuple and makes the cell echo "(None, None)".
df.show()
df.printSchema()

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|               City|
+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+-------------------+
|2 pokoje 47m2 po ...| 415000.0|ul. Marysińska, S...|   47.0|             2 |    NULL|    do zamieszkania|miejskie|garaż/miejsce par...|                balkon|https://www.otodo...|           Łódzkie|               Łódź|
|Właściciel- Ludwi...|2499000.0|ul. Ludwiki, Czys...|  105.0|             4 |     2/8|     do wykończenia|miejskie|garaż

(None, None)

In [None]:
import pyspark.sql.functions as f

# Build one aggregate expression per column that counts the rows where it is NULL,
# then evaluate all of them in a single pass over the DataFrame.
null_count_exprs = [
    f.count(f.when(f.col(name).isNull(), 1)).alias(name)
    for name in df.columns
]
missing_counts = df.select(null_count_exprs).first()

# The aggregate row holds the counts in the same order as df.columns.
for col_name, missing_count in zip(df.columns, missing_counts):
    print(f"Kolumna '{col_name}': {missing_count} brakujących wartości")

Kolumna 'Title': 0 brakujących wartości
Kolumna 'Price': 143 brakujących wartości
Kolumna 'Location': 1 brakujących wartości
Kolumna 'Surface': 0 brakujących wartości
Kolumna 'Number_of_Rooms': 0 brakujących wartości
Kolumna 'Floor': 64 brakujących wartości
Kolumna 'Finishing_Condition': 578 brakujących wartości
Kolumna 'Heating': 746 brakujących wartości
Kolumna 'Parking_Space': 1328 brakujących wartości
Kolumna 'Balcony_Garden_Terrace': 704 brakujących wartości
Kolumna 'Link': 1 brakujących wartości
Kolumna 'Voivodeship': 2 brakujących wartości
Kolumna 'City': 2 brakujących wartości


In [None]:
# Columns that must be non-null for a row to be kept.
# NOTE(review): Parking_Space is NULL in 1328 rows and has a single distinct value
# after this filter, so NULL may simply mean "no parking" - confirm that dropping
# those rows is intended.
required_columns = [
    'Price',
    'Location',
    'Floor',
    'Finishing_Condition',
    'Heating',
    'Parking_Space',
    'Balcony_Garden_Terrace',
    'Link',
    'Voivodeship',
    'City',
]
df = df.na.drop(subset=required_columns)
df.show(n=20, truncate=True)

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|        City|
+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+
|Właściciel- Ludwi...|2499000.0|ul. Ludwiki, Czys...|  105.0|             4 |     2/8|     do wykończenia|miejskie|garaż/miejsce par...|                balkon|https://www.otodo...|       Mazowieckie|    Warszawa|
|2/3-pok. 49,2 m2 ...| 450180.0|ul. Bartosza Głow...|   49.2|             2 |     2/3|    do zamieszkania|miejskie|garaż/miejsce par...|            

In [None]:
from pyspark.sql.functions import col
# Cast the two numeric features to explicit types; values that cannot be
# converted become NULL.
df = (
    df
    .withColumn("Surface", col("Surface").cast("double"))
    .withColumn("Number_of_Rooms", col("Number_of_Rooms").cast("int"))
)

In [None]:
# Number of distinct values per column (one Spark job per column).
# The comprehension variable is not called `col`, to avoid shadowing the
# imported pyspark function of that name.
unique_values = {
    column_name: df.select(column_name).distinct().count()
    for column_name in df.columns
}

for col_name, unique_count in unique_values.items():
    print(f"Kolumna '{col_name}': {unique_count} unikalnych wartości")

Kolumna 'Title': 708 unikalnych wartości
Kolumna 'Price': 481 unikalnych wartości
Kolumna 'Location': 588 unikalnych wartości
Kolumna 'Surface': 545 unikalnych wartości
Kolumna 'Number_of_Rooms': 8 unikalnych wartości
Kolumna 'Floor': 82 unikalnych wartości
Kolumna 'Finishing_Condition': 3 unikalnych wartości
Kolumna 'Heating': 6 unikalnych wartości
Kolumna 'Parking_Space': 1 unikalnych wartości
Kolumna 'Balcony_Garden_Terrace': 8 unikalnych wartości
Kolumna 'Link': 719 unikalnych wartości
Kolumna 'Voivodeship': 16 unikalnych wartości
Kolumna 'City': 137 unikalnych wartości


In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categorical_columns = ['Balcony_Garden_Terrace', 'Finishing_Condition', 'Heating']

# For every categorical column add two derived columns:
#   <name>_numeric - the category index produced by StringIndexer
#   <name>_onehot  - the one-hot vector built from that index
for col_name in categorical_columns:
    index_col = f'{col_name}_numeric'
    onehot_col = f'{col_name}_onehot'

    indexer = StringIndexer(inputCol=col_name, outputCol=index_col)
    indexer_fitted = indexer.fit(df)
    df = indexer_fitted.transform(df)

    encoder = OneHotEncoder(inputCols=[index_col], outputCols=[onehot_col])
    encoder_fitted = encoder.fit(df)
    df = encoder_fitted.transform(df)

df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: double (nullable = true)
 |-- Number_of_Rooms: integer (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Balcony_Garden_Terrace_numeric: double (nullable = false)
 |-- Balcony_Garden_Terrace_onehot: vector (nullable = true)
 |-- Finishing_Condition_numeric: double (nullable = false)
 |-- Finishing_Condition_onehot: vector (nullable = true)
 |-- Heating_numeric: double (nullable = false)
 |-- Heating_onehot: vector (nullable = true)



In [None]:
# Preview the frame with the index and one-hot columns appended.
df.show(n=20, truncate=True)

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+------------------------------+-----------------------------+---------------------------+--------------------------+---------------+--------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|        City|Balcony_Garden_Terrace_numeric|Balcony_Garden_Terrace_onehot|Finishing_Condition_numeric|Finishing_Condition_onehot|Heating_numeric|Heating_onehot|
+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+------------------------------+-----------------------------+-------

In [None]:
from pyspark.ml.functions import vector_to_array

# Append the balcony one-hot vector as a plain array column named `balc_onehot`.
df_col_onehot = df.withColumn(
    'balc_onehot',
    vector_to_array('Balcony_Garden_Terrace_onehot'),
)

In [None]:
# Preview the frame including the new array column.
df_col_onehot.show(n=20, truncate=True)

+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+------------------------------+-----------------------------+---------------------------+--------------------------+---------------+--------------+--------------------+
|               Title|    Price|            Location|Surface|Number_of_Rooms|   Floor|Finishing_Condition| Heating|       Parking_Space|Balcony_Garden_Terrace|                Link|       Voivodeship|        City|Balcony_Garden_Terrace_numeric|Balcony_Garden_Terrace_onehot|Finishing_Condition_numeric|Finishing_Condition_onehot|Heating_numeric|Heating_onehot|         balc_onehot|
+--------------------+---------+--------------------+-------+---------------+--------+-------------------+--------+--------------------+----------------------+--------------------+------------------+------------+--------------------------

In [None]:
from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as f

categorical_columns = ['Balcony_Garden_Terrace', 'Finishing_Condition', 'Heating']

df_cols_onehot = df

# Expand every categorical column into one 0.0/1.0 indicator column per category,
# named '<column>_<label>'.
#
# Slot i of the one-hot vector belongs to the category with StringIndexer index i,
# which is NOT the alphabetical position of the label (StringIndexer orders labels
# by descending frequency by default). OneHotEncoder also drops the last category
# by default, so the vector has one slot fewer than there are labels.
# Pairing alphabetically sorted labels with vector positions therefore attaches
# the wrong name to most indicators and reads past the end for the last one.
# The index -> label mapping is instead recovered from the data, and each indicator
# is derived from the '<column>_numeric' index, which covers every category.
for col_name in categorical_columns:
    index_col = f'{col_name}_numeric'

    # Each distinct (index, label) pair gives the label that owns that index.
    index_label_rows = df.select(index_col, col_name).distinct().collect()
    labels_by_index = {int(row[0]): row[1] for row in index_label_rows}

    # Kept so the resulting schema still exposes the '<column>_array' column.
    df_cols_onehot = df_cols_onehot.withColumn(f'{col_name}_array', vector_to_array(f'{col_name}_onehot'))

    expanded_cols = [
        f.when(f.col(index_col) == idx, 1.0).otherwise(0.0).alias(f'{col_name}_{label}')
        for idx, label in sorted(labels_by_index.items())
    ]

    df_cols_onehot = df_cols_onehot.select('*', *expanded_cols)

df_cols_onehot.printSchema()
df_cols_onehot.show()

root
 |-- Title: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Surface: double (nullable = true)
 |-- Number_of_Rooms: integer (nullable = true)
 |-- Floor: string (nullable = true)
 |-- Finishing_Condition: string (nullable = true)
 |-- Heating: string (nullable = true)
 |-- Parking_Space: string (nullable = true)
 |-- Balcony_Garden_Terrace: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Voivodeship: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Balcony_Garden_Terrace_numeric: double (nullable = false)
 |-- Balcony_Garden_Terrace_onehot: vector (nullable = true)
 |-- Finishing_Condition_numeric: double (nullable = false)
 |-- Finishing_Condition_onehot: vector (nullable = true)
 |-- Heating_numeric: double (nullable = false)
 |-- Heating_onehot: vector (nullable = true)
 |-- Balcony_Garden_Terrace_array: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- Bal

In [None]:
# Columns for the first regression: numeric features, three category
# indicators, and the target (`Price`) kept last.
regr_columns = [
    "Surface",
    "Number_of_Rooms",
    "Balcony_Garden_Terrace_balkon",
    "Finishing_Condition_do wykończenia",
    "Heating_gazowe",
    "Price",
]
df_regr = df_cols_onehot.select(*regr_columns)

In [None]:
# Count NULLs per column of the regression frame and print them as one row.
regr_null_counts = [
    f.count(f.when(f.col(name).isNull(), 1)).alias(name)
    for name in df_regr.columns
]
df_regr.select(*regr_null_counts).show(n=20, truncate=True)

+-------+---------------+-----------------------------+----------------------------------+--------------+-----+
|Surface|Number_of_Rooms|Balcony_Garden_Terrace_balkon|Finishing_Condition_do wykończenia|Heating_gazowe|Price|
+-------+---------------+-----------------------------+----------------------------------+--------------+-----+
|      0|              1|                            0|                                 0|             0|    0|
+-------+---------------+-----------------------------+----------------------------------+--------------+-----+



In [None]:
# Drop the rows where Number_of_Rooms is NULL (the only column with NULLs above).
df_regr_clean = df_regr.na.drop(subset=["Number_of_Rooms"])

In [None]:
# Preview the cleaned regression frame.
df_regr_clean.show(n=20, truncate=True)

+-------+---------------+-----------------------------+----------------------------------+--------------+---------+
|Surface|Number_of_Rooms|Balcony_Garden_Terrace_balkon|Finishing_Condition_do wykończenia|Heating_gazowe|    Price|
+-------+---------------+-----------------------------+----------------------------------+--------------+---------+
|  105.0|              4|                          1.0|                               1.0|           0.0|2499000.0|
|   49.2|              2|                          1.0|                               0.0|           0.0| 450180.0|
|   76.4|              3|                          1.0|                               0.0|           0.0| 649000.0|
|   45.0|              2|                          1.0|                               0.0|           0.0| 555000.0|
|  62.52|              3|                          0.0|                               0.0|           0.0| 555000.0|
|  63.95|              3|                          1.0|                 

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one (`Price`, the target) is a model feature.
feature_columns = df_regr_clean.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Pack the features into a single vector column and keep only it and the target.
data_set = assembler.transform(df_regr_clean).select('features', 'Price')
data_set.show(n=20, truncate=True)

+--------------------+---------+
|            features|    Price|
+--------------------+---------+
|[105.0,4.0,1.0,1....|2499000.0|
|[49.2,2.0,1.0,0.0...| 450180.0|
|[76.4,3.0,1.0,0.0...| 649000.0|
|[45.0,2.0,1.0,0.0...| 555000.0|
|(5,[0,1],[62.52,3...| 555000.0|
|[63.95,3.0,1.0,0....| 999999.0|
|[121.8,3.0,0.0,1....|2923000.0|
|[119.86,5.0,1.0,1...| 650000.0|
|[64.64,3.0,1.0,0....| 900000.0|
|[65.86,3.0,1.0,1....| 629000.0|
|[100.6,5.0,1.0,1....| 965760.0|
|[60.0,3.0,1.0,0.0...| 350000.0|
|[32.0,2.0,1.0,0.0...| 330000.0|
|[43.24,2.0,1.0,1....| 389160.0|
|[35.06,2.0,1.0,1....| 315540.0|
|[35.06,2.0,1.0,1....| 315540.0|
|[74.91,4.0,1.0,0....| 881000.0|
|[60.0,3.0,0.0,1.0...| 350000.0|
|[33.42,1.0,1.0,0....| 769000.0|
|[40.0,2.0,1.0,1.0...| 367000.0|
+--------------------+---------+
only showing top 20 rows



In [None]:
from pyspark.ml.regression import LinearRegression

# 80/20 train/test split. A fixed seed is passed so that re-running the notebook
# does not draw a different split (and report different metrics) every time.
train_data, test_data = data_set.randomSplit([0.8, 0.2], seed=42)

# Linear regression on the assembled feature vector with regParam=0.1.
lr = LinearRegression(featuresCol="features", labelCol='Price', regParam=0.1)

lrModel = lr.fit(train_data)

# Evaluate on the held-out part only.
test_stats = lrModel.evaluate(test_data)

print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"MSE: {test_stats.meanSquaredError}")

RMSE: 5309958.251514914
R2: 0.011576375585929055
MSE: 28195656632831.33


In [None]:
# Columns for the second regression: same numeric features, a different set of
# category indicators, and the target (`Price`) kept last.
regr_2_columns = [
    "Surface",
    "Number_of_Rooms",
    "Balcony_Garden_Terrace_ogródek, taras",
    "Finishing_Condition_do remontu",
    "Heating_elektryczne",
    "Price",
]
df_regr_2 = df_cols_onehot.select(*regr_2_columns)

In [None]:
# Count NULLs per column of the second regression frame and print them as one row.
regr_2_null_counts = [
    f.count(f.when(f.col(name).isNull(), 1)).alias(name)
    for name in df_regr_2.columns
]
df_regr_2.select(*regr_2_null_counts).show(n=20, truncate=True)

+-------+---------------+-------------------------------------+------------------------------+-------------------+-----+
|Surface|Number_of_Rooms|Balcony_Garden_Terrace_ogródek, taras|Finishing_Condition_do remontu|Heating_elektryczne|Price|
+-------+---------------+-------------------------------------+------------------------------+-------------------+-----+
|      0|              1|                                    0|                             0|                  0|    0|
+-------+---------------+-------------------------------------+------------------------------+-------------------+-----+



In [None]:
# Drop the rows where Number_of_Rooms is NULL (the only column with NULLs above).
df_regr_2_clean = df_regr_2.na.drop(subset=["Number_of_Rooms"])

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one (`Price`, the target) is a model feature.
feature_columns = df_regr_2_clean.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Pack the features into a single vector column and keep only it and the target.
# This rebinds `data_set`, replacing the frame built for the first regression.
data_set = assembler.transform(df_regr_2_clean).select('features', 'Price')
data_set.show(n=20, truncate=True)

+--------------------+---------+
|            features|    Price|
+--------------------+---------+
|[105.0,4.0,0.0,0....|2499000.0|
|[49.2,2.0,0.0,1.0...| 450180.0|
|[76.4,3.0,0.0,1.0...| 649000.0|
|[45.0,2.0,0.0,1.0...| 555000.0|
|[62.52,3.0,0.0,1....| 555000.0|
|[63.95,3.0,0.0,1....| 999999.0|
|(5,[0,1],[121.8,3...|2923000.0|
|(5,[0,1],[119.86,...| 650000.0|
|[64.64,3.0,0.0,1....| 900000.0|
|[65.86,3.0,0.0,0....| 629000.0|
|[100.6,5.0,0.0,0....| 965760.0|
|[60.0,3.0,0.0,1.0...| 350000.0|
|[32.0,2.0,0.0,1.0...| 330000.0|
|(5,[0,1],[43.24,2...| 389160.0|
|(5,[0,1],[35.06,2...| 315540.0|
|(5,[0,1],[35.06,2...| 315540.0|
|[74.91,4.0,0.0,1....| 881000.0|
|[60.0,3.0,0.0,0.0...| 350000.0|
|[33.42,1.0,0.0,1....| 769000.0|
|[40.0,2.0,0.0,0.0...| 367000.0|
+--------------------+---------+
only showing top 20 rows



In [None]:
# 80/20 train/test split. A fixed seed is passed so that re-running the notebook
# does not draw a different split (and report different metrics) every time.
train_data, test_data = data_set.randomSplit([0.8, 0.2], seed=42)

# Linear regression on the assembled feature vector with regParam=0.1.
lr = LinearRegression(featuresCol="features", labelCol='Price', regParam=0.1)

lrModel = lr.fit(train_data)

# Evaluate on the held-out part only.
test_stats = lrModel.evaluate(test_data)

print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
print(f"MSE: {test_stats.meanSquaredError}")

RMSE: 7032097.157257457
R2: 0.023216103328202453
MSE: 49450390429108.41
