https://www.kdnuggets.com/2022/03/loss-functions-explainer.html

https://machinelearningmastery.com/rfe-feature-selection-in-python/

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [1]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from IPython.display import display, HTML
import warnings

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

In [8]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Ridge Regression
from sklearn.linear_model import Ridge

# Lasso Regression
from sklearn.linear_model import Lasso

# ElasticNet Regression
from sklearn.linear_model import ElasticNet

# Support Vector Regression (SVR)
from sklearn.svm import SVR

# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Gradient Boosting Regression
from sklearn.ensemble import GradientBoostingRegressor

# AdaBoost Regression
from sklearn.ensemble import AdaBoostRegressor

# XGBoost Regression
from xgboost import XGBRegressor

# LightGBM Regression
# from lightgbm import LGBMRegressor

# CatBoost Regression
# from catboost import CatBoostRegressor


In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [10]:
# Stretch the notebook container to the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# pd.options.display.float_format = "{:.2f}".format

In [11]:
# Echo the installed PySpark version as the cell output.
pyspark.__version__

'3.3.2'

In [12]:
# Create (or reuse) a Spark session named 'regression' with master "local[*]".
session_builder = SparkSession.builder.master("local[*]").appName('regression')
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/21 21:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/21 21:25:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/21 21:25:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [13]:
# Read the scraped offers CSV; the header row supplies the column names and
# Spark infers the column types.
offers_csv_path = "/home/konradballegro/scripts/scraper/outputs/data/offers.csv"
csv_reader = spark.read.option("header", "true").option("inferSchema", "True")
df = csv_reader.csv(offers_csv_path)

                                                                                

In [14]:
# Derive the frame's shape from a row count and the length of the column list.
num_rows = df.count()
num_cols = len(df.columns)

# Report the shape of the raw frame.
print(f"Number of rows:  {num_rows}")
print(f"Number of columns:  {num_cols}")

Number of rows:  129589
Number of columns:  230


In [15]:
# Print every column name next to its positional index.
headers = df.columns
for position, column_label in enumerate(headers):
    print(str(position) + ": " + column_label)

0: Offer from
1: Category
2: Show offers with VIN number
3: Has registration number
4: Vehicle brand
5: Vehicle model
6: Version
7: Generation
8: Year of production
9: Mileage
10: Engine capacity
11: Fuel type
12: Power
13: Gearbox
14: Range
15: Drive
16: Battery capacity
17: Battery ownership type
18: CO2 emissions
19: Particulate filter
20: City fuel consumption
21: Body type
22: Number of doors
23: Number of seats
24: Color
25: Metallic
26: Color type
27: Right-hand drive (Anglik)
28: Country of origin
29: Leasing
30: VAT margin
31: VAT invoice
32: Manufacturer warranty period
33: Financing possibility
34: First registration
35: Registered in Poland
36: First owner
37: Accident-free
38: Serviced at authorized service center
39: Condition
40: ABS
41: Apple CarPlay
42: Android Auto
43: Rear side airbags43
44: Driver side airbag
45: CD
46: Central locking
47: Electric front windows
48: Electrically adjustable passenger seat
49: Electrically adjustable mirrors
50: Immobilizer
51: Driver

In [16]:
# Sample about 20% of the rows without replacement. `fraction` is a per-row
# probability, so the resulting row count is approximate. The fixed seed (the
# same value the later train/validation/test splits use) makes the sample
# repeatable across kernel restarts for the same input.
sampled_df = df.sample(withReplacement=False, fraction=0.2, seed=42)

In [17]:
# Shape of the sampled frame: (row count, number of columns).
num_rows, num_cols = sampled_df.count(), len(sampled_df.columns)

# Report the shape of the sample.
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  26061
Number of columns:  230


In [18]:
# Bring the sampled "Price" column to the driver as a flat Python list.
price = [price_row[0] for price_row in sampled_df.select("Price").collect()]

                                                                                

In [19]:
# # Assuming you have a PySpark DataFrame called 'sampled_df' with a column 'Price'
# # Extract the 'Price' column as a list
# price_data = sampled_df.select("Price").rdd.flatMap(lambda x: x).collect()

# # Create a histogram using Seaborn
# sns.histplot(data=price_data, bins=10, kde=True)

# # Customize the plot
# plt.xlabel("Price")
# plt.ylabel("Frequency")
# plt.title("Histogram of Prices")

# # Show the plot
# plt.show()


In [20]:
# Columns that must be non-null for an offer to be kept.
required_columns = [
    "Accident-free",
    "Price",
    "Offer from",
    "Condition",
    "Vehicle brand",
    "Vehicle model",
    "Year of production",
    "Mileage",
    "Fuel type",
    "Power",
    "Gearbox",
    "Body type",
    "Number of doors",
]

# Keep offers priced in PLN whose country of origin is "Polska" and whose
# required columns are all populated.
keep_row = (df["Currency"] == "PLN") & (df["Country of origin"] == "Polska")
for required in required_columns:
    keep_row = keep_row & df[required].isNotNull()


def _strip_unit(column_name, unit_pattern):
    """Return the column with every space and every match of `unit_pattern` removed."""
    without_spaces = regexp_replace(col(column_name), " ", "")
    return regexp_replace(without_spaces, unit_pattern, "")


# Project the modelling columns, casting the numeric ones; "Year of production"
# is cast to string.
df = df.filter(keep_row).select(
    col("Price").cast("float").alias("Price"),
    "Offer from",
    "Condition",
    "Vehicle brand",
    "Vehicle model",
    col("Year of production").cast("string").alias("Year of production"),
    _strip_unit("Mileage", "km").cast("float").alias("Mileage"),
    "Fuel type",
    _strip_unit("Power", "KM").cast("integer").alias("Power"),
    "Gearbox",
    "Body type",
    "Number of doors",
    "URL path",
    "ID",
    "Epoch",
)

In [21]:
# Row count and column count of the filtered frame.
num_rows = df.count()
num_cols = len(df.columns)

# Report the shape after filtering and projection.
for shape_label, shape_value in (("Number of rows: ", num_rows), ("Number of columns: ", num_cols)):
    print(shape_label, shape_value)

[Stage 9:>                                                          (0 + 4) / 4]

Number of rows:  33225
Number of columns:  15


                                                                                

In [22]:
# Print a preview of the filtered and re-typed frame.
df.show()

+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
|   Price|     Offer from|Condition|Vehicle brand|Vehicle model|Year of production| Mileage|  Fuel type|Power|     Gearbox|    Body type|Number of doors|            URL path|                  ID|     Epoch|
+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
| 28000.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2010|189347.0|    Benzyna|  140|    Manualna|        Sedan|            4.0|https://www.otomo...|000d1349f23a1d685...|1687241613|
| 92900.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2018| 81240.0|    Benzyna|  182|    Manualna|      Kompakt|            5.0|https://www.otomo..

In [23]:
def _distinct_values(column_name):
    """Collect the distinct values of one column of `df` into a Python list."""
    distinct_rows = df.select(column_name).distinct().collect()
    return [distinct_row[0] for distinct_row in distinct_rows]


# Distinct categories of every categorical column, gathered on the driver.
distinct_offers = _distinct_values("Offer from")
distinct_conditions = _distinct_values("Condition")
distinct_brands = _distinct_values("Vehicle brand")
distinct_models = _distinct_values("Vehicle model")
distinct_years = _distinct_values("Year of production")
distinct_fuel = _distinct_values("Fuel type")
distinct_gearbox = _distinct_values("Gearbox")
distinct_body = _distinct_values("Body type")
distinct_doors = _distinct_values("Number of doors")

                                                                                

In [24]:
# Add one 0/1 indicator column per distinct "Offer from" value.
for offer_value in distinct_offers:
    flag_name = "Offer_type_{}".format(offer_value.replace(" ", "_"))
    is_this_offer = df["Offer from"] == offer_value
    df = df.withColumn(flag_name, when(is_this_offer, 1).otherwise(0))

In [25]:
# Add one 0/1 indicator column per distinct "Condition" value.
for condition_value in distinct_conditions:
    flag_name = "Condition_{}".format(condition_value.replace(" ", "_"))
    is_this_condition = df["Condition"] == condition_value
    df = df.withColumn(flag_name, when(is_this_condition, 1).otherwise(0))

In [26]:
# Build one 0/1 indicator expression per distinct "Vehicle model" value and add
# them all in a single projection. Calling withColumn once per model stacks one
# projection per call, which PySpark documents as a source of very large plans
# and StackOverflowException. DataFrame.withColumns needs PySpark >= 3.3.0.
# As with repeated withColumn calls, a later model that maps to the same
# column name overrides an earlier one.
model_flags = {
    "Vehicle_model_" + model.replace(" ", "_"): when(df["Vehicle model"] == model, 1).otherwise(0)
    for model in distinct_models
}
if model_flags:
    df = df.withColumns(model_flags)

In [None]:
# Build one 0/1 indicator expression per distinct "Year of production" value
# and add them all in a single projection, rather than one withColumn call
# (and one stacked projection) per year, which PySpark warns can produce very
# large plans. DataFrame.withColumns needs PySpark >= 3.3.0.
year_flags = {
    "Year_of_production_" + str(year): when(df["Year of production"] == year, 1).otherwise(0)
    for year in distinct_years
}
if year_flags:
    df = df.withColumns(year_flags)

In [None]:
# Add one 0/1 indicator column per distinct "Fuel type" value.
for fuel_value in distinct_fuel:
    flag_name = "Fuel_type_{}".format(fuel_value.replace(" ", "_"))
    is_this_fuel = df["Fuel type"] == fuel_value
    df = df.withColumn(flag_name, when(is_this_fuel, 1).otherwise(0))

In [None]:
# Add one 0/1 indicator column per distinct "Gearbox" value.
for gearbox_value in distinct_gearbox:
    flag_name = "Gearbox_{}".format(gearbox_value.replace(" ", "_"))
    is_this_gearbox = df["Gearbox"] == gearbox_value
    df = df.withColumn(flag_name, when(is_this_gearbox, 1).otherwise(0))

In [None]:
# Add one 0/1 indicator column per distinct "Body type" value.
for body_value in distinct_body:
    flag_name = "Body_type_{}".format(body_value.replace(" ", "_"))
    is_this_body = df["Body type"] == body_value
    df = df.withColumn(flag_name, when(is_this_body, 1).otherwise(0))

In [None]:
# Add one 0/1 indicator column per distinct "Number of doors" value; the value
# is stringified to form the column name.
for doors_value in distinct_doors:
    flag_name = "Number_of_doors_{}".format(str(doors_value))
    has_this_many_doors = df["Number of doors"] == doors_value
    df = df.withColumn(flag_name, when(has_this_many_doors, 1).otherwise(0))

In [None]:
# Source columns removed before modelling.
# NOTE(review): "Vehicle brand" is dropped here although no indicator columns
# are created for it in the visible cells — confirm that is intended.
columns_to_drop = [
    "Offer from",
    "Condition",
    "Vehicle brand",
    "Vehicle model",
    "Year of production",
    "Fuel type",
    "Gearbox",
    "Body type",
    "Number of doors",
    "URL path",
    "ID",
    "Epoch",
]

df = df.drop(*columns_to_drop)

In [None]:
# Keep only rows that still have a price; presumably this guards against values
# that became null in the earlier float cast — TODO confirm.
has_price = df["Price"].isNotNull()
df = df.where(has_price)

In [None]:
# Convert the Spark frame into a pandas DataFrame on the driver.
df_pandas = df.toPandas()

In [None]:
# Display the pandas frame as the cell output.
df_pandas

In [None]:
# Count the missing values in each column.
df_pandas.isnull().sum()

In [None]:
# Persist the encoded frame as CSV.
encoded_offers_path = '/home/konradballegro/notebooks/outputs/data/offers.csv'
df_pandas.to_csv(encoded_offers_path)

In [None]:
# Regression target: the "Price" column.
y = df_pandas.loc[:, "Price"]
y

In [None]:
# Feature matrix: every column except the "Price" target.
X = df_pandas.drop(columns=["Price"])
X.head()

In [None]:
# NOTE(review): this value is not referenced by the visible split cell, which
# passes test_size=0.3 (a 70% training share) — confirm which one is intended.
train_size = 0.8

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 30% of the rows; the rest becomes the training set.
first_cut = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_remain, y_train, y_remain = first_cut

# Second cut: halve the held-out rows into validation and test sets.
second_cut = train_test_split(X_remain, y_remain, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = second_cut


In [None]:
# Report the shape of every split, features first and targets second.
named_splits = [
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
]
for split_name, split_data in named_splits:
    print(f"Shape of {split_name}:", split_data.shape)

In [None]:
# Absolute Spearman correlation between every pair of training features.
corr_matrix = X_train.corr(method="spearman").abs()

# Keep only the strict upper triangle so each pair is looked at once and the
# diagonal is ignored. The builtin `bool` is used because the `np.bool` alias
# was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# A column is dropped when its correlation with any column that precedes it in
# column order exceeds 0.90.
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

# The drop list is derived from the training split only and then applied to
# all three splits so they keep identical columns.
X_train = X_train.drop(to_drop, axis=1)
X_val = X_val.drop(to_drop, axis=1)
X_test = X_test.drop(to_drop, axis=1)

In [None]:
# Inspect the training features after the correlation-based column drop.
X_train

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the two continuous features. The scaler is fitted on the training
# split only, so no statistics leak from the validation or test splits.
subset = X_train[["Mileage", "Power"]]

scaler = StandardScaler()
scaler.fit(subset)
subset_scaled = scaler.transform(subset)

X_train["Mileage"] = subset_scaled[:, 0]
X_train["Power"] = subset_scaled[:, 1]

# Apply the same fitted transform to the validation and test splits. Without
# this they keep raw mileage/power values while the training split is
# standardised, so predictions on them are made on a different scale.
# NOTE(review): assumes the persisted model used later was trained on features
# scaled this way — confirm against the training run.
for holdout in (X_val, X_test):
    holdout_scaled = scaler.transform(holdout[["Mileage", "Power"]])
    holdout["Mileage"] = holdout_scaled[:, 0]
    holdout["Power"] = holdout_scaled[:, 1]

In [None]:
# Inspect the training features after "Mileage" and "Power" were standardised.
X_train

neg_mean_absolute_error

In [None]:
# Feature columns kept for the model: the two numeric columns plus a
# hand-picked subset of the 0/1 indicator columns created earlier.
# NOTE(review): how this subset was chosen is not shown in the visible cells.
selected_features = [
    'Mileage',
    'Power',
    'Condition_Używane',
    'Gearbox_Manualna',
    'Fuel_type_Benzyna',
    'Fuel_type_Diesel',
    'Fuel_type_Elektryczny',
    'Body_type_SUV',
    'Body_type_Minivan',
    'Number_of_doors_2.0'
]

In [None]:
# Echo the chosen feature list.
selected_features

In [None]:
# Keep a copy of the full training feature frame before it is narrowed to the
# selected features.
X_train_copy = X_train.copy()

In [None]:
# Restrict every split to the selected feature columns, in the listed order.
X_train, X_val, X_test = [
    split_frame[selected_features] for split_frame in (X_train, X_val, X_test)
]

In [None]:
# Inspect the training frame restricted to the selected features.
X_train

In [None]:
# Restore a previously saved XGBoost regressor from disk.
xgb_model_path = '/home/konradballegro/notebooks/outputs/models/xgb_model.model'
loaded_model = XGBRegressor()
loaded_model.load_model(xgb_model_path)

In [None]:
# Predict prices for the test split with the loaded model. X_test must carry
# the same columns the model was trained on — presumably `selected_features`;
# verify against the training run.
y_pred = loaded_model.predict(X_test)

In [None]:
# Score the test-set predictions with mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [None]:
from explainerdashboard import RegressionExplainer