https://www.kdnuggets.com/2022/03/loss-functions-explainer.html

https://machinelearningmastery.com/rfe-feature-selection-in-python/

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [1]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from IPython.display import display, HTML
import warnings

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [66]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, d2_absolute_error_score
from sklearn.model_selection import ParameterGrid

In [8]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Ridge Regression
from sklearn.linear_model import Ridge

# Lasso Regression
from sklearn.linear_model import Lasso

# ElasticNet Regression
from sklearn.linear_model import ElasticNet

# Support Vector Regression (SVR)
from sklearn.svm import SVR

# Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Gradient Boosting Regression
from sklearn.ensemble import GradientBoostingRegressor

# AdaBoost Regression
from sklearn.ensemble import AdaBoostRegressor

# XGBoost Regression
from xgboost import XGBRegressor

# LightGBM Regression
# from lightgbm import LGBMRegressor

# CatBoost Regression
# from catboost import CatBoostRegressor


In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [10]:
display(HTML("<style>.container { width:100% !important; }</style>"))
warnings.filterwarnings('ignore')
# pd.options.display.float_format = "{:.2f}".format

In [11]:
pyspark.__version__

'3.3.2'

In [12]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('regression') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/21 23:01:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/21 23:01:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [13]:
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "True") \
    .csv("/home/konradballegro/scripts/scraper/outputs/data/offers.csv")

                                                                                

In [14]:
# Count the number of rows in the DataFrame
num_rows = df.count()

# Count the number of columns in the DataFrame
num_cols = len(df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  129589
Number of columns:  230


In [15]:
headers = df.columns
for h, head in enumerate(headers):
    print(f"{h}: {head}")

0: Offer from
1: Category
2: Show offers with VIN number
3: Has registration number
4: Vehicle brand
5: Vehicle model
6: Version
7: Generation
8: Year of production
9: Mileage
10: Engine capacity
11: Fuel type
12: Power
13: Gearbox
14: Range
15: Drive
16: Battery capacity
17: Battery ownership type
18: CO2 emissions
19: Particulate filter
20: City fuel consumption
21: Body type
22: Number of doors
23: Number of seats
24: Color
25: Metallic
26: Color type
27: Right-hand drive (Anglik)
28: Country of origin
29: Leasing
30: VAT margin
31: VAT invoice
32: Manufacturer warranty period
33: Financing possibility
34: First registration
35: Registered in Poland
36: First owner
37: Accident-free
38: Serviced at authorized service center
39: Condition
40: ABS
41: Apple CarPlay
42: Android Auto
43: Rear side airbags43
44: Driver side airbag
45: CD
46: Central locking
47: Electric front windows
48: Electrically adjustable passenger seat
49: Electrically adjustable mirrors
50: Immobilizer
51: Driver

In [16]:
# Sample 20% of the rows without replacement
sampled_df = df.sample(withReplacement=False, fraction=0.2)

In [17]:
# Count the number of rows in the DataFrame
num_rows = sampled_df.count()

# Count the number of columns in the DataFrame
num_cols = len(sampled_df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  25980
Number of columns:  230


In [18]:
price = sampled_df.select("Price").rdd.flatMap(lambda x: x).collect()

                                                                                

In [19]:
# # Assuming you have a PySpark DataFrame called 'sampled_df' with a column 'Price'
# # Extract the 'Price' column as a list
# price_data = sampled_df.select("Price").rdd.flatMap(lambda x: x).collect()

# # Create a histogram using Seaborn
# sns.histplot(data=price_data, bins=10, kde=True)

# # Customize the plot
# plt.xlabel("Price")
# plt.ylabel("Frequency")
# plt.title("Histogram of Prices")

# # Show the plot
# plt.show()


In [20]:
df = df.filter((df["Currency"] == "PLN") &
                        (df["Country of origin"] == "Polska") &
                        (df["Accident-free"].isNotNull()) &
                        (df["Price"].isNotNull()) &
                        (df["Offer from"].isNotNull()) &
                        (df["Condition"].isNotNull()) &
                        (df["Vehicle brand"].isNotNull()) &
#                         (df["Vehicle model"].isNotNull()) &
                        (df["Year of production"].isNotNull()) &
                        (df["Mileage"].isNotNull()) &
                        (df["Fuel type"].isNotNull()) &
                        (df["Power"].isNotNull()) &
                        (df["Gearbox"].isNotNull()) &
                        (df["Body type"].isNotNull()) &
                        (df["Number of doors"].isNotNull())
                       ).select(col("Price").cast("float").alias("Price"),
                                "Offer from",
                                "Condition",
                                "Vehicle brand",
                                "Vehicle model",
                                col("Year of production").cast("string").alias("Year of production"),
                                regexp_replace(regexp_replace(col("Mileage"), " ", ""), "km", "").cast("float").alias("Mileage"),
                                "Fuel type",
                                regexp_replace(regexp_replace(col("Power"), " ", ""), "KM", "").cast("integer").alias("Power"),
                                "Gearbox",
                                "Body type",
                                "Number of doors",
                                "URL path",
                                "ID",
                                "Epoch"
                               )

In [21]:
# Count the number of rows in the DataFrame
num_rows = df.count()

# Count the number of columns in the DataFrame
num_cols = len(df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

[Stage 9:>                                                          (0 + 4) / 4]

Number of rows:  33225
Number of columns:  15


                                                                                

In [22]:
df.show()

+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
|   Price|     Offer from|Condition|Vehicle brand|Vehicle model|Year of production| Mileage|  Fuel type|Power|     Gearbox|    Body type|Number of doors|            URL path|                  ID|     Epoch|
+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
| 28000.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2010|189347.0|    Benzyna|  140|    Manualna|        Sedan|            4.0|https://www.otomo...|000d1349f23a1d685...|1687241613|
| 92900.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2018| 81240.0|    Benzyna|  182|    Manualna|      Kompakt|            5.0|https://www.otomo..

In [23]:
distinct_offers = df.select("Offer from").distinct().rdd.flatMap(lambda x: x).collect()
distinct_conditions = df.select("Condition").distinct().rdd.flatMap(lambda x: x).collect()
distinct_brands = df.select("Vehicle brand").distinct().rdd.flatMap(lambda x: x).collect()
# distinct_models = df.select("Vehicle model").distinct().rdd.flatMap(lambda x: x).collect()
distinct_years = df.select("Year of production").distinct().rdd.flatMap(lambda x: x).collect()
distinct_fuel = df.select("Fuel type").distinct().rdd.flatMap(lambda x: x).collect()
distinct_gearbox = df.select("Gearbox").distinct().rdd.flatMap(lambda x: x).collect()
distinct_body = df.select("Body type").distinct().rdd.flatMap(lambda x: x).collect()
distinct_doors = df.select("Number of doors").distinct().rdd.flatMap(lambda x: x).collect()

                                                                                

In [24]:
for offer in distinct_offers:
    column_name = "Offer_type_" + offer.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Offer from"] == offer, 1).otherwise(0))

In [25]:
for condition in distinct_conditions:
    column_name = "Condition_" + condition.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Condition"] == condition, 1).otherwise(0))

In [26]:
for brand in distinct_brands:
    column_name = "Vehicle_brand_" + brand.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Vehicle brand"] == brand, 1).otherwise(0))

In [27]:
# for model in distinct_models:
#     column_name = "Vehicle_model_" + model.replace(" ", "_")
#     df = df.withColumn(column_name, when(df["Vehicle model"] == model, 1).otherwise(0))

In [28]:
for year in distinct_years:
    column_name = "Year_of_production_" + str(year)
    df = df.withColumn(column_name, when(df["Year of production"] == year, 1).otherwise(0))

In [29]:
for fuel in distinct_fuel:
    column_name = "Fuel_type_" + fuel.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Fuel type"] == fuel, 1).otherwise(0))

In [30]:
for gearbox in distinct_gearbox:
    column_name = "Gearbox_" + gearbox.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Gearbox"] == gearbox, 1).otherwise(0))

In [31]:
for body in distinct_body:
    column_name = "Body_type_" + body.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Body type"] == body, 1).otherwise(0))

In [32]:
for doors in distinct_doors:
    column_name = "Number_of_doors_" + str(doors)
    df = df.withColumn(column_name, when(df["Number of doors"] == doors, 1).otherwise(0))

In [33]:
# Assuming your DataFrame is called df
columns_to_drop = ["Offer from", "Condition", "Vehicle brand", "Vehicle model", "Year of production", "Fuel type", "Gearbox", "Body type", "Number of doors", "URL path", "ID", "Epoch"]

# Drop the specified columns
df = df.drop(*columns_to_drop)

In [34]:
df = df.filter(df["Price"].isNotNull())

In [35]:
df_pandas = df.toPandas()

23/06/21 23:01:57 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [36]:
df_pandas

Unnamed: 0,Price,Mileage,Power,Offer_type_Osoby_prywatnej,Offer_type_Firmy,Condition_Używane,Condition_Nowe,Vehicle_brand_Infiniti,Vehicle_brand_Lexus,Vehicle_brand_Jaguar,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
0,28000.0,189347.0,140,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,92900.0,81240.0,182,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,89000.0,86500.0,155,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,48900.0,218350.0,150,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,7500.0,245000.0,90,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33192,259000.0,3000.0,252,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33193,316900.0,1.0,252,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33194,340700.0,1.0,252,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33195,359283.0,6200.0,292,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [37]:
df_pandas.isnull().sum()

Price                         0
Mileage                       0
Power                         0
Offer_type_Osoby_prywatnej    0
Offer_type_Firmy              0
                             ..
Number_of_doors_4.0           0
Number_of_doors_3.0           0
Number_of_doors_2.0           0
Number_of_doors_5.0           0
Number_of_doors_6.0           0
Length: 129, dtype: int64

In [38]:
df_pandas.to_csv('/home/konradballegro/notebooks/outputs/data/offers.csv')

In [39]:
y = df_pandas["Price"]
y

0         28000.0
1         92900.0
2         89000.0
3         48900.0
4          7500.0
           ...   
33192    259000.0
33193    316900.0
33194    340700.0
33195    359283.0
33196    268000.0
Name: Price, Length: 33197, dtype: float32

In [40]:
X = df_pandas.loc[:, ~df_pandas.columns.isin(["Price"])]
X.head()

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Offer_type_Firmy,Condition_Używane,Condition_Nowe,Vehicle_brand_Infiniti,Vehicle_brand_Lexus,Vehicle_brand_Jaguar,Vehicle_brand_Maserati,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
0,189347.0,140,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,81240.0,182,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,86500.0,155,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,218350.0,150,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,245000.0,90,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [41]:
train_size = 0.8

In [42]:
from sklearn.model_selection import train_test_split

# Split data into train and remaining data
X_train, X_remain, y_train, y_remain = train_test_split(X, y, test_size=0.3, random_state=42)

# Split remaining data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=42)


In [43]:
X_train_copy = X_train.copy()

In [44]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (23237, 128)
Shape of X_val: (4980, 128)
Shape of X_test: (4980, 128)
Shape of y_train: (23237,)
Shape of y_val: (4980,)
Shape of y_test: (4980,)


In [45]:
corr_matrix = X_train.corr(method = "spearman").abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_train = X_train.drop(to_drop, axis = 1)
X_val = X_val.drop(to_drop, axis = 1)
X_test = X_test.drop(to_drop, axis = 1)

In [46]:
X_train

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Condition_Używane,Vehicle_brand_Infiniti,Vehicle_brand_Lexus,Vehicle_brand_Jaguar,Vehicle_brand_Maserati,Vehicle_brand_Jeep,Vehicle_brand_Lancia,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
15888,121936.0,163,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
27697,44000.0,190,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1399,83500.0,125,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
32962,1.0,401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23860,150000.0,241,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,81483.0,150,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6265,135400.0,238,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11284,62000.0,102,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
860,5.0,105,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [47]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming df is your DataFrame containing the data
subset = X_train[["Mileage", "Power"]]

scaler = StandardScaler()
scaler.fit(subset)
subset_scaled = scaler.transform(subset)

X_train["Mileage"] = subset_scaled[:, 0]
X_train["Power"] = subset_scaled[:, 1]

In [48]:
X_train

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Condition_Używane,Vehicle_brand_Infiniti,Vehicle_brand_Lexus,Vehicle_brand_Jaguar,Vehicle_brand_Maserati,Vehicle_brand_Jeep,Vehicle_brand_Lancia,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
15888,0.471247,-0.058026,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
27697,-0.449033,0.246575,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1399,0.017389,-0.486724,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
32962,-0.968580,2.626977,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23860,0.802631,0.821933,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,-0.006428,-0.204686,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6265,0.630232,0.788089,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11284,-0.236487,-0.746199,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
860,-0.968533,-0.712354,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


neg_mean_absolute_error

In [49]:
selected_features = [
    'Mileage',
    'Power',
    'Offer_type_Osoby_prywatnej',
    'Condition_Używane',
    'Vehicle_brand_Infiniti',
    'Vehicle_brand_Lexus',
    'Vehicle_brand_Jaguar',
    'Vehicle_brand_Maserati',
    'Vehicle_brand_Jeep',
    'Vehicle_brand_Lancia',
    'Vehicle_brand_Kia',
    'Vehicle_brand_Hyundai',
    'Vehicle_brand_Honda',
    'Vehicle_brand_Lamborghini',
    'Vehicle_brand_Ligier',
    'Vehicle_brand_Isuzu',
    'Vehicle_brand_Land_Rover',
    'Vehicle_brand_Mercedes-Benz',
    'Vehicle_brand_McLaren',
    'Gearbox_Manualna',
    'Number_of_doors_6.0'
]

In [50]:
selected_features

['Mileage',
 'Power',
 'Offer_type_Osoby_prywatnej',
 'Condition_Używane',
 'Vehicle_brand_Infiniti',
 'Vehicle_brand_Lexus',
 'Vehicle_brand_Jaguar',
 'Vehicle_brand_Maserati',
 'Vehicle_brand_Jeep',
 'Vehicle_brand_Lancia',
 'Vehicle_brand_Kia',
 'Vehicle_brand_Hyundai',
 'Vehicle_brand_Honda',
 'Vehicle_brand_Lamborghini',
 'Vehicle_brand_Ligier',
 'Vehicle_brand_Isuzu',
 'Vehicle_brand_Land_Rover',
 'Vehicle_brand_Mercedes-Benz',
 'Vehicle_brand_McLaren',
 'Gearbox_Manualna',
 'Number_of_doors_6.0']

In [51]:
X_train = X_train[selected_features]
X_val = X_val[selected_features]
X_test = X_test[selected_features]

In [52]:
X_train

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Condition_Używane,Vehicle_brand_Infiniti,Vehicle_brand_Lexus,Vehicle_brand_Jaguar,Vehicle_brand_Maserati,Vehicle_brand_Jeep,Vehicle_brand_Lancia,...,Vehicle_brand_Hyundai,Vehicle_brand_Honda,Vehicle_brand_Lamborghini,Vehicle_brand_Ligier,Vehicle_brand_Isuzu,Vehicle_brand_Land_Rover,Vehicle_brand_Mercedes-Benz,Vehicle_brand_McLaren,Gearbox_Manualna,Number_of_doors_6.0
15888,0.471247,-0.058026,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27697,-0.449033,0.246575,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1399,0.017389,-0.486724,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
32962,-0.968580,2.626977,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23860,0.802631,0.821933,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,-0.006428,-0.204686,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6265,0.630232,0.788089,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11284,-0.236487,-0.746199,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
860,-0.968533,-0.712354,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [53]:
regressors = {}
regressors.update({"Linear Regression": LinearRegression(n_jobs=-1)})
regressors.update({"Ridge": Ridge()})
regressors.update({"Lasso": Lasso()})
regressors.update({"ElasticNet": ElasticNet()})
regressors.update({"Decision Tree": DecisionTreeRegressor()})
regressors.update({"Random Forest": RandomForestRegressor(n_jobs=-1)})
regressors.update({"Gradient Boosting": GradientBoostingRegressor()})
regressors.update({"AdaBoost": AdaBoostRegressor()})
regressors.update({"Extra Trees": ExtraTreesRegressor(n_jobs=-1)})
regressors.update({"XGBoost": XGBRegressor(n_jobs=-1)})

In [54]:
parameters = {}

parameters.update({"Linear Regression": {}})

parameters.update({"Ridge": {"alpha": (1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0)}})

parameters.update({"Lasso": {"alpha": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0]}})

parameters.update({"ElasticNet": {"alpha": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.25, 0.50, 0.75, 1.0],
                                  "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]}})

parameters.update({"Decision Tree": {"max_depth": [None, 5, 10, 20, 50, 100],
                                     "min_samples_split": [2, 5, 10],
                                     "min_samples_leaf": [1, 2, 4]}})

# parameters.update({"Random Forest": {"n_estimators": [50, 100, 200],
#                                      "max_depth": [None, 5, 10, 20, 50, 100],
#                                      "min_samples_split": [2, 5, 10],
#                                      "min_samples_leaf": [1, 2, 4],
#                                      "max_features": [0.5, 0.8, 1.0],
#                                      "bootstrap": [True, False],
#                                      "oob_score": [True, False]}})

parameters.update({"Random Forest": {"n_estimators": [100],
                                     "max_depth": [7],
                                     "min_samples_split": [0.05],
                                     "min_samples_leaf": [0.1],
#                                      "max_features": [0.5, 0.8, 1.0],
                                     "bootstrap": [True],
                                     "oob_score": [True]}})

# parameters.update({"Gradient Boosting": {"learning_rate": [0.001, 0.01, 0.1],
#                                          "n_estimators": [50, 100, 200],
#                                          "max_depth": [3, 5, 7],
#                                          "min_samples_split": [2, 5, 10],
#                                          "min_samples_leaf": [1, 2, 4],
#                                          "subsample": [0.5, 0.8, 1.0],
# #                                          "max_features": [0.5, 0.8, 1.0]}
#                   })

parameters.update({"Gradient Boosting": {"learning_rate": [0.01],
                                         "n_estimators": [100],
                                         "max_depth": [7],
                                         "min_samples_split": [2],
                                         "min_samples_leaf": [2, 4],
                                         "subsample": [0.5, 0.8],
#                                          "max_features": [0.5, 0.8, 1.0]
                  }})

# parameters.update({"AdaBoost": {"estimator": [DecisionTreeRegressor(max_depth=depth) for depth in range(1, 6)],
#                                 "n_estimators": [50, 100, 200],
#                                 "learning_rate": [0.001, 0.01, 0.1, 1.0],
#                                 "loss": ["linear", "square", "exponential"]}})


parameters.update({"AdaBoost": {"n_estimators": [50, 100],
                                "learning_rate": [0.01, 0.1],
                                "loss": ["linear", "square", "exponential"]}})

# parameters.update({"Extra Trees": {"n_estimators": [50, 100, 200],
#                                    "max_depth": [None, 5, 10, 20, 50, 100],
#                                    "min_samples_split": [2, 5, 10],
#                                    "min_samples_leaf": [1, 2, 4],
# #                                    "max_features": [0.5, 0.8, 1.0],
#                                    "bootstrap": [True, False],
#                                    "oob_score": [True, False]}})

parameters.update({"Extra Trees": {"n_estimators": [50, 100],
                                   "max_depth": [None, 5, 10, 20],
                                   "min_samples_split": [2, 5, 10],
                                   "min_samples_leaf": [2, 4],
#                                    "max_features": [0.5, 0.8, 1.0],
                                   "bootstrap": [True],
                                   "oob_score": [True]}})

# parameters.update({"XGBoost": {"n_estimators": [50, 100, 200],
#                                "max_depth": [3, 5, 7],
#                                "learning_rate": [0.001, 0.01, 0.1],
#                                "subsample": [0.5, 0.8, 1.0],
#                                "colsample_bytree": [0.5, 0.8, 1.0]}})

parameters.update({"XGBoost": {"n_estimators": [50, 100],
                               "max_depth": [3, 5, 7],
                               "learning_rate": [0.01, 0.1],
                               "subsample": [0.5, 0.8],
                               "colsample_bytree": [0.5, 0.8]}})

https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [65]:
results = {}

for regressor_label, regressor in regressors.items():

    print("\n" + f"Teraz trenuje: {regressor_label}")

    steps = [("regressor", regressor)]

    pipeline = Pipeline(steps=steps, verbose=1)

    param_grid = parameters[regressor_label]  # Access the parameter grid correctly

    gscv = GridSearchCV(regressor, param_grid, cv=5, n_jobs=-1, verbose=1, scoring="explained_variance")

    gscv.fit(X_train, np.ravel(y_train))  

    best_params = gscv.best_params_
    best_score = gscv.best_score_

    regressor.set_params(**best_params)

    y_pred = gscv.predict(X_val)

    scoring = explained_variance_score(y_val, y_pred)

    result = {
        "Regressor": gscv,
        "Best Parameters": best_params,
        "Train": best_score,
        "Val": scoring
    }

    print("Val:", '{:.4f}'.format(result["Val"]))

    results.update({regressor_label: result})



Teraz trenuje: Linear Regression
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Val: -275903341.8583

Teraz trenuje: Ridge
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Val: -275903341.9980

Teraz trenuje: Lasso
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Val: -275903341.8675

Teraz trenuje: ElasticNet
Fitting 5 folds for each of 55 candidates, totalling 275 fits
Val: -275903666.5101

Teraz trenuje: Decision Tree
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Val: 0.0173

Teraz trenuje: Random Forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Val: -0.0000

Teraz trenuje: Gradient Boosting
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Val: 0.0628

Teraz trenuje: AdaBoost
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Val: 0.0889

Teraz trenuje: Extra Trees
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Val: 0.3285

Teraz trenuje: XGBoost
Fitting 5 folds for each of 48 c

### Extra Trees

In [77]:
results['Extra Trees']['Best Parameters']

{'bootstrap': True,
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50,
 'oob_score': True}

In [69]:
# Create the Extra Trees regressor
et_regressor = ExtraTreesRegressor(**{'bootstrap': True,
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50,
 'oob_score': True})

# Train the model
et_regressor.fit(X_train, y_train)

In [70]:
import pickle

# Saving with pickle
with open('/home/konradballegro/notebooks/outputs/models/et_model.pkl', 'wb') as file:
    pickle.dump(et_regressor, file)

# Saving with joblib
import joblib

joblib.dump(et_regressor, 'et_model.joblib')

['et_model.joblib']

In [71]:
import pickle

# Load the model from pickle file
with open('/home/konradballegro/notebooks/outputs/models/et_model.pkl', 'rb') as file:
    et_model = pickle.load(file)

In [72]:
import joblib

# Load the model from joblib file
model = joblib.load('et_model.joblib')

In [73]:
# Use the loaded model for predictions
y_pred = model.predict(X_test)

In [75]:
# Evaluate the model using mean squared error
mse = explained_variance_score(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.3537829677025913


### XGB

In [None]:
results['XGBoost']['Best Parameters']

In [None]:
results['Gradient Boosting']['Best Parameters']

In [None]:
# Instantiate the GradientBoostingRegressor with the specified parameters
gb_regressor = GradientBoostingRegressor(learning_rate=0.01,
                                      max_depth=7,
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=100,
                                      subsample=0.8)

# Train the model
gb_regressor.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = gb_regressor.predict(X_val)

# Evaluate the model using mean squared error
mean_squared_error = mean_squared_error(y_val, y_pred)
print("mean_squared_error:", mean_squared_error)

In [None]:
import pickle

# Saving with pickle
with open('/home/konradballegro/notebooks/outputs/models/gb_model.pkl', 'wb') as file:
    pickle.dump(gb_regressor, file)

# Saving with joblib
import joblib

joblib.dump(gb_regressor, 'gb_model.joblib')

In [None]:
import pickle

# Load the model from pickle file
with open('/home/konradballegro/notebooks/outputs/models/gb_model.pkl', 'rb') as file:
    gb_model = pickle.load(file)

In [None]:
import joblib

# Load the model from joblib file
model = joblib.load('gb_model.joblib')

In [None]:
# Use the loaded model for predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

### XGBoost

In [None]:
# Create an instance of the XGBoost Regressor
xgb_regressor = XGBRegressor(**results['XGBoost']['Best Parameters'])

# Fit the regressor to the training data
xgb_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_regressor.predict(X_val)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error:", mse)


In [None]:
xgb_regressor.save_model('/home/konradballegro/notebooks/outputs/models/xgb_model.model')

In [None]:
model = XGBRegressor()
model.load_model('/home/konradballegro/notebooks/outputs/models/xgb_model.model')

In [None]:
# Use the loaded model for predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)