https://www.kdnuggets.com/2022/03/loss-functions-explainer.html

https://machinelearningmastery.com/rfe-feature-selection-in-python/

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [1]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, when

In [2]:
import numpy as np
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from IPython.display import display, HTML
import warnings

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

In [74]:
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [7]:
display(HTML("<style>.container { width:100% !important; }</style>"))
warnings.filterwarnings('ignore')
# pd.options.display.float_format = "{:.2f}".format

In [8]:
pyspark.__version__

'3.3.2'

In [9]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('regression') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/21 13:05:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "True") \
    .csv("/home/konradballegro/scripts/scraper/outputs/data/offers.csv")

                                                                                

In [11]:
# Count the number of rows in the DataFrame
num_rows = df.count()

# Count the number of columns in the DataFrame
num_cols = len(df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  129589
Number of columns:  230


In [12]:
headers = df.columns
for h, head in enumerate(headers):
    print(f"{h}: {head}")

0: Offer from
1: Category
2: Show offers with VIN number
3: Has registration number
4: Vehicle brand
5: Vehicle model
6: Version
7: Generation
8: Year of production
9: Mileage
10: Engine capacity
11: Fuel type
12: Power
13: Gearbox
14: Range
15: Drive
16: Battery capacity
17: Battery ownership type
18: CO2 emissions
19: Particulate filter
20: City fuel consumption
21: Body type
22: Number of doors
23: Number of seats
24: Color
25: Metallic
26: Color type
27: Right-hand drive (Anglik)
28: Country of origin
29: Leasing
30: VAT margin
31: VAT invoice
32: Manufacturer warranty period
33: Financing possibility
34: First registration
35: Registered in Poland
36: First owner
37: Accident-free
38: Serviced at authorized service center
39: Condition
40: ABS
41: Apple CarPlay
42: Android Auto
43: Rear side airbags43
44: Driver side airbag
45: CD
46: Central locking
47: Electric front windows
48: Electrically adjustable passenger seat
49: Electrically adjustable mirrors
50: Immobilizer
51: Driver

In [13]:
# Sample 20% of the rows without replacement
sampled_df = df.sample(withReplacement=False, fraction=0.2)

In [14]:
# Count the number of rows in the DataFrame
num_rows = sampled_df.count()

# Count the number of columns in the DataFrame
num_cols = len(sampled_df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  25944
Number of columns:  230


In [15]:
price = sampled_df.select("Price").rdd.flatMap(lambda x: x).collect()

                                                                                

In [16]:
# # Assuming you have a PySpark DataFrame called 'sampled_df' with a column 'Price'
# # Extract the 'Price' column as a list
# price_data = sampled_df.select("Price").rdd.flatMap(lambda x: x).collect()

# # Create a histogram using Seaborn
# sns.histplot(data=price_data, bins=10, kde=True)

# # Customize the plot
# plt.xlabel("Price")
# plt.ylabel("Frequency")
# plt.title("Histogram of Prices")

# # Show the plot
# plt.show()


In [17]:
df = df.filter((df["Currency"] == "PLN") &
                        (df["Country of origin"] == "Polska") &
                        (df["Accident-free"].isNotNull()) &
                        (df["Price"].isNotNull()) &
                        (df["Offer from"].isNotNull()) &
                        (df["Condition"].isNotNull()) &
                        (df["Vehicle brand"].isNotNull()) &
                        (df["Vehicle model"].isNotNull()) &
                        (df["Year of production"].isNotNull()) &
                        (df["Mileage"].isNotNull()) &
                        (df["Fuel type"].isNotNull()) &
                        (df["Power"].isNotNull()) &
                        (df["Gearbox"].isNotNull()) &
                        (df["Body type"].isNotNull()) &
                        (df["Number of doors"].isNotNull())
                       ).select(col("Price").cast("float").alias("Price"),
                                "Offer from",
                                "Condition",
                                "Vehicle brand",
                                "Vehicle model",
                                col("Year of production").cast("string").alias("Year of production"),
                                regexp_replace(regexp_replace(col("Mileage"), " ", ""), "km", "").cast("float").alias("Mileage"),
                                "Fuel type",
                                regexp_replace(regexp_replace(col("Power"), " ", ""), "KM", "").cast("integer").alias("Power"),
                                "Gearbox",
                                "Body type",
                                "Number of doors",
                                "URL path",
                                "ID",
                                "Epoch"
                               )

In [18]:
# Count the number of rows in the DataFrame
num_rows = df.count()

# Count the number of columns in the DataFrame
num_cols = len(df.columns)

# Print the shape of the DataFrame
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)



Number of rows:  33225
Number of columns:  15


                                                                                

In [19]:
df.show()

+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
|   Price|     Offer from|Condition|Vehicle brand|Vehicle model|Year of production| Mileage|  Fuel type|Power|     Gearbox|    Body type|Number of doors|            URL path|                  ID|     Epoch|
+--------+---------------+---------+-------------+-------------+------------------+--------+-----------+-----+------------+-------------+---------------+--------------------+--------------------+----------+
| 28000.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2010|189347.0|    Benzyna|  140|    Manualna|        Sedan|            4.0|https://www.otomo...|000d1349f23a1d685...|1687241613|
| 92900.0|Osoby prywatnej|  Używane|        Honda|        Civic|              2018| 81240.0|    Benzyna|  182|    Manualna|      Kompakt|            5.0|https://www.otomo..

In [20]:
distinct_offers = df.select("Offer from").distinct().rdd.flatMap(lambda x: x).collect()
distinct_conditions = df.select("Condition").distinct().rdd.flatMap(lambda x: x).collect()
distinct_brands = df.select("Vehicle brand").distinct().rdd.flatMap(lambda x: x).collect()
distinct_models = df.select("Vehicle model").distinct().rdd.flatMap(lambda x: x).collect()
distinct_years = df.select("Year of production").distinct().rdd.flatMap(lambda x: x).collect()
distinct_fuel = df.select("Fuel type").distinct().rdd.flatMap(lambda x: x).collect()
distinct_gearbox = df.select("Gearbox").distinct().rdd.flatMap(lambda x: x).collect()
distinct_body = df.select("Body type").distinct().rdd.flatMap(lambda x: x).collect()
distinct_doors = df.select("Number of doors").distinct().rdd.flatMap(lambda x: x).collect()



In [21]:
for offer in distinct_offers:
    column_name = "Offer_type_" + offer.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Offer from"] == offer, 1).otherwise(0))

In [22]:
for condition in distinct_conditions:
    column_name = "Condition_" + condition.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Condition"] == condition, 1).otherwise(0))

In [23]:
for model in distinct_models:
    column_name = "Vehicle_model_" + model.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Vehicle model"] == model, 1).otherwise(0))

In [24]:
for year in distinct_years:
    column_name = "Year_of_production_" + str(year)
    df = df.withColumn(column_name, when(df["Year of production"] == year, 1).otherwise(0))

In [25]:
for fuel in distinct_fuel:
    column_name = "Fuel_type_" + fuel.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Fuel type"] == fuel, 1).otherwise(0))

In [26]:
for gearbox in distinct_gearbox:
    column_name = "Gearbox_" + gearbox.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Gearbox"] == gearbox, 1).otherwise(0))

In [27]:
for body in distinct_body:
    column_name = "Body_type_" + body.replace(" ", "_")
    df = df.withColumn(column_name, when(df["Body type"] == body, 1).otherwise(0))

In [28]:
for doors in distinct_doors:
    column_name = "Number_of_doors_" + str(doors)
    df = df.withColumn(column_name, when(df["Number of doors"] == doors, 1).otherwise(0))

In [29]:
# Assuming your DataFrame is called df
columns_to_drop = ["Offer from", "Condition", "Vehicle brand", "Vehicle model", "Year of production", "Fuel type", "Gearbox", "Body type", "Number of doors", "URL path", "ID", "Epoch"]

# Drop the specified columns
df = df.drop(*columns_to_drop)

In [30]:
df = df.filter(df["Price"].isNotNull())

In [31]:
df_pandas = df.toPandas()

23/06/21 13:07:21 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [32]:
df_pandas

Unnamed: 0,Price,Mileage,Power,Offer_type_Osoby_prywatnej,Offer_type_Firmy,Condition_Używane,Condition_Nowe,Vehicle_model_Viano,Vehicle_model_SC,Vehicle_model_W201_(190),...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
0,28000.0,189347.0,140,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,92900.0,81240.0,182,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,89000.0,86500.0,155,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,48900.0,218350.0,150,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,7500.0,245000.0,90,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33192,259000.0,3000.0,252,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33193,316900.0,1.0,252,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33194,340700.0,1.0,252,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
33195,359283.0,6200.0,292,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [33]:
df_pandas.isnull().sum()

Price                         0
Mileage                       0
Power                         0
Offer_type_Osoby_prywatnej    0
Offer_type_Firmy              0
                             ..
Number_of_doors_4.0           0
Number_of_doors_3.0           0
Number_of_doors_2.0           0
Number_of_doors_5.0           0
Number_of_doors_6.0           0
Length: 550, dtype: int64

In [34]:
df_pandas.to_csv('/home/konradballegro/notebooks/outputs/data/offers.csv')

In [35]:
y = df_pandas["Price"]
y

0         28000.0
1         92900.0
2         89000.0
3         48900.0
4          7500.0
           ...   
33192    259000.0
33193    316900.0
33194    340700.0
33195    359283.0
33196    268000.0
Name: Price, Length: 33197, dtype: float32

In [36]:
X = df_pandas.loc[:, ~df_pandas.columns.isin(["Price"])]
X.head()

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Offer_type_Firmy,Condition_Używane,Condition_Nowe,Vehicle_model_Viano,Vehicle_model_SC,Vehicle_model_W201_(190),Vehicle_model_SL,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
0,189347.0,140,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,81240.0,182,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,86500.0,155,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,218350.0,150,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,245000.0,90,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [37]:
train_size = 0.8

In [38]:
from sklearn.model_selection import train_test_split

# Split data into train and remaining data
X_train, X_remain, y_train, y_remain = train_test_split(X, y, test_size=0.3, random_state=42)

# Split remaining data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=42)


In [39]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (23237, 549)
Shape of X_val: (4980, 549)
Shape of X_test: (4980, 549)
Shape of y_train: (23237,)
Shape of y_val: (4980,)
Shape of y_test: (4980,)


In [40]:
corr_matrix = X_train.corr(method = "spearman").abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_train = X_train.drop(to_drop, axis = 1)
X_val = X_val.drop(to_drop, axis = 1)
X_test = X_test.drop(to_drop, axis = 1)

In [41]:
X_train

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Condition_Używane,Vehicle_model_Viano,Vehicle_model_SC,Vehicle_model_W201_(190),Vehicle_model_SL,Vehicle_model_EQA,Vehicle_model_Gladiator,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
15888,121936.0,163,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
27697,44000.0,190,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1399,83500.0,125,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
32962,1.0,401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23860,150000.0,241,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,81483.0,150,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6265,135400.0,238,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11284,62000.0,102,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
860,5.0,105,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [42]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming df is your DataFrame containing the data
subset = X_train[["Mileage", "Power"]]

scaler = StandardScaler()
scaler.fit(subset)
subset_scaled = scaler.transform(subset)

X_train["Mileage"] = subset_scaled[:, 0]
X_train["Power"] = subset_scaled[:, 1]

In [43]:
X_train

Unnamed: 0,Mileage,Power,Offer_type_Osoby_prywatnej,Condition_Używane,Vehicle_model_Viano,Vehicle_model_SC,Vehicle_model_W201_(190),Vehicle_model_SL,Vehicle_model_EQA,Vehicle_model_Gladiator,...,Body_type_Coupe,Body_type_Minivan,Body_type_Kompakt,Body_type_Auta_miejskie,Body_type_Kombi,Number_of_doors_4.0,Number_of_doors_3.0,Number_of_doors_2.0,Number_of_doors_5.0,Number_of_doors_6.0
15888,0.471247,-0.058026,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
27697,-0.449033,0.246575,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1399,0.017389,-0.486724,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
32962,-0.968580,2.626977,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23860,0.802631,0.821933,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,-0.006428,-0.204686,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6265,0.630232,0.788089,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11284,-0.236487,-0.746199,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
860,-0.968533,-0.712354,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


neg_mean_absolute_error

In [None]:
regressors = {}
regressors.update({"Random Forest": RandomForestRegressor(n_jobs=-1)})
FEATURE_IMPORTANCE = {"Random Forest"}

parameters = {}
parameters.update({"Random Forest": { 
                                    "regressor__n_estimators": [100],
                                    "regressor__max_features": ["sqrt", "log2"],
                                    "regressor__max_depth" : [4, 5, 6, 7],
                                    "regressor__min_samples_split": [0.05, 0.10],
                                    "regressor__min_samples_leaf": [0.05, 0.10],
                                    "regressor__criterion" :["squared_error", "absolute_error"],
                                    "regressor__n_jobs": [-1],
                                    "verbose": [1]
                                     }})

selected_regressor = "Random Forest"
scoring = "neg_mean_absolute_error"

selected_regressor = selected_regressor
regressor = regressors[selected_regressor]

In [None]:
steps = [("regressor", regressor)]

In [None]:
pipeline = Pipeline(steps=steps)

param_grid = parameters[selected_regressor]

gscv = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1, scoring=scoring)

In [None]:
print(f"Now tuning {selected_regressor}")
start_time = time.time()
gscv.fit(X_train, np.ravel(y_train))
end_time = time.time()
elapsed_time = end_time - start_time

print("Grid Search CV Results:")
print("Best Score:", gscv.best_score_)
print("Best Parameters:", gscv.best_params_)
print(f"Elapsed Time: {elapsed_time} seconds")

### Champion model trained on Kaggle:

Grid Search CV Results:
Best Score: -42966.1561527922
Best Parameters: {'regressor__criterion': 'absolute_error', 'regressor__max_depth': 7, 'regressor__max_features': 'auto', 'regressor__min_samples_leaf': 0.05, 'regressor__min_samples_split': 0.1, 'regressor__n_estimators': 100, 'regressor__n_jobs': -1, 'verbose': 1}
Elapsed Time: 20688.335080623627 seconds

In [53]:
best_params = {'regressor__criterion': 'absolute_error', 'regressor__max_depth': 7, 'regressor__max_features': 'auto', 'regressor__min_samples_leaf': 0.05, 'regressor__min_samples_split': 0.1, 'regressor__n_estimators': 100, 'regressor__n_jobs': -1, 'verbose': 1}

In [54]:
best_score = -42966.1561527922

In [62]:
regressors = {}
regressors.update({"Random Forest": RandomForestRegressor(n_jobs=-1)})
FEATURE_IMPORTANCE = {"Random Forest"}

parameters = {}
parameters.update({"Random Forest": { 
                                    "regressor__n_estimators": [100],
                                    "regressor__max_depth" : [7],
                                    "regressor__min_samples_split": [0.05],
                                    "regressor__min_samples_leaf": [0.10],
                                    "regressor__criterion" :["absolute_error"],
                                    "regressor__n_jobs": [-1],
                                    "verbose": [1]
                                     }})

selected_regressor = "Random Forest"
scoring = "neg_mean_absolute_error"

selected_regressor = selected_regressor
regressor = regressors[selected_regressor]

In [63]:
steps = [("regressor", regressor)]

In [64]:
pipeline = Pipeline(steps=steps)

param_grid = parameters[selected_regressor]

gscv = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1, scoring=scoring)

In [65]:
print(f"Now tuning {selected_regressor}")
start_time = time.time()
gscv.fit(X_train, np.ravel(y_train))
end_time = time.time()
elapsed_time = end_time - start_time

print("Grid Search CV Results:")
print("Best Score:", gscv.best_score_)
print("Best Parameters:", gscv.best_params_)
print(f"Elapsed Time: {elapsed_time} seconds")

Now tuning Random Forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 4.3min
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 4.3min
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 4.3min
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 4.3min
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 1.1min
[Pipeline] ......... (step 1 of 1) Processing regressor, total= 1.7min
Grid Search CV Results:
Best Score: -50094.271491257015
Best Parameters: {'regressor__criterion': 'absolute_error', 'regressor__max_depth': 7, 'regressor__min_samples_leaf': 0.1, 'regressor__min_samples_split': 0.05, 'regressor__n_estimators': 100, 'regressor__n_jobs': -1, 'verbose': 1}
Elapsed Time: 426.6415042877197 seconds


In [67]:
best_params = gscv.best_params_
best_score = gscv.best_score_

tuned_params = {item[11:]: best_params[item] for item in best_params if item.startswith('regressor__')}
regressor.set_params(**tuned_params)

In [68]:
print("Tuned Regressor:")
print(regressor)

Tuned Regressor:
RandomForestRegressor(criterion='absolute_error', max_depth=7,
                      min_samples_leaf=0.1, min_samples_split=0.05, n_jobs=-1)


In [81]:
class PipelineRFE(Pipeline):
    def fit(self, X, y=None, **fit_params):
        super(PipelineRFE, self).fit(X, y, **fit_params)
        self.feature_importances_ = self.steps[-1][-1].feature_importances_
        return self

steps = [("regressor", regressor)]
pipe = PipelineRFE(steps=steps)

feature_selector = RFECV(pipe, cv=5, step=1, scoring=scoring, verbose=1, n_jobs=-1)
feature_selector = RFE(pipe, n_features_to_select = 13, step = 50, verbose = 1)

In [82]:
print("Now performing feature selection:")
start_time = time.time()
feature_selector.fit(X_train, np.ravel(y_train))
end_time = time.time()
elapsed_time = end_time - start_time

Now performing feature selection:
Fitting estimator with 545 features.
Fitting estimator with 495 features.
Fitting estimator with 445 features.
Fitting estimator with 395 features.
Fitting estimator with 345 features.
Fitting estimator with 295 features.
Fitting estimator with 245 features.
Fitting estimator with 195 features.
Fitting estimator with 145 features.
Fitting estimator with 95 features.
Fitting estimator with 45 features.


In [83]:
print("Feature Selector Results:")
print("Optimal Number of Features:", feature_selector.n_features_)
print("Selected Features:", feature_selector.support_)
print(f"Elapsed Time: {elapsed_time} seconds")

Feature Selector Results:
Optimal Number of Features: 13
Selected Features: [ True  True False False False False False False False False  True  True
  True  True  True  True  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False

In [84]:
feature_names = X_train.columns
selected_features = feature_names[feature_selector.support_].tolist()
print("Selected Feature Names:", selected_features)

Selected Feature Names: ['Mileage', 'Power', 'Vehicle_model_GranCabrio', 'Vehicle_model_570_GT', 'Vehicle_model_FR-V', 'Vehicle_model_Delta', 'Vehicle_model_Q50', 'Vehicle_model_3', 'Vehicle_model_Cherokee', 'Vehicle_model_Wrangler', 'Vehicle_model_Optima', 'Vehicle_model_EQE', 'Gearbox_Manualna']


In [85]:
ranked_features = feature_names[feature_selector.ranking_].tolist()
print("Selected Features Ranked:", ranked_features)

Selected Features Ranked: ['Power', 'Power', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Power', 'Power', 'Power', 'Power', 'Power', 'Power', 'Power', 'Power', 'Power', 'Condition_Używane', 'Condition_Używane', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_prywatnej', 'Offer_type_Osoby_pryw