In [179]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import IsolationForest

In [180]:
# Load the raw Audi listings; the path is relative to the notebook's working directory.
audi = pd.read_csv("datasets/audi_challenge.csv")
# Display the frame as the cell output for a first look at the columns.
audi

Unnamed: 0,index,car description,price (euro),age (year),fuel type,transmission,bodystyle,car model,mileage (km)
0,0,"Audi A3 1.8 TFSI 160pk S tronic S-Line (Xenon,...",13650.0,9,Petrol,Automatic,Cabriolet,A3,115174
1,1,Audi A7 Sportback 55 TFSI 340pk Quattro Pro Li...,91800.0,1,Petrol,Automatic,Hatchback,A7,8527
2,2,Audi A3 1.6 TDI 110 PK S-Line | Xenon | Bang &...,26150.0,3,Diesel,Manual,Sedan,A3,82345
3,3,Audi A8 Lang 3.0 TDI 258pk Tiptronic Quattro P...,50900.0,4,Diesel,Automatic,Sedan,A8,144772
4,4,Audi TT 1.8 TFSI Coupe Pro Line -MMI+/Xenon/St...,37700.0,2,Petrol,Manual,Coupe,TT,16652
...,...,...,...,...,...,...,...,...,...
7917,7917,Audi A1 1.0 TFSI 70KW SPORTBACK S-TR,,4,Petrol,Automatic,Hatchback,A1,62170
7918,7918,Audi S4 Audi S4 4.2 V8 quattro Advance AUTM. S...,,14,Petrol,Automatic,Station wagon,S4,156805
7919,7919,"Audi A5 Sportback | Nu inclusief € 2.000,- Voo...",,1,Petrol,Automatic,Hatchback,A5,0
7920,7920,Audi A3 1.9 TDI Attraction Pro Line Business,,12,Diesel,Manual,Hatchback,A3,285491


In [181]:
# Drop the "index" and "car description" columns; neither is listed among the
# model's numeric or categorical attributes further down.
# errors="ignore" keeps this cell idempotent: because it rebinds `audi` to its
# own output, re-running it would otherwise raise KeyError once the columns
# are already gone.
audi = audi.drop(columns=["index", "car description"], errors="ignore")


In [182]:
# Partition the listings by whether a price is present: priced rows are used
# for training/evaluation, unpriced rows are the ones to predict later.
price_is_known = audi['price (euro)'].notna()
audi_with_price = audi[price_is_known]
audi_without_price = audi[~price_is_known]

In [183]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the priced rows as a test set; the fixed random_state makes
# the split reproducible across runs.
train_set, test_set = train_test_split(
    audi_with_price, test_size=0.2, random_state=42)

In [184]:
def _split_features_target(frame, target="price (euro)"):
    """Return (features, target) where features is `frame` without the target
    column and target is an independent copy of that column."""
    return frame.drop(columns=[target]), frame[target].copy()


# Separate features from the target variable for both splits.
X_train, y_train = _split_features_target(train_set)
X_test, y_test = _split_features_target(test_set)

In [185]:
numerical_features = ['age (year)', 'mileage (km)']
target_variable_name = 'price (euro)'


def _iqr_summary(values):
    """Return Q1, Q3, IQR and the 1.5*IQR lower/upper fences of a Series."""
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    return {
        'Q1': q1,
        'Q3': q3,
        'IQR': iqr,
        'lower_bound': q1 - 1.5 * iqr,
        'upper_bound': q3 + 1.5 * iqr,
    }


def _print_iqr_summary(label, name, summary):
    """Print one summary in the notebook's report format."""
    print(
        f"{label}: {name}\n"
        f"  Q1: {summary['Q1']:.2f}, Q3: {summary['Q3']:.2f}, IQR: {summary['IQR']:.2f}\n"
        f"  Lower Bound: {summary['lower_bound']:.2f}, Upper Bound: {summary['upper_bound']:.2f}\n"
    )


# All bounds are derived from the training split only, so nothing about the
# test split leaks into the thresholds.
outlier_bounds = {}
for feature in numerical_features:
    outlier_bounds[feature] = _iqr_summary(X_train[feature])
    _print_iqr_summary("Feature", feature, outlier_bounds[feature])

outlier_bounds[target_variable_name] = _iqr_summary(y_train)
_print_iqr_summary("Target Variable", target_variable_name, outlier_bounds[target_variable_name])

print("Calculated IQR and outlier bounds for all specified features and the target variable.")

# Cap (winsorise) the numeric features of both splits with the train-derived fences.
X_train_treated = X_train.copy()
X_test_treated = X_test.copy()
for feature in numerical_features:
    lower_bound = outlier_bounds[feature]['lower_bound']
    upper_bound = outlier_bounds[feature]['upper_bound']
    for frame in (X_train_treated, X_test_treated):
        frame[feature] = np.clip(frame[feature], lower_bound, upper_bound)

# Cap the training target as well.
# NOTE(review): only y_train is capped; y_test is left as-is. Anything scored
# against y_train_treated is therefore measured against capped prices —
# confirm this is intended.
lower_bound_target = outlier_bounds[target_variable_name]['lower_bound']
upper_bound_target = outlier_bounds[target_variable_name]['upper_bound']
y_train_treated = np.clip(y_train.copy(), lower_bound_target, upper_bound_target)

print("Applied capping to numerical features in X_train_treated, X_test_treated, and to y_train_treated.")

Feature: age (year)
  Q1: 1.00, Q3: 8.00, IQR: 7.00
  Lower Bound: -9.50, Upper Bound: 18.50

Feature: mileage (km)
  Q1: 11038.00, Q3: 143460.00, IQR: 132422.00
  Lower Bound: -187595.00, Upper Bound: 342093.00

Target Variable: price (euro)
  Q1: 15700.00, Q3: 40950.00, IQR: 25250.00
  Lower Bound: -22175.00, Upper Bound: 78825.00

Calculated IQR and outlier bounds for all specified features and the target variable.
Applied capping to numerical features in X_train_treated, X_test_treated, and to y_train_treated.


In [186]:
# Show the untreated training features for comparison with the capped version.
X_train

Unnamed: 0,age (year),fuel type,transmission,bodystyle,car model,mileage (km)
3289,7,Petrol,Manual,Hatchback,A1,94320
2077,5,Petrol,Manual,Hatchback,A3,58705
443,7,Diesel,Manual,Station wagon,A6,163348
2812,3,Petrol,Manual,Sedan,A4,13804
1554,1,Petrol,Automatic,Hatchback,A5,0
...,...,...,...,...,...,...
3772,1,Diesel,Manual,Hatchback,A3,0
5191,12,Diesel,Manual,Hatchback,A3,264114
5226,8,Petrol,Manual,Hatchback,A1,138000
5390,4,Petrol,Manual,Hatchback,A3,63352


In [187]:
# Show the training features after capping. In the rendered output
# `age (year)` appears as floats (e.g. 7.0) — presumably because its clip
# bounds are non-integer; TODO confirm.
X_train_treated

Unnamed: 0,age (year),fuel type,transmission,bodystyle,car model,mileage (km)
3289,7.0,Petrol,Manual,Hatchback,A1,94320
2077,5.0,Petrol,Manual,Hatchback,A3,58705
443,7.0,Diesel,Manual,Station wagon,A6,163348
2812,3.0,Petrol,Manual,Sedan,A4,13804
1554,1.0,Petrol,Automatic,Hatchback,A5,0
...,...,...,...,...,...,...
3772,1.0,Diesel,Manual,Hatchback,A3,0
5191,12.0,Diesel,Manual,Hatchback,A3,264114
5226,8.0,Petrol,Manual,Hatchback,A1,138000
5390,4.0,Petrol,Manual,Hatchback,A3,63352


# Preprocessing

In [188]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. With ColumnTransformer's default `remainder` setting, columns
# not listed below are dropped.
num_attribs = ["age (year)", "mileage (km)"]
cat_attribs = ["fuel type", "transmission", "bodystyle", "car model"]

# handle_unknown='ignore' lets the encoder transform categories it did not
# see during fit instead of raising.
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preprocessing = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_attribs),
        ("num", StandardScaler(), num_attribs),
    ],
)

# Hyperparameter tuning

In [189]:
from scipy.stats import randint, uniform
from xgboost import XGBRegressor

# Define the parameter distribution dictionary for XGBoost within the pipeline.
# Keys are prefixed with "xgbregressor__", the step name make_pipeline assigns
# to the XGBRegressor (lower-cased class name), so the search can route each
# parameter to that step.
# scipy's randint excludes `high`; uniform(loc, scale) spans [loc, loc + scale].

param_distribs = {
    'xgbregressor__n_estimators': randint(low=100, high=1000),  # 100..999 trees
    'xgbregressor__learning_rate': uniform(loc=0.01, scale=0.19),  # 0.01..0.20
    'xgbregressor__max_depth': randint(low=3, high=10),  # depth 3..9
    'xgbregressor__subsample': uniform(loc=0.6, scale=0.4),  # 0.6..1.0
    'xgbregressor__colsample_bytree': uniform(loc=0.6, scale=0.4),  # 0.6..1.0
}

# Create an XGBRegressor instance
# NOTE(review): both this estimator and the search below request n_jobs=-1;
# nested parallelism may oversubscribe the CPU — confirm this is intended.
xgb_base = XGBRegressor(random_state=42, n_jobs=-1) # Use n_jobs for parallel processing

# Create a pipeline including preprocessing and the XGBoost regressor, so the
# preprocessing is re-fit inside every cross-validation fold.
xgb_pipeline = make_pipeline(preprocessing, xgb_base)

# Instantiate RandomizedSearchCV: 50 sampled candidates x 5 folds = 250 fits.
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_distribs,
    n_iter=50,  # Increased n_iter for a more thorough search
    cv=5,
    scoring='neg_root_mean_squared_error',  # negated RMSE: higher is better
    random_state=42,  # makes the sampled candidates reproducible
    verbose=2, # To see the progress
    n_jobs=-1 # Use all available cores
)

# Fit RandomizedSearchCV to the capped training data. Because the target was
# capped too, the reported scores are measured against capped prices.
random_search_xgb.fit(X_train_treated, y_train_treated)

print("RandomizedSearchCV fit complete.")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomizedSearchCV fit complete.


In [190]:
# The search maximises negative RMSE, so flip the sign to report a plain RMSE.
best_rmse_tuned_xgb = -random_search_xgb.best_score_

print("Best parameters found: ", random_search_xgb.best_params_)
print("Best RMSE for tuned XGBoost (from RandomizedSearchCV): ", best_rmse_tuned_xgb)

Best parameters found:  {'xgbregressor__colsample_bytree': np.float64(0.7077649335194086), 'xgbregressor__learning_rate': np.float64(0.05638384922707709), 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 473, 'xgbregressor__subsample': np.float64(0.687505687829228)}
Best RMSE for tuned XGBoost (from RandomizedSearchCV):  4719.561060516477


# Model

In [191]:

# Alternatively, if you have different types of features, you can use ColumnTransformer
# numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

In [192]:
# Fit the ColumnTransformer on all priced rows and keep the transformed matrix.
# NOTE(review): `audi_prepared` is not referenced by any other cell visible in
# this notebook; the pipelines elsewhere fit the preprocessing themselves.
# This cell may be removable — confirm.
audi_prepared = preprocessing.fit_transform(audi_with_price)


In [193]:

# Imports for model training and evaluation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Random forest baseline (disabled; kept for reference)
# forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
# forest_rmses = -cross_val_score(forest_reg, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)


In [194]:


# Rebuild an XGBoost regressor from the hyperparameters chosen by the random
# search. The search keys carry the pipeline step prefix ("xgbregressor__"),
# which is stripped so they can be passed straight to the constructor.
tuned_xgb_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search_xgb.best_params_.items()
}

# Wrap the tuned regressor in the same preprocessing used during the search.
xgb_reg = make_pipeline(
    preprocessing,
    XGBRegressor(**tuned_xgb_params, random_state=42),
)

# 10-fold cross-validation on the capped training data. The scorer yields
# negative MSE per fold, so negate and take the square root to obtain RMSE.
xgb_scores = cross_val_score(
    xgb_reg,
    X_train_treated,
    y_train_treated,
    scoring='neg_mean_squared_error',
    cv=10,
)
xgb_rmse = np.sqrt(-xgb_scores)

In [195]:
# Summarise the 10 per-fold RMSE values (mean, spread, quartiles, min/max).
pd.Series(xgb_rmse).describe()

count      10.000000
mean     4695.057705
std       153.744933
min      4469.568949
25%      4600.871874
50%      4677.429321
75%      4792.654133
max      4992.162725
dtype: float64

In [201]:
# Rows whose price is missing — these are the rows the model predicts for.
audi_without_price

Unnamed: 0,price (euro),age (year),fuel type,transmission,bodystyle,car model,mileage (km)
7122,,6,Petrol,Manual,Hatchback,A3,126638
7123,,2,Diesel,Manual,Hatchback,A3,7348
7124,,6,Petrol,Manual,Hatchback,A1,78036
7125,,2,Petrol,Automatic,SUV,Q3,27000
7126,,1,Petrol,Automatic,Hatchback,A5,0
...,...,...,...,...,...,...,...
7917,,4,Petrol,Automatic,Hatchback,A1,62170
7918,,14,Petrol,Automatic,Station wagon,S4,156805
7919,,1,Petrol,Automatic,Hatchback,A5,0
7920,,12,Diesel,Manual,Hatchback,A3,285491


In [200]:
# Predict the prices for the rows of audi_without_price.
X_missing = audi_without_price.drop(columns=["price (euro)"])

# Apply the same train-derived caps that were applied to X_train_treated and
# X_test_treated, so the model receives inputs treated the same way as the
# data it was fitted on. X_missing itself is left untouched.
X_missing_treated = X_missing.copy()
for feature in numerical_features:
    X_missing_treated[feature] = np.clip(
        X_missing_treated[feature],
        outlier_bounds[feature]['lower_bound'],
        outlier_bounds[feature]['upper_bound'],
    )

# Predict through the fitted search object, i.e. with the best pipeline it found.
y_missing_pred = random_search_xgb.predict(X_missing_treated)

# Write the predictions to "audi_pred_submission.csv" as "index,predicted price".
# The DataFrame row label is used as the index value.
# NOTE(review): assumes the row label equals the original "index" column that
# was dropped earlier — it does in the rows shown, but confirm for the full file.
predictions_df = pd.DataFrame({
    'index': audi_without_price.index,
    'predicted price': y_missing_pred
})
predictions_df.to_csv("audi_pred_submission.csv", index=False)


In [202]:
# Inspect the submission table that was written to audi_pred_submission.csv.
predictions_df

Unnamed: 0,index,predicted price
0,7122,15800.064453
1,7123,28534.703125
2,7124,14555.946289
3,7125,39595.394531
4,7126,58064.324219
...,...,...
795,7917,22085.302734
796,7918,17243.316406
797,7919,58064.324219
798,7920,5018.868652
