# Imports

In [1]:
# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

# Basic imports
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

# GPU
from cuml.ensemble import RandomForestRegressor
from cuml.svm import SVR
from cuml.neighbors import KNeighborsRegressor
from cuml.linear_model import ElasticNet, LinearRegression
from xgboost import XGBRegressor

In [2]:
# Load the cleaned car listings; the 'Id' column becomes the row index.
df = pd.read_csv(filepath_or_buffer='Data/cars-cleaned.csv', index_col='Id')

# Preprocessing

In [3]:
# Column groups consumed by the preprocessing ColumnTransformer.
# The strings must match the dataset's column headers exactly (including the
# spellings 'Electric Side Mirros' and 'Didabled Accessible').

# Encoded with an explicit category order.
ordinal = ['Engine Volume']

# One-hot encoded.
nominal = [
    'Location', 'Manufacturer', 'Model', 'Category',
    'Fuel type', 'Gear box type', 'Drive wheels', 'Doors',
    'Wheel', 'Color', 'Interior color', 'Interior material',
    'Exchange', 'Technical inspection', 'Catalyst', 'Steering Hydraulics',
    'On-Board Computer', 'Air Conditioning', 'Parking Control', 'Rear View Camera',
    'Electric Side Mirros', 'Climate Control', 'Cruise Control', 'Start-Stop System',
    'Sunroof', 'Heated Seats', 'Memory Seats', 'ABS',
    'ESP', 'Central Locking', 'Alarm System', 'Fog Lamp',
    'Central Screen (Navigation)', 'AUX', 'Bluetooth', 'Multifunction Steering Wheel',
    'Rims', 'Spare Tyre', 'Didabled Accessible', 'Undamaged',
    'Clean History', 'Newly Imported', 'Low Consumption', 'Limited edition',
    'Unpainted', 'One owner', 'Urgently', 'European',
    'Perfect condition', 'For family', 'Pawnshop', 'Third row',
]

# Standardised (zero mean, unit variance).
numeric = ['Year', 'Mileage', 'Cylinders', 'Airbags']

In [4]:
# Category order for the ordinal encoder: every distinct engine volume found
# in the full dataset, in ascending order.
eng_vol_order = list(df["Engine Volume"].unique())
eng_vol_order.sort()

# Column-wise preprocessing:
#   - 'ordinal' columns -> integer codes following eng_vol_order
#   - 'nominal' columns -> one-hot (first level dropped, unseen levels ignored)
#   - 'numeric' columns -> standardised
# Any column not listed in those groups is passed through untouched and ends
# up after the transformed columns in the output.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[eng_vol_order]), ordinal),
        ('nominal', OneHotEncoder(drop='first', handle_unknown='ignore'), nominal),
        ('numeric', StandardScaler(), numeric),
    ],
)

# Hold out 20% of the rows as a test set, stratified on the 'Model' column
# (seeded for reproducibility). No separate validation set is created here.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Model"].to_numpy(),
)

In [5]:
def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    ColumnTransformer returns either a SciPy sparse matrix or a dense ndarray,
    depending on how the density of the stacked output compares with its
    ``sparse_threshold``. Calling ``.toarray()`` unconditionally therefore
    raises AttributeError whenever the result happens to be dense.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Fit the preprocessor on the training rows only, then transform them.
train_preprocessed = _to_dense(preprocessor.fit_transform(train))
# The last output column is treated as the regression target; everything
# before it is the feature matrix.
X_train = train_preprocessed[:, :-1]
y_train = train_preprocessed[:, -1]

# Apply the already-fitted preprocessor to the test rows (no re-fitting).
test_preprocessed = _to_dense(preprocessor.transform(test))
X_test = test_preprocessed[:, :-1]
y_test = test_preprocessed[:, -1]

# Cross Validation

In [7]:
# Candidate regressors to benchmark, as (display name, estimator) pairs.
# Every estimator uses its default hyper-parameters except XGBoost, which is
# given tree_method='gpu_hist'.
# NOTE(review): 'gpu_hist' is reported as deprecated in XGBoost 2.x in favour
# of tree_method='hist' with device='cuda' — confirm against the installed
# version before changing it.
models = list({
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(tree_method='gpu_hist'),
    'Linear Regression': LinearRegression(),
    'Elastic Net': ElasticNet(),
    'K Neighbors': KNeighborsRegressor(),
}.items())

In [10]:
# Number of folds used for K-Fold cross-validation
n_splits = 5

# The splitter does not depend on the model, so build it once. With a fixed
# integer seed every call to kf.split() yields the same folds, so all models
# are scored on identical partitions.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Maps model name -> {'Root Mean Squared Error (RMSE)': mean RMSE over folds}
results = {}

for name, model in models:
    rmse_scores = []

    # kf.split() is a generator without a length, so the fold count is passed
    # explicitly; otherwise tqdm cannot render a progress bar or an ETA.
    for train_idx, val_idx in tqdm(
        kf.split(X_train), total=n_splits, desc=f"Cross-Validation: {name}"
    ):
        # Train the model on this fold's training subset
        model.fit(X_train[train_idx], y_train[train_idx])

        # Predict on the held-out validation subset
        y_pred = model.predict(X_train[val_idx])

        # Root Mean Squared Error (RMSE) for this fold
        rmse = np.sqrt(mean_squared_error(y_train[val_idx], y_pred))
        rmse_scores.append(rmse)

    # Average the per-fold RMSE values (mean RMSE, not mean MSE)
    mean_rmse = np.mean(rmse_scores)

    results[name] = {
        'Root Mean Squared Error (RMSE)': mean_rmse
    }

Cross-Validation: SVR: 5it [00:06,  1.22s/it]

Cross-Validation: Random Forest: 5it [01:12, 14.46s/it]

Cross-Validation: XGBoost: 5it [00:03,  1.51it/s]

Cross-Validation: Linear Regression: 5it [00:00,  6.03it/s]

Cross-Validation: Elastic Net: 5it [00:00,  5.04it/s]

Cross-Validation: K Neighbors: 5it [00:00,  9.45it/s]


In [14]:
# Report each model's cross-validated score: a "Model:" line, the metric line
# (or a fallback message when the metric is absent), then a blank line.
for model_name, scores in results.items():
    if 'Root Mean Squared Error (RMSE)' not in scores:
        metric_line = "No regression metric found in results."
    else:
        metric_line = f"Root Mean Squared Error (RMSE): {scores['Root Mean Squared Error (RMSE)']:.4f}"

    # A single print with newline separators emits the same three lines as
    # three consecutive print() calls.
    print(f"Model: {model_name}", metric_line, "", sep="\n")

Model: SVR

Root Mean Squared Error (RMSE): 22776.1348



Model: Random Forest

Root Mean Squared Error (RMSE): 19492.0994



Model: XGBoost

Root Mean Squared Error (RMSE): 21401.2402



Model: Linear Regression

Root Mean Squared Error (RMSE): 18477.0167



Model: Elastic Net

Root Mean Squared Error (RMSE): 19992.2994



Model: K Neighbors

Root Mean Squared Error (RMSE): 18844.5579




# Retrain Linear Model, Predict on Test Set and Save Model

In [6]:
# Create a fresh Linear Regression model (lowest mean RMSE in the
# cross-validation results above)
model = LinearRegression()

# Train the model on the entire training set
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Model RMSE on Test Set: {rmse:.4f}")

# Save the trained model to a file
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Also save the fitted preprocessor: the model was trained on the encoded and
# scaled matrix it produces, so model.pkl alone cannot score raw rows.
# NOTE(review): the preprocessor was fitted on frames that still contained the
# target column (passed through via remainder='passthrough'), so inference
# inputs will presumably need the same column layout — confirm.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

Model RMSE on Test Set: 7718.9231
