In [None]:
import sys
from pathlib import Path

# Make the `src` package importable: put the resolved parent of the current
# working directory at the front of sys.path, unless it is already listed.
project_root = Path('..').resolve()
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))
print('Adjusted sys.path for src package')

In [25]:
# Shared output locations for this notebook, plus the imports used by the
# save_artifact helper defined below.
from pathlib import Path
from datetime import datetime
import pickle

processed_dir = Path('../data/processed')
models_dir = Path('../models')

# Create both directories (including missing parents); existing ones are left alone.
for _out_dir in (processed_dir, models_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)
print('Ensured directories exist:', processed_dir, models_dir)
def save_artifact(obj, name, dir=models_dir, timestamp=True):
    """Pickle `obj` into directory `dir` and return the path of the file written.

    Args:
        obj: Any picklable Python object.
        name: Base filename, without extension.
        dir: Target directory; created (with parents) if it does not exist.
            Defaults to the value `models_dir` had when this function was defined.
            (The name shadows the `dir` builtin inside this function; it is kept
            so existing keyword calls continue to work.)
        timestamp: If True, `_YYYYmmdd_HHMMSS` (from `datetime.now()`) is appended
            to the filename; if False the file is exactly `<name>.pkl`.

    Returns:
        pathlib.Path of the written `.pkl` file.

    Raises:
        Any exception raised by `pickle.dump` or the filesystem. In that case
        nothing is written at the target path, and a file that already existed
        there is left untouched.
    """
    target_dir = Path(dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    suffix = f"_{datetime.now().strftime('%Y%m%d_%H%M%S')}" if timestamp else ""
    path = target_dir / f"{name}{suffix}.pkl"

    # Pickle into a sibling temp file first and only move it into place once the
    # dump has succeeded, so a failed dump (e.g. an unpicklable object) cannot
    # leave a truncated artifact or destroy a previously saved one.
    tmp_path = path.with_name(path.name + '.tmp')
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(obj, f)
        tmp_path.replace(path)
    except BaseException:
        tmp_path.unlink(missing_ok=True)
        raise

    print('Saved', name, 'to', path)
    return path

Ensured directories exist: ..\data\processed ..\models


In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# NOTE(review): this cell sits above the cells that load the data and create
# X_train / y_test, and the same evaluation code appears again near the end of
# the notebook. On a fresh top-to-bottom run it raises NameError here; consider
# removing this copy.
# NOTE(review): `rf` and `y_test_pred` are not created in any cell shown in this
# notebook — confirm where the model is trained and the predictions are made.

# Test set metrics. RMSE and R^2 are computed here from y_test / y_test_pred so
# the cell does not rely on `rmse` / `r2` variables left over from another cell.
mae = mean_absolute_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error (test): {mae}")
print(f"Root Mean Squared Error (test): {rmse}")
print(f"R^2 Score (test): {r2}")

# Feature importances (if available), labelled with the training column names
# and listed from most to least important.
if hasattr(rf, 'feature_importances_'):
    importances = rf.feature_importances_
    cols = X_train.columns
    fi = pd.Series(importances, index=cols).sort_values(ascending=False)
    print('Top 10 feature importances:')
    print(fi.head(10))
else:
    print('Model has no feature_importances_ attribute')

Mean Absolute Error (test): 3280.520237132968
Root Mean Squared Error (test): 5386.125974694389
R^2 Score (test): 0.7774194251692264
Top 10 feature importances:
Age                        0.169199
Airbags                    0.149846
Fuel type                  0.111944
Mileage                    0.108348
Levy                       0.084782
Engine volume              0.072084
Model                      0.064080
Gear box type_Automatic    0.043992
Manufacturer               0.038184
Gear box type_Tiptronic    0.030100
dtype: float64


In [7]:
# pandas is imported here because no earlier cell in reading order reliably
# provides `pd` on a fresh kernel.
import pandas as pd

# Load the raw car-price CSV and preview its first rows.
data = pd.read_csv('../data/raw/car_price_prediction.csv')
data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [8]:
# Run the preprocessing pipeline on the raw frame; both `processed_data` and
# `data` end up bound to its result.
# NOTE(review): preprocessing_pipeline is not imported in any cell shown here —
# presumably it comes from the `src` package; confirm and add the import.
data = processed_data = preprocessing_pipeline(data)
print('Processed data shape:', data.shape)

Preprocessing started...
Initial shape: (19237, 18)
After dropping duplicates: (18924, 18)
Replacing categorical values...
After cleaning outliers: (16037, 18)
Feature engineering...
Dropping columns...
Final shape: (16037, 16)
Processed data shape: (16037, 16)


In [9]:
from pathlib import Path

# Write the preprocessed frame (without its index) to ../data/processed,
# creating the directory first if it does not exist yet.
processed_path = Path('../data/processed') / 'processed_car_price.csv'
out_dir = processed_path.parent
out_dir.mkdir(parents=True, exist_ok=True)
data.to_csv(processed_path, index=False)
print('Saved processed data to', processed_path)

Saved processed data to ..\data\processed\processed_car_price.csv


In [10]:
# Preview the first five rows of the preprocessed frame.
data.head(n=5)

Unnamed: 0,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Wheel,Color,Airbags,Age
0,13328,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005,6.0,Automatic,4x4,Left wheel,Silver,12,15
1,16621,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000,6.0,Tiptronic,4x4,Left wheel,Black,8,14
2,8467,0,HONDA,FIT,Hatchback,No,Petrol,1.3,200000,4.0,Variator,Front,Right-hand drive,Black,2,19
3,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966,4.0,Automatic,4x4,Left wheel,White,0,14
4,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901,4.0,Automatic,Front,Left wheel,Silver,4,11


In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

one_hot_columns = ['Leather interior', 'Gear box type', 'Drive wheels', 'Wheel']

# A fitted OneHotEncoder is used (rather than pd.get_dummies) so the encoder
# object itself can be pickled and reused later in the notebook.
# NOTE(review): the encoder is fitted on the whole frame, before the train/test
# split further down, even though the result is named `oh_encoded_train`.
oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
oh_encoder.fit(data[one_hot_columns])
oh_encoded_train = oh_encoder.transform(data[one_hot_columns])

# Column labels for the encoded array, derived from the input column names.
oh_encoded_columns = oh_encoder.get_feature_names_out(one_hot_columns)


In [12]:
# Wrap the encoded array in a DataFrame that shares `data`'s index, so the two
# can be concatenated column-wise.
oh_encoded_train_df = pd.DataFrame(
    data=oh_encoded_train,
    index=data.index,
    columns=oh_encoded_columns,
)


In [13]:
# Append the one-hot columns to `data`, then remove the original categorical
# columns they were derived from.
data = pd.concat([data, oh_encoded_train_df], axis=1).drop(columns=one_hot_columns)

In [26]:
# Save the encoder for future use through the notebook's save_artifact helper
# instead of repeating the open/pickle/print boilerplate. timestamp=False keeps
# the fixed filename, so the file is still written to ../models/one_hot_encoder.pkl
# and the printed message is unchanged. The result is assigned so the returned
# path is not echoed as cell output.
one_hot_encoder_path = save_artifact(oh_encoder, 'one_hot_encoder', timestamp=False)

Saved one_hot_encoder to ..\models\one_hot_encoder.pkl


In [27]:
label_encode_columns = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Color']

# One LabelEncoder per column, keyed by column name; each is fitted on its
# column and the column is overwritten in `data` with the integer codes.
# NOTE(review): like the one-hot encoder, these are fitted on the full frame
# before the train/test split.
label_encoders = {column: LabelEncoder() for column in label_encode_columns}
for column, label_encoder in label_encoders.items():
    data[column] = label_encoder.fit_transform(data[column])

In [28]:
# Save the label encoders for future use through the notebook's save_artifact
# helper instead of repeating the open/pickle/print boilerplate. The helper
# creates the target directory itself; timestamp=False keeps the fixed filename,
# so the file is still ../models/label_encoders.pkl and the printed message is
# unchanged. The result is assigned so the returned path is not echoed as output.
label_encoders_path = save_artifact(label_encoders, 'label_encoders', timestamp=False)

Saved label_encoders to ..\models\label_encoders.pkl


In [29]:
# Target is the 'Price' column; every other column is a feature.
y = data['Price']
X = data.drop(columns=['Price'])

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as a test set; the fixed random_state makes the
# split repeatable.
_splits = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
X_train, X_test, y_train, y_test = _splits

print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Train set: 13631 samples
Test set: 2406 samples


In [31]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['Levy', 'Engine volume', 'Mileage', 'Age']

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits, overwriting the numeric columns.
scaler = StandardScaler().fit(X_train[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [32]:
# Save the scaler through the notebook's save_artifact helper, so this cell no
# longer depends on `pickle` and `models_dir` being left in scope by other cells.
# timestamp=False keeps the fixed filename, so the file is still
# ../models/scaler.pkl and the printed message is unchanged. The result is
# assigned so the returned path is not echoed as cell output.
scaler_path = save_artifact(scaler, 'scaler', timestamp=False)

Saved scaler to ..\models\scaler.pkl


In [33]:
# Display the training features after encoding and scaling (pandas renders a
# truncated view of the frame).
X_train

Unnamed: 0,Levy,Manufacturer,Model,Category,Fuel type,Engine volume,Mileage,Cylinders,Color,Airbags,...,Leather interior_Yes,Gear box type_Automatic,Gear box type_Manual,Gear box type_Tiptronic,Gear box type_Variator,Drive wheels_4x4,Drive wheels_Front,Drive wheels_Rear,Wheel_Left wheel,Wheel_Right-hand drive
10363,-1.253429,35,391,3,5,-0.891360,0.246311,4.0,12,4,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
16277,0.519755,20,541,9,5,-0.555948,-0.103803,4.0,2,12,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6811,1.723769,38,1311,10,5,-0.891360,1.066891,4.0,7,10,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
14217,0.791205,20,698,10,1,0.617993,-0.491465,4.0,14,4,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
11844,0.576672,11,361,3,5,-0.220536,0.010083,4.0,14,8,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16106,1.533316,50,1024,4,1,0.953405,-0.661101,4.0,12,4,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6437,0.421245,37,2,9,5,-0.220536,-0.743420,4.0,14,12,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1011,-1.253429,53,750,3,5,-0.555948,-0.387773,4.0,14,0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
18952,1.312215,25,1121,4,5,2.295052,-0.097798,6.0,7,10,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [47]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# NOTE(review): `rf` and `y_test_pred` are not created in any cell shown in this
# notebook — confirm where the model is trained and the predictions are made.

# Test set metrics. RMSE and R^2 are computed here from y_test / y_test_pred so
# the cell does not rely on `rmse` / `r2` variables left over from another cell.
mae = mean_absolute_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Absolute Error (test): {mae}")
print(f"Root Mean Squared Error (test): {rmse}")
print(f"R^2 Score (test): {r2}")

# Feature importances (if available), labelled with the training column names
# and listed from most to least important.
if hasattr(rf, 'feature_importances_'):
    importances = rf.feature_importances_
    cols = X_train.columns
    fi = pd.Series(importances, index=cols).sort_values(ascending=False)
    print('Top 10 feature importances:')
    print(fi.head(10))
else:
    print('Model has no feature_importances_ attribute')

Mean Absolute Error (test): 3280.520237132968
Root Mean Squared Error (test): 5386.125974694389
R^2 Score (test): 0.7774194251692264
Top 10 feature importances:
Age                        0.169199
Airbags                    0.149846
Fuel type                  0.111944
Mileage                    0.108348
Levy                       0.084782
Engine volume              0.072084
Model                      0.064080
Gear box type_Automatic    0.043992
Manufacturer               0.038184
Gear box type_Tiptronic    0.030100
dtype: float64


In [48]:
# Save the model for future use through the notebook's save_artifact helper, so
# this cell no longer depends on `pickle` and `models_dir` being left in scope by
# other cells. timestamp=False keeps the fixed filename, so the file is still
# ../models/model.pkl and the printed message is unchanged. The result is
# assigned so the returned path is not echoed as cell output.
model_path = save_artifact(rf, 'model', timestamp=False)

Saved model to ..\models\model.pkl
