# In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load the dataset; the path is resolved relative to the current working
# directory.
df = pd.read_csv('synthetic_data.csv')

# Columns that are candidates for label encoding. A column listed here is
# only encoded if it actually holds text values (see the loop below).
categorical_columns = [
    'Left Front Condition', 'Right Front Condition', 'Left Rear Condition',
    'Right Rear Condition', 'Battery Water Level', 'Battery Condition',
    'Battery Leak', 'Rust/Dent/ Damage Exterior', 'Oil Leak in Suspension',
    'Brake Fluid Level', 'Brake Condition Front', 'Brake Condition Rear',
    'Emergency Brake', 'Rust/Dent/Damage Engine', 'Engine Oil Condition',
    'Engine Oil Color', 'Brake Fluid Condition', 'Brake Fluid Color', 'Oil Leak in Engine'
]

# Fit one LabelEncoder per text-valued categorical column, replace the column
# in df with its integer codes, and keep the fitted encoder keyed by column
# name so the same mapping can be reused later.
label_encoders = {}
for column in categorical_columns:
    column_values = df[column]
    # Comparing the dtype to 'object' alone misses pandas' dedicated string
    # dtypes (what read_csv infers for text when string inference is enabled),
    # which would leave such columns un-encoded. Checking both keeps the
    # original behaviour for object columns and also covers string dtypes.
    holds_text = (
        pd.api.types.is_object_dtype(column_values)
        or pd.api.types.is_string_dtype(column_values)
    )
    if holds_text:
        le = LabelEncoder()
        df[column] = le.fit_transform(column_values)
        label_encoders[column] = le


# Separate the regression target from the predictors: 'Total Costing' is the
# target and every remaining column of df is used as a feature.
y = df['Total Costing']
X = df.drop('Total Costing', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the gradient-boosting regressor, gathered in one place.
# NOTE(review): how these values were chosen is not shown in this cell.
gbr_params = {
    'n_estimators': 180,
    'learning_rate': 0.2,
    'max_depth': 5,
    'min_samples_split': 12,
    'min_samples_leaf': 19,
    'subsample': 0.9,
    'random_state': 42,
}

# Build the estimator and fit it on the training portion of the data.
final_model = GradientBoostingRegressor(**gbr_params)
final_model.fit(X_train, y_train)


# Score the fitted model on the held-out rows and report the mean squared
# error between the true and predicted targets.
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

# Persist the fitted model and the per-column label encoders to the current
# working directory. The last call is left as a bare expression so that an
# interactive session still displays its return value.
joblib.dump(final_model, 'finalmodel1.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')


# Output: Mean Squared Error: 13407075.003519189


# Output: ['label_encoders.joblib']