In [1]:
import pickle
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

# Load the dataset
df = pd.read_csv('Car details v3.csv')

# Data preprocessing
df.drop(['mileage', 'name','torque'], axis=1, inplace=True)
df.dropna(inplace=True)
df['age'] = 2024 - df['year']
df.drop('year', inplace=True, axis=1)

# Function to convert engine values to numerical
def convert_engine_to_num(x):
    x = str(x)
    tokens = x.split(' ')
    if len(tokens) == 2:
        return float(tokens[0])
    try:
        return float(x)
    except:
        return None

df.engine = df.engine.apply(convert_engine_to_num)

# Function to convert max power values to numerical
def convert_maxpowers_to_num(x):
        x = str(x)
        tokens = x.split(' ')
        try:
            for i in tokens:
                return float(i)
        except ValueError:
            pass

df.max_power = df.max_power.apply(convert_maxpowers_to_num)


# convert engine and max powers to numerical values
def convert_to_float(x):
    try:
        return float(x.split(' ')[0])
    except:
        return 0

df['engine'] = df['engine'].apply(convert_to_float)



# Encode categorical columns

categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
df = pd.get_dummies(df, columns=categorical_cols)

print(df.info())

# Split data into features and target
x = df.drop('selling_price', axis=1)
y = df['selling_price']

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=40)


import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Convert your data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train the model
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10,
    verbose_eval=True
)

# Make predictions
y_pred = xgb_model.predict(dtest)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Save the model
xgb_model.save_model("xgb_car_price_model.json")




<class 'pandas.core.frame.DataFrame'>
Index: 7907 entries, 0 to 8127
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   selling_price                 7907 non-null   int64  
 1   km_driven                     7907 non-null   int64  
 2   engine                        7907 non-null   int64  
 3   max_power                     7906 non-null   float64
 4   seats                         7907 non-null   float64
 5   age                           7907 non-null   int64  
 6   fuel_CNG                      7907 non-null   bool   
 7   fuel_Diesel                   7907 non-null   bool   
 8   fuel_LPG                      7907 non-null   bool   
 9   fuel_Petrol                   7907 non-null   bool   
 10  seller_type_Dealer            7907 non-null   bool   
 11  seller_type_Individual        7907 non-null   bool   
 12  seller_type_Trustmark Dealer  7907 non-null   bool   
 13  transmis

Parameters: { "n_estimators" } are not used.



[27]	test-rmse:152038.75612
[28]	test-rmse:149239.76382
[29]	test-rmse:146584.71565
[30]	test-rmse:144140.67097
[31]	test-rmse:142699.27733
[32]	test-rmse:141345.33869
[33]	test-rmse:139743.82733
[34]	test-rmse:138827.53888
[35]	test-rmse:138052.87151
[36]	test-rmse:137290.62042
[37]	test-rmse:136693.60661
[38]	test-rmse:136482.54620
[39]	test-rmse:136179.19627
[40]	test-rmse:135618.26602
[41]	test-rmse:135238.37688
[42]	test-rmse:134785.85037
[43]	test-rmse:134300.72850
[44]	test-rmse:133914.02323
[45]	test-rmse:133587.46777
[46]	test-rmse:132913.80512
[47]	test-rmse:132556.12114
[48]	test-rmse:132333.53335
[49]	test-rmse:132318.17516
[50]	test-rmse:132297.77486
[51]	test-rmse:131776.63988
[52]	test-rmse:131530.01896
[53]	test-rmse:131222.64837
[54]	test-rmse:131045.16229
[55]	test-rmse:130928.27941
[56]	test-rmse:131175.66778
[57]	test-rmse:131080.83855
[58]	test-rmse:130823.98213
[59]	test-rmse:130819.14710
[60]	test-rmse:130871.68806
[61]	test-rmse:130677.37460
[62]	test-rmse:13064