In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import xgboost as xgb
import pickle as pk

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

In [3]:
df = pd.read_csv(r'C:\Users\DSAI\Car Price - Car Price.csv')
df.head()

Unnamed: 0,Brand,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner
0,Maruti,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
df.isnull().sum()

Brand            0
Model            0
Year             0
Selling_Price    0
KM_Driven        0
Fuel             0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [5]:
df.shape

(4340, 9)

In [6]:
le = LabelEncoder()

In [7]:
columns = ['Brand', 'Model', 'Fuel','Seller_Type', 'Transmission', 'Owner']
label_encoder = {}

In [8]:
for column in columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoder[column] = le
df.head()

Unnamed: 0,Brand,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner
0,18,775,2007,60000,70000,4,1,1,0
1,18,1041,2007,135000,50000,4,1,1,0
2,10,505,2012,600000,100000,1,1,1,0
3,5,118,2017,250000,46000,4,1,1,0
4,9,279,2014,450000,141000,1,1,1,2


In [9]:
norm = MinMaxScaler()

In [10]:
columns1 = ['Brand', 'Model', 'Fuel','Seller_Type','Selling_Price', 'Transmission', 'Owner']

In [11]:
df.corr()

Unnamed: 0,Brand,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner
Brand,1.0,0.974513,-0.039724,-0.096858,0.131461,-0.109534,0.144218,0.110699,0.036452
Model,0.974513,1.0,-0.051902,-0.077598,0.126203,-0.083927,0.140802,0.087785,0.032819
Year,-0.039724,-0.051902,1.0,0.413922,-0.419688,-0.120002,-0.098352,-0.1438,-0.414705
Selling_Price,-0.096858,-0.077598,0.413922,1.0,-0.192289,-0.269653,-0.151554,-0.530205,-0.20784
KM_Driven,0.131461,0.126203,-0.419688,-0.192289,1.0,-0.286095,0.113689,0.120226,0.297115
Fuel,-0.109534,-0.083927,-0.120002,-0.269653,-0.286095,1.0,0.038797,0.039249,-0.01091
Seller_Type,0.144218,0.140802,-0.098352,-0.151554,0.113689,0.038797,1.0,0.174925,0.165681
Transmission,0.110699,0.087785,-0.1438,-0.530205,0.120226,0.039249,0.174925,1.0,0.078893
Owner,0.036452,0.032819,-0.414705,-0.20784,0.297115,-0.01091,0.165681,0.078893,1.0


In [12]:
X = df.drop(columns = ['Selling_Price'])
X.head()

Unnamed: 0,Brand,Model,Year,KM_Driven,Fuel,Seller_Type,Transmission,Owner
0,18,775,2007,70000,4,1,1,0
1,18,1041,2007,50000,4,1,1,0
2,10,505,2012,100000,1,1,1,0
3,5,118,2017,46000,4,1,1,0
4,9,279,2014,141000,1,1,1,2


In [13]:
y = df['Selling_Price']
y.head()

0     60000
1    135000
2    600000
3    250000
4    450000
Name: Selling_Price, dtype: int64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

In [17]:
mae = mean_absolute_error(y_test, y_pred)
mae

225957.33535216562

In [18]:
r2 = r2_score(y_test, y_pred)
r2

0.39854106712208637

In [19]:
lr_score = cross_val_score(model, X, y, cv = 5)
lr_score

array([0.47334368, 0.513227  , 0.47004209, 0.34905751, 0.36806792])

In [20]:
lr_score.mean()

0.4347476388509359

In [21]:
rg = Ridge(alpha = 10)
rg.fit(X_train, y_train)

In [22]:
y_rg = rg.predict(X_test)

In [23]:
print(f'alpha: 10, parameters of b1, b2, b3:{rg.coef_}, accuracy: {r2_score(y_test, y_rg)}')

alpha: 10, parameters of b1, b2, b3:[-3.49110838e+04  5.39596811e+02  3.83427341e+04 -9.97391487e-01
 -1.00056862e+05 -2.46669216e+04 -8.56535113e+05 -1.26855258e+04], accuracy: 0.40000464754035014


In [24]:
rg_score = cross_val_score(rg, X, y, cv = 5)
rg_score

array([0.47074051, 0.51139081, 0.47039853, 0.35691954, 0.36738808])

In [25]:
rg_score.mean()

0.43536749385454127

In [26]:
la = Lasso(alpha = 10)
la.fit(X_train, y_train)

In [27]:
y_la = la.predict(X_test)

In [28]:
print(f'alpha: 10, parameters of b1, b2, b3:{la.coef_}, accuracy: {r2_score(y_test, y_la)}')

alpha: 10, parameters of b1, b2, b3:[-3.42864900e+04  5.30254023e+02  3.81228326e+04 -9.85797843e-01
 -9.98541263e+04 -2.20904224e+04 -8.84589631e+05 -1.27991006e+04], accuracy: 0.39854999691012793


In [29]:
la_score = cross_val_score(la, X, y, cv = 5)
la_score

array([0.47332884, 0.51322127, 0.47004705, 0.34909134, 0.36806543])

In [30]:
la_score.mean()

0.4347507846354435

In [31]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [32]:
y_dt = dt.predict(X_test)

In [33]:
r2_dt = r2_score(y_test, y_dt)
r2_dt

0.534390092358564

In [34]:
la_score = cross_val_score(la, X, y, cv = 5)
la_score

array([0.47332884, 0.51322127, 0.47004705, 0.34909134, 0.36806543])

In [35]:
la_score.mean()

0.4347507846354435

In [36]:
rf = RandomForestRegressor(n_estimators = 500)
rf.fit(X_train, y_train)

In [37]:
y_rf = rf.predict(X_test)

In [38]:
r2_rf = r2_score(y_test, y_rf)
r2_rf

0.6950876411069291

In [39]:
rf_score = cross_val_score(rf, X, y, cv = 5)
rf_score

array([0.8369282 , 0.8591754 , 0.91274266, 0.78864168, 0.55117043])

In [40]:
rf_score.mean()

0.7897316724589976

In [41]:
svm = svm.SVR()
svm.fit(X_train, y_train)

In [42]:
y_svm = svm.predict(X_test)

In [43]:
r2_svm = r2_score(y_test, y_svm)
r2_svm

-0.0634369379946389

In [44]:
svm_score = cross_val_score(svm, X, y, cv = 5)
svm_score

array([-0.09025843, -0.07372722, -0.07032385, -0.02522146, -0.07452587])

In [45]:
svm_score.mean()

-0.0668113647055009

In [46]:
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

In [47]:
y_xgb = xgb_model.predict(X_test)

In [48]:
r2_xgb = r2_score(y_test, y_xgb)
r2_xgb

0.7606532078477761

In [49]:
xgb_score = cross_val_score(xgb_model, X, y, cv = 5)
xgb_score

array([0.89051994, 0.89555872, 0.942474  , 0.80642826, 0.58875804])

In [50]:
xgb_score.mean()

0.8247477921517901

In [51]:
with open('xgb_model.pkl', 'wb') as file:
    pk.dump(xgb_model, file)

In [54]:
df.head()

Unnamed: 0,Brand,Model,Year,Selling_Price,KM_Driven,Fuel,Seller_Type,Transmission,Owner
0,18,775,2007,60000,70000,4,1,1,0
1,18,1041,2007,135000,50000,4,1,1,0
2,10,505,2012,600000,100000,1,1,1,0
3,5,118,2017,250000,46000,4,1,1,0
4,9,279,2014,450000,141000,1,1,1,2


In [53]:
# Collect user input
brand = int(input("Enter Brand: "))
model = int(input("Enter Model: "))
year = int(input("Enter Year: "))
km_driven = float(input("Enter KM Driven: "))
fuel = int(input("Enter Fuel Type: "))
seller_type = input("Enter Seller Type: ")
transmission = input("Enter Transmission: ")
owner = str(input("Enter Owner: "))

Enter Brand: Maruti
Enter Model: Maruti 800 AC
Enter Year: 2007
Enter KM Driven: 70000
Enter Fuel Type: Petrol
Enter Seller Type: Individual
Enter Transmission: Manual
Enter Owner: First Owner


In [58]:
# Create a DataFrame for the input data
input_data = pd.DataFrame([{
    'Brand': brand,
    'Model': model,
    'Year': year,
    'KM_Driven': km_driven,
    'Fuel': fuel,
    'Seller_Type': seller_type,
    'Transmission': transmission,
    'Owner': owner
}])

In [56]:
predicted_price = xgb_model.predict(input_data)

print(f"Predicted Car Price: {predicted_price[0]}")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Brand: object, Model: object, Fuel: object, Seller_Type: object, Transmission: object, Owner: object

In [62]:
filename = 'xgb_model.pkl'
pk.dump(xgb_model, open(filename, 'wb'))

loaded_model = pk.load(open(filename, 'rb'))
input_data = [[18, 1041, 2007, 50000, 4, 1, 1, 0]]
prediction = loaded_model.predict(input_data)

print('Predictions:', prediction)

Predictions: [126687.32]
