In [42]:
import pandas as pd
import numpy as np
import sklearn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import * 
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
import pickle

In [43]:
df = pd.read_csv(r"D:\PYTHON1\Capstone Project\Car Details 2.csv")
df.head() 

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,brands,models
0,2007,60000.0,70000.0,Petrol,Individual,Manual,First Owner,Maruti,Maruti 800 AC
1,2007,135000.0,50000.0,Petrol,Individual,Manual,First Owner,Maruti,Maruti Wagon R
2,2012,600000.0,100000.0,Diesel,Individual,Manual,First Owner,Hyundai,Hyundai Verna 1.6
3,2017,250000.0,46000.0,Petrol,Individual,Manual,First Owner,Datsun,Other
4,2014,450000.0,141000.0,Diesel,Individual,Manual,Second Owner,Honda,Other


**Removing Irrelevant columns :-**

In [44]:
df['models'].value_counts() 

models
Other                    2565
Maruti Wagon R            139
Maruti Swift Dzire        120
Hyundai Grand i10          75
Maruti Alto 800            75
Toyota Innova 2.5          72
Hyundai Santro Xing        57
Maruti Alto LXi            57
Hyundai Verna 1.6          56
Tata Indica Vista          56
Maruti Swift VDI           55
Maruti Alto K10            48
Maruti Alto LX             38
Ford Figo Diesel           36
Hyundai EON Era            34
Chevrolet Beat Diesel      32
Maruti 800 AC              32
Ford EcoSport 1.5          30
Name: count, dtype: int64

- It seems like there is a large number of entries labeled as "Other," which could make this column less useful for certain types of analysis. 
- High frequency of "Other" category: With 2565 entries labeled as "Other," this category dominates the data, making it less useful for distinguishing between specific models.
- Long tail of model names: There are many models with very few entries, which might not provide statistically significant insights for each individual model.
- In summary, whether the models column is irrelevant depends on your specific analysis goals. If model-level granularity is not crucial, you might consider it less relevant, especially given the high frequency of the "Other" category.

In [45]:
df.drop('models', axis=1, inplace=True)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,brands
0,2007,60000.0,70000.0,Petrol,Individual,Manual,First Owner,Maruti
1,2007,135000.0,50000.0,Petrol,Individual,Manual,First Owner,Maruti
2,2012,600000.0,100000.0,Diesel,Individual,Manual,First Owner,Hyundai
3,2017,250000.0,46000.0,Petrol,Individual,Manual,First Owner,Datsun
4,2014,450000.0,141000.0,Diesel,Individual,Manual,Second Owner,Honda


#### **Data Preprocessing :-**

**1. Checking Null Values**

In [46]:
df.isna().sum().sum() 

0

**2. Checking Duplicates**

In [47]:
df.duplicated().sum() 

27

In [48]:
df = df.drop_duplicates() 
df.duplicated().sum() 

0

In [49]:
df.shape

(3550, 8)

**3. Checking the Data types**

In [50]:
df.dtypes

year               int64
selling_price    float64
km_driven        float64
fuel              object
seller_type       object
transmission      object
owner             object
brands            object
dtype: object

In [51]:
df['fuel'].value_counts()

fuel
Diesel      1789
Petrol      1701
CNG           37
LPG           22
Electric       1
Name: count, dtype: int64

In [52]:
df['seller_type'].value_counts() 

seller_type
Individual          2805
Dealer               712
Trustmark Dealer      33
Name: count, dtype: int64

In [53]:
df['transmission'].value_counts() 

transmission
Manual       3238
Automatic     312
Name: count, dtype: int64

In [54]:
df['owner'].value_counts() 

owner
First Owner             2199
Second Owner             970
Third Owner              289
Fourth & Above Owner      75
Test Drive Car            17
Name: count, dtype: int64

In [55]:
df['brands'].value_counts() 

brands
Maruti           1057
Hyundai           631
Mahindra          324
Tata              308
Ford              220
Honda             216
Toyota            170
Chevrolet         151
Renault           108
Volkswagen         93
Nissan             52
Skoda              49
Others             33
Fiat               32
Audi               31
Datsun             29
BMW                25
Mercedes-Benz      21
Name: count, dtype: int64

**4. Separating Categorical and Numerical Features**

In [56]:
cat_cols = df.dtypes[df.dtypes == 'object'].index
num_cols = df.dtypes[df.dtypes != 'object'].index
print(cat_cols)
print(num_cols) 

Index(['fuel', 'seller_type', 'transmission', 'owner', 'brands'], dtype='object')
Index(['year', 'selling_price', 'km_driven'], dtype='object')


**5. Encoding Object Data Type Columns**

In [57]:
lb = LabelEncoder()

In [58]:
for i in cat_cols:
    df[i] = lb.fit_transform(df[i])

In [59]:
df.dtypes 

year               int64
selling_price    float64
km_driven        float64
fuel               int32
seller_type        int32
transmission       int32
owner              int32
brands             int32
dtype: object

In [60]:
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,brands
0,2007,60000.0,70000.0,4,1,1,0,9
1,2007,135000.0,50000.0,4,1,1,0,9
2,2012,600000.0,100000.0,1,1,1,0,7
3,2017,250000.0,46000.0,4,1,1,0,3
4,2014,450000.0,141000.0,1,1,1,2,6


In [61]:
df['fuel'].value_counts()

fuel
1    1789
4    1701
0      37
3      22
2       1
Name: count, dtype: int64

In [62]:
df['seller_type'].value_counts()

seller_type
1    2805
0     712
2      33
Name: count, dtype: int64

In [63]:
df['transmission'].value_counts()

transmission
1    3238
0     312
Name: count, dtype: int64

In [64]:
df['owner'].value_counts() 

owner
0    2199
2     970
4     289
1      75
3      17
Name: count, dtype: int64

In [65]:
df['brands'].value_counts()

brands
9     1057
7      631
8      324
15     308
5      220
6      216
16     170
2      151
13     108
17      93
11      52
14      49
12      33
4       32
0       31
3       29
1       25
10      21
Name: count, dtype: int64

**6. Separating Data into x(independent) and y(dependent) features**

In [66]:
x = df.drop('selling_price', axis=1)
y = df['selling_price']

print(x.shape)
print(y.shape)

(3550, 7)
(3550,)


**7. Train-Test Split on Data**

In [67]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15, random_state=81)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3017, 7)
(533, 7)
(3017,)
(533,)


**8. Model Evaluation Function**

In [68]:
def eval_model(model, x_train,y_train,x_test,y_test,mname):
    model.fit(x_train,y_train)
    ypred = model.predict(x_test)
    train_r2 = model.score(x_train,y_train)
    test_r2 = model.score(x_test,y_test)
    test_mae = mean_absolute_error(y_test,ypred)
    test_mse = mean_squared_error(y_test,ypred)
    test_rmse = np.sqrt(test_mse)
    data = {'Train_R2':train_r2,'Test_R2':test_r2,'Test_MAE':test_mae,
            'Test_MSE':test_mse, 'Test_RMSE':test_rmse}
    res = pd.DataFrame(data,index=[mname])
    return res 

**9. Comprehensive Evaluation and Comparison of Regression Models**

**1) LinReg**

In [69]:
lr1 = LinearRegression()
lr1_res = eval_model(lr1, x_train,y_train,x_test,y_test,'Linear Regression')
lr1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Linear Regression,0.503125,0.568589,190590.64827,84237650000.0,290237.226303


**2) DT Reg**

In [70]:
dt1 = DecisionTreeRegressor(criterion= 'squared_error', max_depth= 8, max_features= None, min_samples_leaf= 7, min_samples_split= 2, splitter= 'best')
dt1_res = eval_model(dt1, x_train,y_train,x_test,y_test,'Decision Tree Regression')
dt1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Decision Tree Regression,0.762127,0.746431,140764.483802,49512120000.0,222513.185062


**3) RF Reg**

In [71]:
rf1 = RandomForestRegressor(bootstrap=True,max_depth=9,min_samples_leaf=1,min_samples_split=5,n_estimators=300) 
# rf1 = RandomForestRegressor(bootstrap=True,max_depth=10,min_samples_leaf=1,min_samples_split=5,n_estimators=500) 
rf1_res = eval_model(rf1, x_train,y_train,x_test,y_test,'Random Forest Regression')
rf1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Random Forest Regression,0.863412,0.815202,124742.234526,36083870000.0,189957.552089


**4) Ridge**

In [72]:
rid1 = Ridge()
rid1_res = eval_model(rid1, x_train,y_train,x_test,y_test,'Ridge')
rid1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Ridge,0.503122,0.568242,190619.744606,84305510000.0,290354.10699


**5) Bagging Reg**

In [73]:
bag1 = BaggingRegressor(bootstrap=False,bootstrap_features=True,max_features=1.0,max_samples=0.7,n_estimators=500) 
bag1_res = eval_model(bag1, x_train,y_train,x_test,y_test,'Bagging Regression')
bag1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Bagging Regression,0.904646,0.779868,138886.236727,42983180000.0,207323.842832


**6) AdaBoost Reg**

In [74]:
adab1 = AdaBoostRegressor(n_estimators=150,loss='square',learning_rate=1.0)
adab1_res = eval_model(adab1, x_train,y_train,x_test,y_test,'AdaBoost Regression')
adab1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
AdaBoost Regression,0.407798,0.485031,255486.262813,100553500000.0,317101.65171


**7) KNN Reg**

In [75]:
Knn1 = KNeighborsRegressor(metric='manhattan',n_neighbors=9) 
Knn1_res = eval_model(Knn1, x_train,y_train,x_test,y_test,'KNN Regression')
Knn1_res 

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
KNN Regression,0.397407,0.234937,218316.490723,149387100000.0,386506.221109


**10. Regression Model Performance Comparison**

In [76]:
all_res = pd.concat([lr1_res,dt1_res,rf1_res,rid1_res,bag1_res,adab1_res,Knn1_res])
all_res

Unnamed: 0,Train_R2,Test_R2,Test_MAE,Test_MSE,Test_RMSE
Linear Regression,0.503125,0.568589,190590.64827,84237650000.0,290237.226303
Decision Tree Regression,0.762127,0.746431,140764.483802,49512120000.0,222513.185062
Random Forest Regression,0.863412,0.815202,124742.234526,36083870000.0,189957.552089
Ridge,0.503122,0.568242,190619.744606,84305510000.0,290354.10699
Bagging Regression,0.904646,0.779868,138886.236727,42983180000.0,207323.842832
AdaBoost Regression,0.407798,0.485031,255486.262813,100553500000.0,317101.65171
KNN Regression,0.397407,0.234937,218316.490723,149387100000.0,386506.221109


#### **Conclusion:-**
**Random Forest Regression has performed best on this dataset.**

**11. Saving the best Model (Using pickle Library)**

In [77]:
pickle.dump(rf1,open('Random_Forest_Regressor.pkl','wb'))

**12. Loading the Model**

In [78]:
Loaded_model = pickle.load(open('Random_Forest_Regressor.pkl', 'rb'))

**13. Creating the Sample Dataset of the Original Data.**

In [79]:
sampled_dataset = df.sample(n=20, replace=False) 

**14. Saving the Sample Data**

In [80]:
sampled_dataset.to_csv('Sample Data.csv', index=False)

**15. Sample Dataset**

In [81]:
print(sampled_dataset.shape) 
sampled_dataset.head(5) 

(20, 8)


Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,brands
2260,2016,620000.0,70000.0,1,1,1,0,9
315,2012,350000.0,110000.0,1,1,1,2,13
1784,2018,850000.0,50000.0,1,1,1,0,9
3123,2009,420000.0,223158.4,1,0,1,0,16
2668,2011,300000.0,135000.0,1,1,1,2,16


**16. Checking how the model is performing on Sample dataset**

In [82]:
predictions = Loaded_model.predict(sampled_dataset.drop(columns=['selling_price']))
predictions[:5]

array([663004.42224265, 422521.73946401, 772639.86678337, 485965.33937125,
       485906.48508019])