In [46]:
import pandas as pd
import numpy as np

In [114]:

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


## Loading CSV File for Modeling

In [93]:
df = pd.read_csv('Car_selling_db.csv')
df.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0


##### Ensure the number of rows and columns match the file 


In [94]:
df.shape

(8128, 13)

##### Chekc if there are empty values in the dataset

In [171]:
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

##### drop rows where all elements are missing.

In [183]:
df.iloc[:,8].value_counts()

18.9 kmpl      225
19.7 kmpl      173
18.6 kmpl      164
21.1 kmpl      157
17.0 kmpl      133
              ... 
16.34 kmpl       1
19.69 kmpl       1
20.88 km/kg      1
17.8 km/kg       1
16.51 kmpl       1
Name: mileage, Length: 393, dtype: int64

In [184]:
LabelEncoder_df = LabelEncoder()
df.iloc[:,8] = LabelEncoder_df.fit_transform(df.iloc[:,8])
df.iloc[:,8].value_counts()

210    225
393    221
238    173
204    164
281    157
      ... 
140      1
237      1
272      1
182      1
146      1
Name: mileage, Length: 394, dtype: int64

In [169]:
newdb = df.dropna()
OneHotEncoder_df = OneHotEncoder(categories='auto')
for i in range(4,12):
    newdb.iloc[:,i] = LabelEncoder_df.fit_transform(newdb.iloc[:,i])
newdb.iloc[3:11]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
3,Hyundai i20 Sportz Diesel,2010,225000,127000,1,1,1,0,327,24,295,224,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,3,1,1,0,136,14,286,21,5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,3,1,1,0,246,10,261,36,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,2,1,1,0,168,1,193,402,5.0
7,Maruti 800 DX BSII,2001,45000,5000,3,1,1,2,136,112,170,388,4.0
8,Toyota Etios VXD,2011,350000,90000,1,1,1,0,338,19,222,144,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,1,1,1,0,244,25,226,139,5.0
10,Renault Duster 110PS Diesel RxL,2014,500000,68000,1,1,1,2,213,29,23,249,5.0


In [129]:
newdb.shape

(7906, 13)

### Split data to x and y datasets

In [140]:
X = newdb.iloc[:,3:13]
Y = newdb.iloc[:,2]

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)


### Creating a function for the models 

In [157]:
def models(X_train, Y_train):

    #Linear Regression Model
    log = LogisticRegression(random_state=1)
    log.fit(X_train,Y_train)

    #Decision Tree Model
    tree = DecisionTreeClassifier(criterion='entropy', random_state=1)
    tree.fit(X_train,Y_train)

    #Random Forest Classifier
    forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=1)
    forest.fit(X_train,Y_train)

    #Support Vector Machine
    clf = svm.SVC()
    clf.fit(X_train,Y_train)

    #KNN Classifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train,Y_train)

    #print accuracies of each model 
    print("[0] Logistic Regression Training Accuracy: ", log.score(X_train,Y_train))  
    print("[1] Decision Tree Classifier Training Accuracy: ", tree.score(X_train,Y_train))  
    print("[2] Random Forest Classifier Training Accuracy: ", forest.score(X_train,Y_train))  
    print("[3] Support Vector Machine Classifier Training Accuracy: ", clf.score(X_train,Y_train))  
    print("[4] KNN Classifier Training Accuracy: ", neigh.score(X_train, Y_train))
    return log, tree, forest, clf, neigh
