#### Importing Libraries

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor

#### Loading CSV File for Modeling

In [235]:
df = pd.read_csv('Car_selling_db.csv')
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


#### Ensure the number of rows and columns match the file 


In [236]:
df.shape

(8128, 13)

#### Check if there are empty values in the dataset

In [237]:
df.isna().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

#### Drop rows where all elements are missing or too much redundant information

In [238]:

newdb = df.copy()
newdb = newdb.drop(['torque'], axis = 1)
newdb = newdb.dropna()

#### Remove Units 

##### Mileage Column has instances of km/kg. So convert km/kg to kmpl and then remove

In [239]:

correct_mileage= []
for i in newdb.mileage: 
    if str(i).endswith('km/kg'):
        i = i[:-6]
        i = float(i)*1.40
        correct_mileage.append(float(i))
    elif str(i).endswith('kmpl'):
        i = i[:-6]
        correct_mileage.append(float(i))

newdb['mileage'] = correct_mileage


##### Remove units from Engine and Max_Power

In [240]:

newdb['engine'] = pd.to_numeric(newdb['engine'].str.strip('CC '))
newdb['max_power'] = pd.to_numeric(newdb['max_power'].str.strip('bhp '))

#### Encode values in Fuel, Seller_type, Transmission, and Owner Columns

In [241]:
LabelEncoder_df = LabelEncoder()
for i in range(4,8):
    newdb.iloc[:,i] = LabelEncoder_df.fit_transform(newdb.iloc[:,i])

In [242]:
newdb = newdb.dropna()
newdb.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,1,1,1,0,23.0,1248,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,1,1,1,2,21.1,1498,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,3,1,1,4,17.0,1497,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,1,1,1,0,23.0,1396,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,3,1,1,0,16.0,1298,88.2,5.0


#### Split data to x and y datasets

In [243]:
X = newdb.iloc[:,3:11].astype('float')
Y = newdb.iloc[:,2].astype('float')

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.4,random_state=0)

#### Normalizing for Testing Purposes. Try MinMaxScaler, Normalizer, and StandardScaler
##### I got bad results

In [244]:
#    sc = MinMaxScaler()
#    X_test = pd.DataFrame(sc.fit_transform(X_test))
#    X_train = pd.DataFrame(sc.fit_transform(X_train))
#    X_train.head()

#### Creating a function for the models 

In [245]:
def models(X_train, Y_train):

    #Linear Regression Model
    lin = LinearRegression()
    lin.fit(X_train,Y_train)


    #Decision Tree Model
    tree1 = DecisionTreeClassifier()
    tree1.fit(X_train,Y_train)

    tree = DecisionTreeRegressor()
    tree.fit(X_train,Y_train)

    forest = RandomForestRegressor()
    forest.fit(X_train,Y_train)

    #KNN Classifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train,Y_train)

    #print accuracies of each model 
    print("[0] Linear Regression Training Accuracy: ", lin.score(X_train,Y_train))  
    print("[1] Decision Tree Classifier Training Accuracy: ", tree1.score(X_train,Y_train))   
    print("[2] Decision Tree Regressor Training Accuracy: ", tree.score(X_train,Y_train))  
    print("[3] Random Forest Regressor Training Accuracy: ", forest.score(X_train,Y_train))  
    print("[4] KNN Classifier Training Accuracy: ", neigh.score(X_train, Y_train))
    return lin, tree1, tree, forest, neigh


#### Find the Accuracies of each model on Training Dataset

In [246]:
model = models(X_train, Y_train)

[0] Linear Regression Training Accuracy:  0.6663558663174884
[1] Decision Tree Classifier Training Accuracy:  0.8749736453721273
[2] Decision Tree Regressor Training Accuracy:  0.9990803420446591
[3] Random Forest Regressor Training Accuracy:  0.9901339367232214
[4] KNN Classifier Training Accuracy:  0.44676365169723803


#### Now Find the Accuracies of each model on Testing Dataset

In [247]:
for i in range (4):
    preds = model[i].predict(X_test)
    r2 = r2_score(Y_test, preds)
    print("Accuracy for Model:", i+1, " is ", r2*100, ' %')

Accuracy for Model: 1  is  66.19558082592893  %
Accuracy for Model: 2  is  88.62278338373976  %
Accuracy for Model: 3  is  91.0577399461412  %
Accuracy for Model: 4  is  94.45524659918765  %


#### Prediction of Model vs Actual Data

In [248]:
for i in range(4):
    print('\n Model Prediction: ',i+1,preds)
    print('\n Actual', Y_test.values)
    print("______________________________________________")


 Model Prediction:  1 [452096.65666667 779000.         195561.42857143 ... 740000.
 799464.4047619  445361.48666667]

 Actual [430000. 779000. 200000. ... 740000. 900000. 355000.]
______________________________________________

 Model Prediction:  2 [452096.65666667 779000.         195561.42857143 ... 740000.
 799464.4047619  445361.48666667]

 Actual [430000. 779000. 200000. ... 740000. 900000. 355000.]
______________________________________________

 Model Prediction:  3 [452096.65666667 779000.         195561.42857143 ... 740000.
 799464.4047619  445361.48666667]

 Actual [430000. 779000. 200000. ... 740000. 900000. 355000.]
______________________________________________

 Model Prediction:  4 [452096.65666667 779000.         195561.42857143 ... 740000.
 799464.4047619  445361.48666667]

 Actual [430000. 779000. 200000. ... 740000. 900000. 355000.]
______________________________________________


# The End