# Import Library

In [581]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import sklearn untuk standardscaler standarisasi data
from sklearn.preprocessing import StandardScaler 

# Import sklearn untuk menangani nilai NULL
from sklearn.impute import SimpleImputer

# Import sklearn untuk training data
from sklearn.model_selection import train_test_split

# Import sklearn untuk KNN
from sklearn.neighbors import KNeighborsClassifier

# Import sklearn untuk Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Data Cleaning pada dataset

### Menangani nilai NULL

In [152]:
# Load the CSV into a DataFrame. Despite the name, `data_clean` still holds
# the uncleaned data at this point; the cells below clean it step by step.
data_clean = pd.read_csv(filepath_or_buffer='CO2 Emissions_Canada_Rusak.csv')

In [153]:
# Preview the first five rows of the freshly loaded data.
data_clean.head(n=5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,
1,,ILX,COMPACT,2.4,4,M6,Z,11.2,,9.6,29,
2,ACURA,,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,
3,ACURA,MDX 4WD,,3.5,6,AS6,Z,12.7,,11.1,25,
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,,8.7,10.6,27,244.0


In [154]:
# Show the dtype pandas inferred for every column.
data_clean.dtypes

Make                                 object
Model                                object
Vehicle Class                        object
Engine Size(L)                      float64
Cylinders                             int64
Transmission                         object
Fuel Type                            object
Fuel Consumption City (L/100 km)    float64
Fuel Consumption Hwy (L/100 km)     float64
Fuel Consumption Comb (L/100 km)    float64
Fuel Consumption Comb (mpg)           int64
CO2 Emissions(g/km)                 float64
dtype: object

In [155]:
# Count the missing values in each column before any imputation.
data_clean.isnull().sum()

Make                                6
Model                               4
Vehicle Class                       6
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    4
Fuel Consumption Hwy (L/100 km)     3
Fuel Consumption Comb (L/100 km)    3
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 7
dtype: int64

In [156]:
# Fill the missing CO2 emission values with the column median.
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# train/test split further down, so test rows contribute to the fill value.
imputer = SimpleImputer(strategy='median')
co2_filled = imputer.fit_transform(data_clean[['CO2 Emissions(g/km)']])
data_clean['CO2 Emissions(g/km)'] = co2_filled

In [157]:
# Fill the gaps in the three fuel-consumption (L/100 km) columns with the mean
# of the respective column.
imputer = SimpleImputer(strategy='mean')
for fuel_column in (
    'Fuel Consumption Comb (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption City (L/100 km)',
):
    # The imputer is re-fitted on each column, so every column gets its own mean.
    data_clean[fuel_column] = imputer.fit_transform(data_clean[[fuel_column]])

In [158]:
# Fill the missing text columns with each column's most frequent value.
# NOTE(review): the columns are imputed independently of one another, so a
# filled-in Make/Model pair is not guaranteed to be a real combination.
imputer = SimpleImputer(strategy='most_frequent')
for text_column in ('Make', 'Model', 'Vehicle Class'):
    data_clean[text_column] = imputer.fit_transform(data_clean[[text_column]])

In [159]:
# Re-count the missing values per column after imputation.
data_clean.isnull().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

In [160]:
# Preview the first five rows now that the gaps have been filled.
data_clean.head(n=5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,246.0
1,FORD,ILX,COMPACT,2.4,4,M6,Z,11.2,9.04217,9.6,29,246.0
2,ACURA,F-150 FFV,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,246.0
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.04217,11.1,25,246.0
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.556171,8.7,10.6,27,244.0


### Menangani nilai duplikat

In [161]:
# List every row that repeats an earlier row across all columns
# (the first occurrence of each group is not flagged).
data_clean.loc[data_clean.duplicated(keep='first')]

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
1082,ASTON MARTIN,DB9,MINICOMPACT,5.9,12,A6,Z,18.0,12.6,15.6,18,359.0
1105,AUDI,A6 QUATTRO TDI (modified),MID-SIZE,3.0,6,AS8,D,9.8,6.2,8.1,35,217.0
1107,AUDI,A7 QUATTRO TDI (modified),MID-SIZE,3.0,6,AS8,D,9.8,6.2,8.1,35,217.0
1110,AUDI,A8 TDI (modified),MID-SIZE,3.0,6,AS8,D,9.8,6.5,8.4,34,224.0
1114,AUDI,A8L TDI (modified),FULL-SIZE,3.0,6,AS8,D,9.8,6.5,8.4,34,224.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7356,TOYOTA,Tundra,PICKUP TRUCK - STANDARD,5.7,8,AS6,X,17.7,13.6,15.9,18,371.0
7365,VOLKSWAGEN,Golf GTI,COMPACT,2.0,4,M6,X,9.8,7.3,8.7,32,203.0
7366,VOLKSWAGEN,Jetta,COMPACT,1.4,4,AS8,X,7.8,5.9,7.0,40,162.0
7367,VOLKSWAGEN,Jetta,COMPACT,1.4,4,M6,X,7.9,5.9,7.0,40,163.0


In [162]:
# Drop exact duplicate rows (the first occurrence of each is kept) and rebuild
# a contiguous 0..n-1 index. Without the reset the surviving rows keep their
# original labels with gaps, which no longer line up with the default index of
# the frame that is later built from the scaled NumPy array.
# Reassigning instead of using inplace=True keeps the step explicit.
data_clean = data_clean.drop_duplicates().reset_index(drop=True)

In [163]:
# Show any rows still flagged as duplicates after the drop.
data_clean.loc[data_clean.duplicated(keep='first')]

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)


In [164]:
# Number of rows still flagged as duplicates.
data_clean.duplicated(keep='first').sum()

0

In [165]:
# Preview the first five rows of the de-duplicated data.
data_clean.head(n=5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,246.0
1,FORD,ILX,COMPACT,2.4,4,M6,Z,11.2,9.04217,9.6,29,246.0
2,ACURA,F-150 FFV,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,246.0
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.04217,11.1,25,246.0
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.556171,8.7,10.6,27,244.0


# Standardisasi data untuk hasil yang lebih akurat

In [490]:
# Take an independent (deep) copy of the cleaned data for the scaling step.
data_clean_standard = data_clean.copy(deep=True)

In [494]:
# Population standard deviation (ddof=0, the same definition np.std uses) of
# every numeric column before scaling.
# numeric_only=True skips the text columns explicitly: np.std(DataFrame) relied
# on pandas silently dropping them, which raises TypeError on pandas >= 2.0.
data_clean_standard.std(ddof=0, numeric_only=True)

Engine Size(L)                       1.364840
Cylinders                            1.845843
Fuel Consumption City (L/100 km)     3.551712
Fuel Consumption Hwy (L/100 km)      2.277975
Fuel Consumption Comb (L/100 km)     2.945963
Fuel Consumption Comb (mpg)          7.243341
CO2 Emissions(g/km)                 59.248665
dtype: float64

In [497]:
# Scaler that centres each feature on its mean and divides by its standard
# deviation (both are the defaults, spelled out here).
standard_scaler = StandardScaler(with_mean=True, with_std=True)

In [498]:
# Numeric columns to standardise, in the order they appear in the scaled array.
numeric_columns = [
    'Engine Size(L)',
    'Cylinders',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (L/100 km)',
    'Fuel Consumption Comb (mpg)',
    'CO2 Emissions(g/km)',
]
# Fit the scaler on these columns and transform them in one step.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
x_standard = standard_scaler.fit_transform(data_clean_standard[numeric_columns])

In [499]:
# Standard deviation of all scaled values flattened together. This is a single
# scalar for the whole array, not a per-column check.
x_standard.std()

1.0

In [501]:
# Wrap the scaled array in a DataFrame. No labels are passed, so the columns
# are the integers 0..6 and the index is a fresh 0..n-1 range.
data_clean_baru = pd.DataFrame(data=x_standard)

In [502]:
# Preview the first five rows of the standardised features.
data_clean_baru.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,-0.851504,-0.877427,-0.763207,-1.040898,-0.855017,0.771855,-0.087598
1,-0.558429,-0.877427,-0.397186,-0.012717,-0.481625,0.219623,-0.087598
2,-1.217847,-0.877427,-1.861269,-1.435986,-1.737581,2.842722,-0.087598
3,0.247526,0.206089,0.025145,-0.012717,0.027546,-0.332608,-0.087598
4,0.247526,0.206089,-0.01535,-0.162925,-0.142177,-0.056492,-0.121354


# Data Splitting

In [559]:
# Features: all seven standardised columns (labelled 0..6).
X = data_clean_baru[list(range(7))]
# Target: the fuel-type label taken from the cleaned, unscaled frame.
Y = data_clean.loc[:, 'Fuel Type']

In [560]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

# KNN

In [561]:
# k-nearest-neighbours classifier that votes among the 3 closest training rows.
modelKNN = KNeighborsClassifier(n_neighbors=3)

In [562]:
# Fit the KNN model on the training split.
modelKNN.fit(X=X_train, y=Y_train)

In [563]:
# Accuracy of the KNN model on the data it was trained on.
modelKNN.score(X=X_train, y=Y_train)

0.9404680754374006

In [564]:
# Accuracy of the KNN model on the held-out test split.
modelKNN.score(X=X_test, y=Y_test)

0.8855325914149443

# Naive Bayes

In [565]:
# Gaussian Naive Bayes classifier trained on the same training split.
gnb = GaussianNB()
# fit() returns the estimator itself; keeping the assignment means the cell
# produces no output, as before.
gnb = gnb.fit(X_train, Y_train)

In [566]:
# Predicted fuel-type labels for the held-out test rows.
gnb_predict = gnb.predict(X=X_test)

In [572]:
# Share of test rows whose predicted fuel type matches the true label.
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the correct order keeps this call safe to copy
# for metrics where the order matters.
accuracy_score(Y_test, gnb_predict)

0.5013248542660307

# Perbandingan KNN dan Naive Bayes

In [580]:
# Print the KNN and Naive Bayes scores together for comparison.
print("Hasil Score dari KNN : ")
print("Training set =", modelKNN.score(X_train, Y_train))
print("Testing set =", modelKNN.score(X_test, Y_test))
print("\n")
print("Hasil Score dari Naive Bayes : ")
# accuracy_score takes (y_true, y_pred): true labels first, predictions second.
print("Prediksi Akurasi =", accuracy_score(Y_test, gnb_predict))

Hasil Score dari KNN : 
Training set = 0.9404680754374006
Testing set = 0.8855325914149443


Hasil Score dari Naive Bayes : 
Prediksi Akurasi = 0.5013248542660307
