In [101]:
from google.colab import drive
drive.mount('/content/drive/',force_remount=True)

Mounted at /content/drive/


In [102]:
import pandas as pd                               #here we imported libraries which are useful for project 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


In [103]:
!pip install --upgrade category_encoders #installed category encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [104]:
data = pd.read_csv('/content/drive/MyDrive/Projects/CarProject/cardata.txt')       # data loaded

In [105]:
data.head()            #here we see top 5 rows using head()

Unnamed: 0,med,med.1,3,more,small,low,unacc
0,high,low,4,4,med,high,acc
1,vhigh,vhigh,3,more,big,med,unacc
2,high,vhigh,3,4,big,med,unacc
3,med,low,3,more,big,high,vgood
4,vhigh,low,4,4,big,med,acc


### Feature engineering on Data Dataset

In [106]:
data.shape # here we finding the shape of the data

(1727, 7)

#### Finding Missing value
###### In the below code we found that there are no null values in the data

In [107]:
data.isnull().sum()

med      0
med.1    0
3        0
more     0
small    0
low      0
unacc    0
dtype: int64

In [108]:
data_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'] 
data.columns = data_col_names    # In data there is no proper index, so we are changing the index here 
data_col_names

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [109]:
data.info() #here we are checking information of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [110]:
data_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in data_col_names:
    print(data[col].value_counts())       #here we counting each feature value

high     432
vhigh    432
low      432
med      431
Name: buying, dtype: int64
low      432
vhigh    432
high     432
med      431
Name: maint, dtype: int64
4        432
2        432
5more    432
3        431
Name: doors, dtype: int64
4       576
2       576
more    575
Name: persons, dtype: int64
med      576
big      576
small    575
Name: lug_boot, dtype: int64
high    576
med     576
low     575
Name: safety, dtype: int64
unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64


In [111]:
#Declare Dependent and Independent Variable

X = data.drop(['class'], axis=1)
y = data['class']

In [112]:
#here we are doing encoding
import category_encoders as ce
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X = encoder.fit_transform(X)  

In [113]:
# split X and y into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [114]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((1157, 6), (570, 6))

In [115]:
#performing encoding on X_train,X_test

encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [116]:
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
83,1,1,1,1,1,1
48,2,1,2,2,1,2
468,3,2,2,2,2,2
155,4,2,2,2,2,1
1043,1,3,3,3,2,1


In [117]:
X_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
599,4,2,2,3,3,2
932,3,1,2,3,2,3
628,1,2,3,3,1,2
1497,3,1,2,2,2,2
1262,2,3,2,3,1,2


# Part 1
## Applying Navie bayas Algorithm

In [118]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB()

In [119]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)

In [120]:
# finding accuracy
print("accuracy(in%) :", metrics.accuracy_score(y_test,y_pred)*100) 

accuracy(in%) : 62.28070175438597


# Part 2
## KFlod

In [121]:
# Applying KFold

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
model = GaussianNB()
kfold_validation = KFold(10)

In [122]:
results = cross_val_score(model,X,y,cv=kfold_validation) #cv knows as cross validation
print(results) 
print(np.mean(results)) 

[0.71098266 0.69942197 0.61849711 0.6416185  0.63583815 0.69942197
 0.63583815 0.68023256 0.68604651 0.61627907]
0.6624176636644711


In [123]:
#confusion matrix on Data dataset
cm = confusion_matrix(y_test,y_pred)
print("confusion Matrix is:",cm,sep='\n')

confusion Matrix is:
[[ 18   0  38  66]
 [  4   0   4   8]
 [ 10   0 321  85]
 [  0   0   0  16]]


## Finding probability of each value in Class variable

In [124]:
import random 
def random_class():
  return random.choice(["unacc","acc","good","vgood"])

In [125]:
from statistics import mean
import scipy
from scipy.stats import norm

In [126]:
def fit_distribution(X):
	# estimate parameters
	mu = mean(X)
	sigma = np.std(X)
	print(mu, sigma)
	# fit distribution
	dist = norm(mu, sigma)
	return dist

In [127]:
Xy0 = X[y == 'unacc']
Xy1 = X[y == 'acc']
Xy2 = X[y == 'good']
Xy3 = X[y == 'vgood']
print(Xy0.shape, Xy1.shape,Xy2.shape,Xy3.shape)



(1209, 6) (384, 6) (69, 6) (65, 6)


##### In the below code we found each value probability percentage of Class Variable
##### we got :
##### unacc - 70%, acc - 22, good - 3.9%, vgood - 3.7%

In [128]:
# calculate priors
priory0 = len(Xy0) / len(X)
priory1 = len(Xy1) / len(X)
priory2 = len(Xy2) / len(X)
priory3 = len(Xy3) / len(X)
print(priory0, priory1,priory2,priory3)

0.7000579038795599 0.22235089751013318 0.039953676896352056 0.03763752171395483


In [129]:
# classify with one example
#Xsample, ysample = X["class"], y["class"]

In [158]:
X$output = y_pred
write.csv(X_test,'Car_Evaluation_OP.csv')

SyntaxError: ignored

# Part 3
## Now we are working on individual train and test dataset

### Train Data

In [130]:
train = pd.read_csv('/content/drive/MyDrive/Projects/CarProject/car.train.txt')
test = pd.read_csv('/content/drive/MyDrive/Projects/CarProject/cartest.txt')

In [131]:
train.head()

Unnamed: 0,med,med.1,3,more,small,low,unacc
0,high,low,4,4,med,high,acc
1,vhigh,vhigh,3,more,big,med,unacc
2,high,vhigh,3,4,big,med,unacc
3,med,low,3,more,big,high,vgood
4,vhigh,low,4,4,big,med,acc


##### Doing Analysis and Feature engineering on train data

In [132]:
train.shape

(1399, 7)

In [133]:
train.isna().sum()

med      0
med.1    0
3        0
more     0
small    0
low      0
unacc    0
dtype: int64

In [134]:
train_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
train.columns = train_col_names
train_col_names

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [135]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1399 entries, 0 to 1398
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1399 non-null   object
 1   maint     1399 non-null   object
 2   doors     1399 non-null   object
 3   persons   1399 non-null   object
 4   lug_boot  1399 non-null   object
 5   safety    1399 non-null   object
 6   class     1399 non-null   object
dtypes: object(7)
memory usage: 76.6+ KB


In [136]:
train_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in train_col_names:
    print(train[col].value_counts())

vhigh    354
high     352
med      352
low      341
Name: buying, dtype: int64
low      357
vhigh    353
high     346
med      343
Name: maint, dtype: int64
5more    356
4        352
3        346
2        345
Name: doors, dtype: int64
2       470
4       465
more    464
Name: persons, dtype: int64
med      468
big      468
small    463
Name: lug_boot, dtype: int64
low     474
high    472
med     453
Name: safety, dtype: int64
unacc    991
acc      305
good      54
vgood     49
Name: class, dtype: int64


In [137]:
#Declaring Dependent and Independent Variable for train data

x_train = train.drop(['class'], axis=1)

y_train = train['class']

In [138]:
#Encode categorical variables on train data

x_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,high,low,4,4,med,high
1,vhigh,vhigh,3,more,big,med
2,high,vhigh,3,4,big,med
3,med,low,3,more,big,high
4,vhigh,low,4,4,big,med


In [139]:
#We can see above that all the variables are ordinal categorical data type.
# encode variables with ordinal encoding

encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])


x_train = encoder.fit_transform(x_train)

In [140]:
x_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1,1,1,1,1,1
1,2,2,2,2,2,2
2,1,2,2,1,2,2
3,3,1,2,2,2,1
4,2,1,1,1,2,2


### Test Data

In [141]:
test.head()

Unnamed: 0,vhigh,med,5more,4,med.1,low,unacc
0,high,med,3,2,small,high,unacc
1,vhigh,med,2,4,big,high,acc
2,vhigh,high,4,2,big,low,unacc
3,vhigh,vhigh,3,4,big,med,unacc
4,med,high,3,more,small,low,unacc


#### Doing Analysis and Feature engineering on Test data

In [142]:
test.shape

(327, 7)

In [143]:
test.isna().sum()

vhigh    0
med      0
5more    0
4        0
med.1    0
low      0
unacc    0
dtype: int64

In [144]:
test_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
test.columns = test_col_names
test_col_names

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [145]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327 entries, 0 to 326
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    327 non-null    object
 1   maint     327 non-null    object
 2   doors     327 non-null    object
 3   persons   327 non-null    object
 4   lug_boot  327 non-null    object
 5   safety    327 non-null    object
 6   class     327 non-null    object
dtypes: object(7)
memory usage: 18.0+ KB


In [146]:
test_col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in test_col_names:
    print(test[col].value_counts())

low      91
high     80
med      79
vhigh    77
Name: buying, dtype: int64
med      87
high     86
vhigh    79
low      75
Name: maint, dtype: int64
2        87
3        85
4        80
5more    75
Name: doors, dtype: int64
more    111
4       110
2       106
Name: persons, dtype: int64
small    112
big      108
med      107
Name: lug_boot, dtype: int64
med     123
high    104
low     100
Name: safety, dtype: int64
unacc    217
acc       79
vgood     16
good      15
Name: class, dtype: int64


In [147]:
#Declare Dependent and Independent Variable for test data
x_test = test.drop(['class'], axis=1)

y_test = test['class']

In [148]:
#Encode categorical variables on test data

x_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,high,med,3,2,small,high
1,vhigh,med,2,4,big,high
2,vhigh,high,4,2,big,low
3,vhigh,vhigh,3,4,big,med
4,med,high,3,more,small,low


In [149]:
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])

x_test = encoder.fit_transform(x_test)

x_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1,1,1,1,1,1
1,2,1,2,2,2,1
2,2,2,3,1,2,2
3,2,3,1,2,2,3
4,3,2,1,3,1,2


#### Applying Navie bayas Algorithm

In [150]:
gnb = GaussianNB()
gnb.fit(x_train,y_train)

GaussianNB()

In [151]:
## making predictions on the testing set
y_pred = gnb.predict(x_test)

In [152]:
print("accuracy(in%) :", metrics.accuracy_score(y_test,y_pred)*100)

accuracy(in%) : 55.96330275229357


#### KFold

In [153]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
model = GaussianNB()
kfold_validation = KFold(10)

Mam check results-55 column(next line code), I'm unable to insert X,y values from individual train and test data sets. SO I USED X,y FROM DATA thats the reason i got 66% for below code but i have to get 55% as i got in print(accuracy(in%)-53 column above KFold.


In [154]:
results = cross_val_score(model,X,y,cv=kfold_validation) 
print(results)
print(np.mean(results))

[0.71098266 0.69942197 0.61849711 0.6416185  0.63583815 0.69942197
 0.63583815 0.68023256 0.68604651 0.61627907]
0.6624176636644711


# Confusion matix for train and test datasets

In [155]:
cm = confusion_matrix(y_test,y_pred)
print("confusion Matrix is:",cm,sep='\n')

confusion Matrix is:
[[  3   0  56  20]
 [  0   0   9   6]
 [ 14   5 164  34]
 [  0   0   0  16]]
