In [125]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [126]:
data=pd.read_csv("cars_dataset.csv",na_values=['?','-10000'])
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
0,vhigh,vhigh,two,two,small,low,unacc
1,vhigh,vhigh,two,two,small,med,unacc
2,vhigh,vhigh,two,two,small,high,unacc
3,vhigh,vhigh,two,two,med,low,unacc
4,vhigh,vhigh,two,two,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [127]:
data.describe(include='all')

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,5more,four,big,low,unacc
freq,432,432,432,576,576,576,1210


In [128]:
data.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
car         0
dtype: int64

In [129]:
data.maint.value_counts()

vhigh    432
low      432
med      432
high     432
Name: maint, dtype: int64

In [130]:
cat_var=list(data.dtypes[data.dtypes==object].index)
cat_var.remove('car')
X=data[cat_var]
Y=data["car"]
for variable in cat_var:
    X[variable].fillna("Missing", inplace=True)
    dummies = pd.get_dummies(X[variable], prefix=variable)
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [131]:
list(data.dtypes[data.dtypes==object].index)

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'car']

In [132]:
X.columns

Index(['buying_high', 'buying_low', 'buying_med', 'buying_vhigh', 'maint_high',
       'maint_low', 'maint_med', 'maint_vhigh', 'doors_5more', 'doors_four',
       'doors_three', 'doors_two', 'persons_four', 'persons_more',
       'persons_two', 'lug_boot_big', 'lug_boot_med', 'lug_boot_small',
       'safety_high', 'safety_low', 'safety_med'],
      dtype='object')

In [133]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=100,test_size=0.2)

In [134]:
scaler=StandardScaler()
scaled_X_train=scaler.fit_transform(X_train)
scaled_X_test=scaler.transform(X_test)
model=RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=-1)
model.fit(scaled_X_train,Y_train)
model.oob_score_

0.9703328509406657

In [135]:
model.feature_importances_

array([0.03906036, 0.04919914, 0.03659731, 0.04428898, 0.03668584,
       0.04371455, 0.03454051, 0.05239837, 0.01505786, 0.01635398,
       0.01593342, 0.02947288, 0.04736117, 0.04350101, 0.13672449,
       0.03311129, 0.02100713, 0.04487451, 0.07968732, 0.13018751,
       0.05024236])

In [136]:
pd.Series(model.feature_importances_, index=X.columns)

buying_high       0.039060
buying_low        0.049199
buying_med        0.036597
buying_vhigh      0.044289
maint_high        0.036686
maint_low         0.043715
maint_med         0.034541
maint_vhigh       0.052398
doors_5more       0.015058
doors_four        0.016354
doors_three       0.015933
doors_two         0.029473
persons_four      0.047361
persons_more      0.043501
persons_two       0.136724
lug_boot_big      0.033111
lug_boot_med      0.021007
lug_boot_small    0.044875
safety_high       0.079687
safety_low        0.130188
safety_med        0.050242
dtype: float64

In [137]:
y_pred=model.predict(scaled_X_test)

In [138]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred,Y_test)

array([[ 66,   3,   3,   3],
       [  1,   9,   0,   0],
       [  2,   0, 246,   0],
       [  0,   2,   0,  11]], dtype=int64)

In [139]:
y_pred

array(['unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc',
       'unacc', 'acc', 'unacc', 'acc', 'acc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc',
       'unacc', 'vgood', 'unacc', 'acc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'acc', 'acc', 'acc', 'vgood', 'unacc', 'unacc',
       'good', 'good', 'vgood', 'unacc', 'unacc', 'acc', 'unacc', 'unacc',
       'unacc', 'unacc', 'vgood', 'unacc', 'unacc', 'unacc', 'good',
       'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc',
       'unacc', 'vgood', 'unacc', 'vgood', 'unacc', 'vgood', 'unacc',
       'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'acc',
       'acc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'acc',
       'acc', 'good', 'unacc', 'acc', 'good', 'unacc', 'unacc', 'unacc',
       'vgood', 'unacc', 'unacc', 'unacc', 'acc', 'unacc

In [140]:
Y_test

27      unacc
1156    unacc
1668    unacc
1622    unacc
692       acc
        ...  
275     unacc
999     unacc
642     unacc
1197    unacc
1555      acc
Name: car, Length: 346, dtype: object

In [141]:
Y_test.value_counts()

unacc    249
acc       69
vgood     14
good      14
Name: car, dtype: int64

In [142]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,Y_test)

0.9595375722543352

In [143]:
data_new=pd.DataFrame()
data_new['y_pred']=y_pred

In [144]:
data_new.y_pred.value_counts()

unacc    248
acc       75
vgood     13
good      10
Name: y_pred, dtype: int64