In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
#reading the dataset
df = pd.read_csv("ecoli.csv")
df.head()

Unnamed: 0,sequence_name,mcg,gvh,lip,chg,aac,alm1,alm2,lsp
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [5]:
#no. of rows x columns
df.shape

(336, 9)

In [7]:
#first column is not needed so droping it from the dataframe
df.drop(["sequence_name"], axis=1, inplace=True)
df.head()

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,lsp
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [13]:
#no. of classes = unique values in y column that is lsp
print("No. of classes =", df.lsp.nunique())
df.lsp.unique()

No. of classes = 8


array(['cp', 'im', 'imS', 'imL', 'imU', 'om', 'omL', 'pp'], dtype=object)

In [14]:
#splitting dataset into train and test
train, test = train_test_split(df, test_size=0.3)
print(train.shape)
print(test.shape)

(235, 8)
(101, 8)


In [15]:
#splitting train and test into X and Y
train_x = train.iloc[:, :-1]
train_y = train.iloc[:, -1:]
test_x = test.iloc[:, :-1]
test_y = test.iloc[:, -1:]

In [19]:
#final shapes of the datasets
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

(235, 7)
(101, 7)
(235, 1)
(101, 1)


In [20]:
#model
clf_entropy = dtc(criterion="entropy")
fit2 = clf_entropy.fit(train_x, train_y)

In [22]:
#prediction
pred_entropy = fit2.predict(test_x)

In [23]:
pred_entropy

array(['pp', 'cp', 'pp', 'cp', 'pp', 'im', 'cp', 'im', 'cp', 'im', 'cp',
       'cp', 'om', 'pp', 'pp', 'pp', 'im', 'im', 'im', 'cp', 'imU', 'im',
       'im', 'cp', 'cp', 'cp', 'im', 'im', 'pp', 'om', 'cp', 'im', 'imU',
       'imS', 'im', 'imU', 'pp', 'im', 'im', 'cp', 'om', 'cp', 'pp', 'cp',
       'cp', 'imU', 'pp', 'im', 'cp', 'cp', 'omL', 'cp', 'cp', 'cp',
       'imS', 'pp', 'pp', 'im', 'imS', 'pp', 'pp', 'cp', 'pp', 'im', 'im',
       'cp', 'im', 'cp', 'im', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'im',
       'im', 'cp', 'cp', 'pp', 'im', 'pp', 'pp', 'im', 'imS', 'cp', 'im',
       'om', 'pp', 'pp', 'im', 'imS', 'pp', 'imU', 'om', 'pp', 'cp', 'cp',
       'imU', 'cp', 'imU'], dtype=object)

In [25]:
#accuracy of the model, comparing the true value with the predicted values
accuracy_score(test_y, pred_entropy)

0.7227722772277227

In [26]:
#confusion_matrix(y_true, y_pred)
confusion_matrix(test_y, pred_entropy)

array([[34,  2,  0,  1,  0,  0,  0,  3],
       [ 0, 18,  0,  1,  4,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  5,  0,  1,  3,  0,  0,  0],
       [ 0,  0,  0,  2,  0,  3,  0,  3],
       [ 0,  0,  0,  0,  0,  0,  1,  1],
       [ 1,  0,  0,  0,  0,  2,  0, 14]])