# import important libraries

In [1]:
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score,
                            confusion_matrix,
                            classification_report)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

# Help functions 

#### convert object data type to float

In [2]:
# convert object to float
def obj_to_float(df,columns):
    for i in columns:
        df[f'variable{i}'] = df[f'variable{i}'].str.replace(',','.').astype('float64')


#### fill NaN values

In [3]:
# fill na values
def fill_na(df):
    for col in df.columns:
        if df[col].dtype=='O':
            df[col].fillna(df[col].mode(),inplace=True)
        else:
            df[col].fillna(df[col].mean(),inplace=True)

# load data

In [4]:
train_data = pd.read_csv('training.csv',delimiter=';')

# explore Data

In [5]:
train_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,1792,0.00054,u,g,c,v,175,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,1692,0.00335,y,p,k,v,29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,3125,1125.0,u,g,ff,ff,0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,4817,1335.0,u,g,i,o,335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,3233,35.0,u,g,k,v,5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


#### values in variable14 is the same values in variable 17 but the variable 17 is multiplication 10000

<font color='red'>remove variable17</font>

In [18]:
train_data = train_data.drop(columns=['variable17'],axis=1)

In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3700 entries, 0 to 3699
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variable1   3661 non-null   object 
 1   variable2   3661 non-null   object 
 2   variable3   3700 non-null   object 
 3   variable4   3636 non-null   object 
 4   variable5   3636 non-null   object 
 5   variable6   3634 non-null   object 
 6   variable7   3634 non-null   object 
 7   variable8   3700 non-null   object 
 8   variable9   3700 non-null   object 
 9   variable10  3700 non-null   object 
 10  variable11  3700 non-null   int64  
 11  variable12  3700 non-null   object 
 12  variable13  3700 non-null   object 
 13  variable14  3600 non-null   float64
 14  variable15  3700 non-null   int64  
 15  variable18  1555 non-null   object 
 16  variable19  3700 non-null   int64  
 17  classLabel  3700 non-null   object 
dtypes: float64(1), int64(3), object(14)
memory usage: 520.4+ KB


In [20]:
obj_to_float(train_data,[2,3,8])

In [21]:
train_data = train_data.fillna(method='ffill')

In [22]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3700 entries, 0 to 3699
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variable1   3700 non-null   object 
 1   variable2   3700 non-null   float64
 2   variable3   3700 non-null   float64
 3   variable4   3700 non-null   object 
 4   variable5   3700 non-null   object 
 5   variable6   3700 non-null   object 
 6   variable7   3700 non-null   object 
 7   variable8   3700 non-null   float64
 8   variable9   3700 non-null   object 
 9   variable10  3700 non-null   object 
 10  variable11  3700 non-null   int64  
 11  variable12  3700 non-null   object 
 12  variable13  3700 non-null   object 
 13  variable14  3700 non-null   float64
 14  variable15  3700 non-null   int64  
 15  variable18  3700 non-null   object 
 16  variable19  3700 non-null   int64  
 17  classLabel  3700 non-null   object 
dtypes: float64(4), int64(3), object(11)
memory usage: 520.4+ KB


In [23]:
def cat_to_numeric(df):
    lables = {}
    for col in df.columns:
        if df[col].dtype=='O':
            lables[col] = LabelEncoder()
            df[col]=lables[col].fit_transform(df[col])
    return lables

In [24]:
labels = cat_to_numeric(train_data)

In [108]:
# lables['classLabel'].inverse_transform([1])

array(['yes.'], dtype=object)

In [25]:
train_data

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable18,variable19,classLabel
0,0,17.92,0.000054,1,0,2,7,1.750,0,1,1,1,0,80.0,5,1,0,0
1,1,16.92,0.000034,2,2,9,7,0.290,0,0,0,0,2,200.0,0,1,0,0
2,1,31.25,0.000112,1,0,6,2,0.000,0,1,1,0,0,96.0,19,1,0,0
3,0,48.17,0.000133,1,0,7,6,0.335,0,0,0,0,0,0.0,120,1,0,0
4,1,32.33,0.000350,1,0,9,7,0.500,0,0,0,1,0,232.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695,0,18.75,0.000750,1,0,11,7,2.710,1,1,5,0,0,260.0,26726,1,1,1
3696,0,23.50,0.000900,1,0,11,7,8.500,1,1,5,1,0,120.0,0,1,1,1
3697,1,34.17,0.000917,1,0,2,7,4.500,1,1,12,1,0,0.0,221,1,1,1
3698,1,27.83,0.000154,1,0,0,7,3.750,1,1,5,1,0,100.0,3,1,1,1


# Train Model 

In [26]:
X,y = train_data.drop(['classLabel'],axis=1),train_data['classLabel']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
tree = DecisionTreeClassifier()
cls = tree.fit(X_train,y_train)
y_pred = cls.predict(X_test)

In [28]:
result = confusion_matrix(y_test,y_pred)
result

array([[ 71,   0],
       [  0, 669]], dtype=int64)

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00       669

    accuracy                           1.00       740
   macro avg       1.00      1.00      1.00       740
weighted avg       1.00      1.00      1.00       740



In [30]:
accuracy_score(y_test,y_pred)

1.0

# load and test Validation data

In [44]:
validation_data = pd.read_csv('validation.csv',delimiter=';')
validation_data = validation_data.drop(columns=['variable17'],axis=1)

In [45]:
validation_data.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable18,variable19,classLabel
0,b,3233,75.0,u,g,e,bb,1585,t,f,0,t,s,420.0,0,,1,no.
1,b,2358,179.0,u,g,c,v,54,f,f,0,t,g,136.0,1,,0,no.
2,b,3642,0.00075,y,p,d,v,585,f,f,0,f,g,240.0,3,,1,no.
3,b,1842,10415.0,y,p,aa,v,125,t,f,0,f,g,120.0,375,,0,no.
4,b,245,13335.0,y,p,aa,v,4,f,f,0,t,g,120.0,475,f,1,no.


In [46]:
obj_to_float(validation_data,[2,3,8])

In [47]:
validation_data['variable18'].fillna(validation_data['variable18'].mode(),inplace=True)
validation_data = validation_data.fillna(method='ffill')

In [48]:
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variable1   200 non-null    object 
 1   variable2   200 non-null    float64
 2   variable3   200 non-null    float64
 3   variable4   200 non-null    object 
 4   variable5   200 non-null    object 
 5   variable6   200 non-null    object 
 6   variable7   200 non-null    object 
 7   variable8   200 non-null    float64
 8   variable9   200 non-null    object 
 9   variable10  200 non-null    object 
 10  variable11  200 non-null    int64  
 11  variable12  200 non-null    object 
 12  variable13  200 non-null    object 
 13  variable14  200 non-null    float64
 14  variable15  200 non-null    int64  
 15  variable18  200 non-null    object 
 16  variable19  200 non-null    int64  
 17  classLabel  200 non-null    object 
dtypes: float64(4), int64(3), object(11)
memory usage: 28.2+ KB


In [49]:
labels_validate = cat_to_numeric(validation_data)

In [50]:
validation_data

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable18,variable19,classLabel
0,1,32.33,0.000750,0,0,5,0,1.585,1,0,0,1,2,420.0,0,0,1,0
1,1,23.58,0.000179,0,0,2,6,0.540,0,0,0,1,0,136.0,1,0,0,0
2,1,36.42,0.000075,1,1,4,6,0.585,0,0,0,0,0,240.0,3,0,1,0
3,1,18.42,0.001042,1,1,1,6,0.125,1,0,0,0,0,120.0,375,0,0,0
4,1,24.50,0.001334,1,1,1,6,0.040,0,0,0,1,0,120.0,475,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,23.08,0.001150,0,0,7,6,3.500,1,1,9,0,0,56.0,742,1,0,1
196,0,57.08,0.001950,0,0,2,6,5.500,1,1,7,0,0,0.0,3000,1,1,1
197,1,30.83,0.000000,0,0,0,6,1.250,1,1,1,0,0,202.0,0,1,0,1
198,1,19.67,0.000037,0,0,11,6,2.000,1,1,2,1,0,80.0,0,1,1,1


In [51]:
X,actual = validation_data.drop(columns=['variable18'],axis=1),validation_data['variable18']

In [52]:
predict = cls.predict(X)

In [66]:
result = confusion_matrix(actual,predict)
result

array([[90, 15],
       [17, 78]], dtype=int64)

In [54]:
accuracy_score(actual,predict)

0.84

# Save model 
#### Using pickle

In [55]:
wide_bot_model = 'DT_model_widebot.sav'
pickle.dump(cls, open(wide_bot_model, 'wb'))


In [30]:
# loaded_model = pickle.load(open(wide_bot_model, 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(result)

1.0


# test Microservice

In [64]:
import json 
import requests
url  =  'http://127.0.0.1:5000/api'
data = json.dumps({'variable1':1,'variable2':32.33,'variable3':0.000750,
                   'variable4':0,'variable5':0,'variable6':5,
                    'variable7':0,'variable8':1.585,'variable9':1,
                    'variable10':0,'variable11':0,'variable12':1,
                    'variable13':2,'variable14':420.0,'variable15':0,
                    'variable16':2,'variable18':0,
                    'variable19':1,
})

r = requests.post(url,data)

In [65]:
r.json()

{'result': 'yes.'}