# Kutlwano Classification Model

We want to determine which model will provide the best predictions for our Medical Data. The best model will be determined from the following 5 models below:

* Logistic Regressions
* K Nearest Neighbors
* Decision Trees
* Random Forests
* Support Vector Machines

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sqlalchemy as sa
from sqlalchemy import create_engine

## Connect to Database

In [2]:
Username = 'admin'
Password = 'amazing_people'
Host = 'alphacode-explore.ccwgqdqrrmvt.eu-west-1.rds.amazonaws.com'
Port = '1433'
Database = 'Kutlwano'

In [3]:
connection = create_engine(
    'mssql+pymssql://' +
    Username + ':' + Password + '@' + Host + ':' + Port + '/' + Database
)

In [4]:
sql_query1 = "Select * from claims_data"
sql_query2 = "Select * from test_set"

In [5]:
df = pd.read_sql_query(sql_query1, connection)
df_test = pd.read_sql_query(sql_query2, connection)

### Pre-Processing

In [6]:
# Lables
y = df['insurance_claim']

# Features
X = df.drop(['insurance_claim', 'claim_amount'], axis=1)

In [7]:
# Transforming the Features
X_transformed = pd.get_dummies(X, drop_first=True)

## K Nearest Neighbour Model

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)
X_standardize = pd.DataFrame(X_scaled,columns=X_transformed.columns)

### Split the data into Training and Testing data

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_standardize, y, test_size=0.3, random_state=50)

### Training

In [11]:
from sklearn.neighbors import KNeighborsClassifier

In [12]:
knn = KNeighborsClassifier()

In [13]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Predicting

In [14]:
pred_knn = knn.predict(X_test)

### Testing

In [15]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Confusion Matrix

In [16]:
labels = ['No', 'Yes']

print('Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_knn), index=labels, columns=labels)

Confusion Matrix


Unnamed: 0,No,Yes
No,63,108
Yes,35,196


Classification Report

In [17]:
print('Classification Report')
print(classification_report(y_test, pred_knn, target_names=['No', 'Yes']))

Classification Report
              precision    recall  f1-score   support

          No       0.64      0.37      0.47       171
         Yes       0.64      0.85      0.73       231

    accuracy                           0.64       402
   macro avg       0.64      0.61      0.60       402
weighted avg       0.64      0.64      0.62       402



### Tuning Parameters to Improve KNN Model

Let's use different values of k and compare the results:

#### Train

In [18]:
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_20 = KNeighborsClassifier(n_neighbors=20)

In [19]:
knn_3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [20]:
knn_5.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [21]:
knn_20.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

#### Predict

In [22]:
pred_knn_3 = knn_3.predict(X_test)
pred_knn_5 = knn_5.predict(X_test)
pred_knn_20 = knn_20.predict(X_test)

#### Test

Confusion Matrix

In [23]:
labels = ['No', 'Yes']

pred = {'k=3':pred_knn_3, 'k=5':pred_knn_5, 'k=20':pred_knn_20}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

k=3
     No  Yes
No   58  113
Yes  39  192


k=5
     No  Yes
No   63  108
Yes  35  196


k=20
     No  Yes
No   75   96
Yes  45  186




Classification Report

In [24]:
pred = {'k=3':pred_knn_3, 'k=5':pred_knn_5, 'k=20':pred_knn_20}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

k=3
              precision    recall  f1-score   support

          No       0.60      0.34      0.43       171
         Yes       0.63      0.83      0.72       231

    accuracy                           0.62       402
   macro avg       0.61      0.59      0.57       402
weighted avg       0.62      0.62      0.60       402



k=5
              precision    recall  f1-score   support

          No       0.64      0.37      0.47       171
         Yes       0.64      0.85      0.73       231

    accuracy                           0.64       402
   macro avg       0.64      0.61      0.60       402
weighted avg       0.64      0.64      0.62       402



k=20
              precision    recall  f1-score   support

          No       0.62      0.44      0.52       171
         Yes       0.66      0.81      0.73       231

    accuracy                           0.65       402
   macro avg       0.64      0.62      0.62       402
weighted avg       0.64      0.65      0.64       402





## Decision Trees Model

### Training

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
tree = DecisionTreeClassifier(random_state=50)

In [27]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=50, splitter='best')

### Predicting

In [28]:
pred_tree = tree.predict(X_test)

### Testing

Confusion Matrix

In [29]:
labels = ['No', 'Yes']

print('Decision Tree - Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_tree), index=labels, columns=labels)

Decision Tree - Confusion Matrix


Unnamed: 0,No,Yes
No,131,40
Yes,24,207


Classification Report

In [30]:
print('Decision Tree - Classification Report')
print(classification_report(y_test, pred_tree, target_names=['No', 'Yes']))

Decision Tree - Classification Report
              precision    recall  f1-score   support

          No       0.85      0.77      0.80       171
         Yes       0.84      0.90      0.87       231

    accuracy                           0.84       402
   macro avg       0.84      0.83      0.83       402
weighted avg       0.84      0.84      0.84       402



### Tuning parameters to Improve Decision Trees Model

#### Train

In [31]:
# max_depth=2 and min_samples_leaf=5
tree_1 = DecisionTreeClassifier(max_depth=2, min_samples_leaf=5, random_state=50)

# max_depth=5 and min_samples_leaf=4
tree_2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=4, random_state=50)

# max_depth=7 and min_samples_leaf=3
tree_3 = DecisionTreeClassifier(max_depth=7, min_samples_leaf=3, random_state=50)

In [32]:
tree_1.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=50, splitter='best')

In [33]:
tree_2.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=50, splitter='best')

In [34]:
tree_3.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=50, splitter='best')

#### Predict

In [35]:
pred_tree_1 = tree_1.predict(X_test)
pred_tree_2 = tree_2.predict(X_test)
pred_tree_3 = tree_3.predict(X_test)

#### Test

Confusion Matrix

In [36]:
labels = ['No', 'Yes']

pred = {'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

max_depth=2; min_samples_leaf=5
     No  Yes
No   31  140
Yes   5  226


max_depth=5; min_samples_leaf=4
      No  Yes
No   135   36
Yes   52  179


max_depth=7; min_samples_leaf=3
     No  Yes
No   91   80
Yes  28  203




Classification Report

In [37]:
pred = {'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

max_depth=2; min_samples_leaf=5
              precision    recall  f1-score   support

          No       0.86      0.18      0.30       171
         Yes       0.62      0.98      0.76       231

    accuracy                           0.64       402
   macro avg       0.74      0.58      0.53       402
weighted avg       0.72      0.64      0.56       402



max_depth=5; min_samples_leaf=4
              precision    recall  f1-score   support

          No       0.72      0.79      0.75       171
         Yes       0.83      0.77      0.80       231

    accuracy                           0.78       402
   macro avg       0.78      0.78      0.78       402
weighted avg       0.79      0.78      0.78       402



max_depth=7; min_samples_leaf=3
              precision    recall  f1-score   support

          No       0.76      0.53      0.63       171
         Yes       0.72      0.88      0.79       231

    accuracy                           0.73       402
   macro avg       0.74     

## Random Forests Model

### Training

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
# Our forest consists of 100 trees in this example
forest = RandomForestClassifier(n_estimators=100, random_state=23)

In [40]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)

### Predicting

In [41]:
pred_forest = forest.predict(X_test)

### Testing

Confusion Matrix

In [42]:
labels = ['No', 'Yes']

print('Random Forest - Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_forest), index=labels, columns=labels)

Random Forest - Confusion Matrix


Unnamed: 0,No,Yes
No,125,46
Yes,33,198


Classification Report

In [43]:
print('Random Forest - Classification Report')
print(classification_report(y_test, pred_forest, target_names=['No', 'Yes']))

Random Forest - Classification Report
              precision    recall  f1-score   support

          No       0.79      0.73      0.76       171
         Yes       0.81      0.86      0.83       231

    accuracy                           0.80       402
   macro avg       0.80      0.79      0.80       402
weighted avg       0.80      0.80      0.80       402



### Tuning parameters to Improve Random Forests Model

#### Train

In [44]:
# 5 trees in forest
forest_1 = RandomForestClassifier(n_estimators=5, random_state=23)

# 20 trees in forest
forest_2 = RandomForestClassifier(n_estimators=20, random_state=23)

# 100 trees in forest
forest_3 = RandomForestClassifier(n_estimators=100, random_state=23)

In [45]:
forest_1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)

In [46]:
forest_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)

In [47]:
forest_3.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=23, verbose=0,
                       warm_start=False)

#### Predict

In [48]:
pred_forest_1 = forest_1.predict(X_test)
pred_forest_2 = forest_2.predict(X_test)
pred_forest_3 = forest_3.predict(X_test)

#### Testing

Confusion Matrix

In [49]:
labels = ['No', 'Yes']

pred = {'trees = 5':pred_forest_1, 
        'trees = 20':pred_forest_2, 
        'trees = 100':pred_forest_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

trees = 5
      No  Yes
No   105   66
Yes   51  180


trees = 20
      No  Yes
No   124   47
Yes   31  200


trees = 100
      No  Yes
No   125   46
Yes   33  198




Classification Report

In [50]:
pred = {'trees = 5':pred_forest_1, 
        'trees = 20':pred_forest_2, 
        'trees = 100':pred_forest_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

trees = 5
              precision    recall  f1-score   support

          No       0.67      0.61      0.64       171
         Yes       0.73      0.78      0.75       231

    accuracy                           0.71       402
   macro avg       0.70      0.70      0.70       402
weighted avg       0.71      0.71      0.71       402



trees = 20
              precision    recall  f1-score   support

          No       0.80      0.73      0.76       171
         Yes       0.81      0.87      0.84       231

    accuracy                           0.81       402
   macro avg       0.80      0.80      0.80       402
weighted avg       0.81      0.81      0.80       402



trees = 100
              precision    recall  f1-score   support

          No       0.79      0.73      0.76       171
         Yes       0.81      0.86      0.83       231

    accuracy                           0.80       402
   macro avg       0.80      0.79      0.80       402
weighted avg       0.80      0.80    

## Support Vector Machines Models

### Training

In [51]:
from sklearn.svm import SVC

In [52]:
svm = SVC(random_state=23)

In [53]:
svm.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=23,
    shrinking=True, tol=0.001, verbose=False)

### Predicting

In [54]:
pred_svm = svm.predict(X_test)

### Testing

Confusion Matrix

In [55]:
labels = ['No', 'Yes']

print('SVM - Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_svm), index=labels, columns=labels)

SVM - Confusion Matrix


Unnamed: 0,No,Yes
No,55,116
Yes,19,212


Classification Report

In [56]:
print('SVM - Classification Report')
print(classification_report(y_test, pred_svm, target_names=['No', 'Yes']))

SVM - Classification Report
              precision    recall  f1-score   support

          No       0.74      0.32      0.45       171
         Yes       0.65      0.92      0.76       231

    accuracy                           0.66       402
   macro avg       0.69      0.62      0.60       402
weighted avg       0.69      0.66      0.63       402



### Tuning parameters to Improve SVM Model

#### Train

In [57]:
# kernel=linear, C=1, gamma=0.1
svm_1 = SVC(kernel='linear', C=1, gamma=0.1, random_state=23)

# kernel=rbf, C=10, gamma=0.01
svm_2 = SVC(kernel='rbf', C=10, gamma=0.01, random_state=23)

# kernel=poly, C=100, gamma=1
svm_3 = SVC(kernel='poly', C=100, gamma=1, random_state=23)

In [58]:
svm_1.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
    max_iter=-1, probability=False, random_state=23, shrinking=True, tol=0.001,
    verbose=False)

In [59]:
svm_2.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=23, shrinking=True, tol=0.001,
    verbose=False)

In [60]:
svm_3.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='poly',
    max_iter=-1, probability=False, random_state=23, shrinking=True, tol=0.001,
    verbose=False)

#### Predict

In [61]:
pred_svm_1 = svm_1.predict(X_test)
pred_svm_2 = svm_2.predict(X_test)
pred_svm_3 = svm_3.predict(X_test)

#### Test

Confusion Matrix

In [62]:
labels = ['No', 'Yes']

pred = {'kernel=linear, C=1, gamma=0.1':pred_svm_1, 
        'kernel=rbf, C=10, gamma=0.01':pred_svm_2, 
        'kernel=poly, C=100, gamma=1':pred_svm_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

kernel=linear, C=1, gamma=0.1
      No  Yes
No   118   53
Yes   54  177


kernel=rbf, C=10, gamma=0.01
     No  Yes
No   27  144
Yes  19  212


kernel=poly, C=100, gamma=1
     No  Yes
No   56  115
Yes  30  201




Classification Report

In [63]:
pred = {'kernel=linear, C=1, gamma=0.1':pred_svm_1, 
        'kernel=rbf, C=10, gamma=0.01':pred_svm_2, 
        'kernel=poly, C=100, gamma=1':pred_svm_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

kernel=linear, C=1, gamma=0.1
              precision    recall  f1-score   support

          No       0.69      0.69      0.69       171
         Yes       0.77      0.77      0.77       231

    accuracy                           0.73       402
   macro avg       0.73      0.73      0.73       402
weighted avg       0.73      0.73      0.73       402



kernel=rbf, C=10, gamma=0.01
              precision    recall  f1-score   support

          No       0.59      0.16      0.25       171
         Yes       0.60      0.92      0.72       231

    accuracy                           0.59       402
   macro avg       0.59      0.54      0.49       402
weighted avg       0.59      0.59      0.52       402



kernel=poly, C=100, gamma=1
              precision    recall  f1-score   support

          No       0.65      0.33      0.44       171
         Yes       0.64      0.87      0.73       231

    accuracy                           0.64       402
   macro avg       0.64      0.60    

## Logistic Regression Model

### Training

In [64]:
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression()
lm.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
lm.intercept_[0]
coeff_df = pd.DataFrame(lm.coef_.T,X_transformed.columns,columns=['Coefficient'])

### Predict

In [66]:
pred_lm = lm.predict(X_test)

### Testing

In [67]:
from sklearn.metrics import confusion_matrix
labels = ['No', 'Yes']

print('Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_lm), index=labels, columns=labels)

Confusion Matrix


Unnamed: 0,No,Yes
No,120,51
Yes,56,175


### Classification

In [68]:
from sklearn.metrics import classification_report
print('Classification Report')
print(classification_report(y_test, pred_lm, target_names=['No', 'Yes']))

Classification Report
              precision    recall  f1-score   support

          No       0.68      0.70      0.69       171
         Yes       0.77      0.76      0.77       231

    accuracy                           0.73       402
   macro avg       0.73      0.73      0.73       402
weighted avg       0.73      0.73      0.73       402



# Conclusion

## Decision Tree is the best model according to F1 Score.

### Creating a combined dataframe

In [69]:
sql_query_claims = "Select * from claims_data"
sql_query_test = "Select * from test_set"

df_claims = pd.read_sql_query(sql_query_claims, connection)
df_test = pd.read_sql_query(sql_query_test, connection)

df_claims_no_labels = df_claims.drop(['insurance_claim', 'claim_amount'], axis=1)
df_test_no_index = df_test.drop(['row_id'], axis=1)

df_combined = pd.concat([df_claims_no_labels, df_test_no_index],sort=False).reset_index(drop=True)

### Pre-processing

In [70]:
#y-label
y = df_claims['insurance_claim']

#x-label
X = df_claims.drop(['insurance_claim', 'claim_amount'], axis=1)

#retaining the index row
r = df_test['row_id']

In [71]:
#transforming features

combined_transformed = pd.get_dummies(df_combined, drop_first=True)

X_transformed = combined_transformed.iloc[:len(y), :]
test_transformed = combined_transformed.iloc[len(y):, :]

### Training using the Decision Tree

In [72]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=47)

In [73]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=50)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=50, splitter='best')

### Predicting

In [74]:
pred_tree = tree.predict(X_test)

### Testing using the Decision Tree

In [75]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

labels = ['No', 'Yes']

In [76]:
#Confusion Matrix
print('Decision Tree Confusion Matrix')
pd.DataFrame(data=confusion_matrix(y_test, pred_tree), index=labels, columns=labels)

Decision Tree Confusion Matrix


Unnamed: 0,No,Yes
No,80,22
Yes,13,153


In [77]:
from sklearn.metrics import classification_report

#Classification report
print('Decision Tree - Classification Report')
print(classification_report(y_test, pred_tree, target_names=['No', 'Yes']))

Decision Tree - Classification Report
              precision    recall  f1-score   support

          No       0.86      0.78      0.82       102
         Yes       0.87      0.92      0.90       166

    accuracy                           0.87       268
   macro avg       0.87      0.85      0.86       268
weighted avg       0.87      0.87      0.87       268



### Tuning Parameters

In [78]:
tree_0 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, random_state=50)

tree_1 = DecisionTreeClassifier(max_depth=2, min_samples_leaf=5, random_state=50)

tree_2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=4, random_state=50)

tree_3 = DecisionTreeClassifier(max_depth=7, min_samples_leaf=3, random_state=50)

tree_0.fit(X_train, y_train)
tree_1.fit(X_train, y_train)
tree_2.fit(X_train, y_train)
tree_3.fit(X_train, y_train)

pred_tree_0 = tree_1.predict(X_test)
pred_tree_1 = tree_1.predict(X_test)
pred_tree_2 = tree_2.predict(X_test)
pred_tree_3 = tree_3.predict(X_test)

pred = {'max_depth=5; min_samples_leaf=5':pred_tree_0, 
        'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(pd.DataFrame(data=confusion_matrix(y_test, v), index=labels, columns=labels))
    print('\n')

pred = {'max_depth=5; min_samples_leaf=5':pred_tree_0, 
        'max_depth=2; min_samples_leaf=5':pred_tree_1, 
        'max_depth=5; min_samples_leaf=4':pred_tree_2, 
        'max_depth=7; min_samples_leaf=3':pred_tree_3}

for k,v in pred.items():
    print('{}'.format(k))
    print(classification_report(y_test, v, target_names=['No', 'Yes']))
    print('\n')

max_depth=5; min_samples_leaf=5
     No  Yes
No   19   83
Yes   6  160


max_depth=2; min_samples_leaf=5
     No  Yes
No   19   83
Yes   6  160


max_depth=5; min_samples_leaf=4
     No  Yes
No   78   24
Yes  34  132


max_depth=7; min_samples_leaf=3
     No  Yes
No   53   49
Yes  18  148


max_depth=5; min_samples_leaf=5
              precision    recall  f1-score   support

          No       0.76      0.19      0.30       102
         Yes       0.66      0.96      0.78       166

    accuracy                           0.67       268
   macro avg       0.71      0.58      0.54       268
weighted avg       0.70      0.67      0.60       268



max_depth=2; min_samples_leaf=5
              precision    recall  f1-score   support

          No       0.76      0.19      0.30       102
         Yes       0.66      0.96      0.78       166

    accuracy                           0.67       268
   macro avg       0.71      0.58      0.54       268
weighted avg       0.70      0.67      0.60

In [79]:
test_predict_tree = tree.predict(test_transformed)
test_predict_tree

array(['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes',
       'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'no',
       'yes', 'yes', 'no', 'yes', 'yes', 'no', 'yes', 'no', 'no', 'yes',
       'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes',
       'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no',
       'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes',
       'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', '

In [81]:
type(r)
type(test_predict_tree)

numpy.ndarray

In [83]:
FinalModel = pd.DataFrame({'row_id':r.values,'prediction': test_predict_tree})
FinalModel.head(50)

Unnamed: 0,row_id,prediction
0,1,yes
1,2,yes
2,3,yes
3,4,yes
4,5,yes
5,6,yes
6,7,yes
7,8,yes
8,9,yes
9,10,yes


## Sending The Database to SQL

In [84]:
FinalModel.to_sql('classification_model_predict', con=connection, if_exists='append', index=True,method='multi',chunksize=500)

In [85]:
#Review tables in Database
connection.table_names()

['claims_data', 'classification_model_predict', 'football_players', 'test_set']