# Dataset

In [181]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

warnings.filterwarnings('always')

In [182]:
# Read the fully prepared dataframe written by the data_prep step.
df_fulldata = pd.read_csv(filepath_or_buffer='diabetic_data_df.csv')

In [183]:
# Read the list of column names chosen in data_prep (stored in the
# 'col2use' column of col2use.csv) and convert it to a plain Python list.
col2use = pd.read_csv('col2use.csv')['col2use'].tolist()

# Restrict the full dataframe to the selected columns only.
df_data = df_fulldata[col2use]

In [184]:
# Preview the first five rows of the selected columns.
df_data.head(n=5)

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,med_spec_InternalMedicine,med_spec_Nephrology,med_spec_Orthopedics,med_spec_Orthopedics-Reconstructive,med_spec_Other,med_spec_Radiologist,med_spec_Surgery-General,med_spec_UNK,age_group,has_weight
0,1,41,0,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
1,3,59,0,18,0,0,0,9,0,1,...,0,0,0,0,0,0,0,1,10,0
2,2,11,5,13,2,0,1,6,0,0,...,0,0,0,0,0,0,0,1,20,0
3,2,44,1,16,0,0,0,7,0,1,...,0,0,0,0,0,0,0,1,30,0
4,1,51,0,8,0,0,0,5,0,1,...,0,0,0,0,0,0,0,1,40,0


In [185]:
# Randomly permute all rows (fixed seed) and renumber the index from 0.
# NOTE: only df_data is permuted here; df_fulldata keeps its original row
# order, so after this cell the two frames are no longer row-aligned.
df_data = df_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [186]:
# Split the dataset
# -> stratified split
# -> 80% train
#    -> k-fold (k=4)
# -> 20% test
#    -> confusion matrix
#    -> accuracy
#    -> sensitivity (= recall)

# Features and labels are both taken from df_fulldata so that X[i] and Y[i]
# always describe the same sample. df_data was shuffled and re-indexed in an
# earlier cell, so pairing df_data.values with labels read from df_fulldata
# (original row order) would attach labels to the wrong feature rows.
# No separate pre-shuffle is needed: train_test_split shuffles below.
X = df_fulldata[col2use].values          # input features
Y = df_fulldata['OUTPUT_LABEL'].values   # target labels (0 = 'no', 1 = 'yes')

# Row positions of each class.
Y_o_no  = np.where(Y == 0)[0]
Y_o_yes = np.where(Y == 1)[0]

class_names = {1:'yes', 0:'no'}
names = ['no', 'yes']

print ('==> Size classes:')
for c in np.unique(Y):
    # Built-in float instead of the removed np.float alias; keeps the
    # printed count in the same "88029.0" form as before.
    s = float(np.sum(Y == c))
    print ('\t{:4} [{:1}] => {:5} => {:3}'.format(class_names[c], c, s, round(s/len(Y), 3)))

# Split dataset: train and test. Stratified on Y so both parts keep the class
# ratio; random_state fixes the split so results are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, stratify=Y, random_state=42)

print ('==> Dataset split:')
print ('\ttrain => input={} | output={}'.format(X_train.shape, Y_train.shape))
print ('\ttest  => input={} | output={}'.format(X_test.shape, Y_test.shape))


==> Size classes:
	no   [0] => 88029.0 => 0.886
	yes  [1] => 11314.0 => 0.114
==> Dataset split:
	train => input=(79474, 143) | output=(79474,)
	test  => input=(19869, 143) | output=(19869,)


# Methods

In [187]:
from sklearn.metrics import classification_report, accuracy_score, \
                            average_precision_score, f1_score, precision_score,\
                            recall_score, roc_auc_score, log_loss

def report(y_true, y_pred, prefix, y_score=None):
    """Print a fixed set of binary-classification metrics, one per line.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels.
    prefix : str
        Text prepended to every printed line (used for indentation).
    y_score : array-like, optional
        Continuous scores for the positive class (for example the output of
        an estimator's ``decision_function``). When given, the ranking
        metrics (average precision and ROC-AUC) are computed from these
        scores. When omitted, they fall back to ``y_pred`` exactly as before,
        which collapses each curve to a single operating point.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Ranking metrics are only informative on continuous scores; keep the old
    # behaviour (hard labels) when the caller does not provide any.
    ranking_input = y_pred if y_score is None else y_score

    accuracy = accuracy_score(y_true, y_pred)
    ap      = average_precision_score(y_true, ranking_input)
    f1      = f1_score(y_true, y_pred)
    # NOTE(review): log_loss is defined on probabilities; fed with hard labels
    # it carries little more than the error rate. Kept as-is for
    # backward-compatible output.
    lloss   = log_loss(y_true, y_pred)
    prec    = precision_score(y_true, y_pred)
    recall  = recall_score(y_true, y_pred)
    auc     = roc_auc_score(y_true, ranking_input)

    print ('{} Accuracy :{:3}'.format(prefix, accuracy))
    print ('{} AP:       {:3}'.format(prefix, ap))
    print ('{} F1-score :{:3}'.format(prefix, f1))
    print ('{} Log-Loss :{:3}'.format(prefix, lloss))
    print ('{} Precision:{:3}'.format(prefix, prec))
    print ('{} Recall   :{:3}'.format(prefix, recall))
    print ('{} AUC      :{:3}'.format(prefix, auc))

# Linear SVM

In [188]:
from sklearn.svm import LinearSVC
import pickle

# Effective LinearSVC configuration (values not passed explicitly below are
# assumed to be the library defaults -- confirm against the installed
# scikit-learn version):
# penalty     = l2
# loss        = squared_hinge  (plain 'hinge' is not supported with dual=False)
# dual        = False
# tol         = 1e-6
# C           = 1.0
# multi_class = ovr
# verbose     = 0
# random_state= 42
# max_iter    = 100000
#
# max_iter is documented as an integer, so pass an int literal rather than the
# float 1e5 (a float is rejected by scikit-learn's parameter validation).
# NOTE(review): the recorded run predicts only the majority class
# (recall 0, AUC 0.5); consider class_weight='balanced' and feature scaling.
linear_svm = LinearSVC(dual=False, tol=1e-6, random_state=42, max_iter=100000)

### training

In [190]:
# Re-apply the 'always' filter so metric warnings are shown for every fold
# (the warnings module is already imported at the top of the notebook).
warnings.filterwarnings('always')

# Seeded so the four stratified folds are identical on every run.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

# Make sure the output directory exists before the (slow) training starts,
# otherwise the first open() below fails after a full model fit.
os.makedirs('models/svm', exist_ok=True)

print ('==> training Linear SVM:')
for iteration, (itrain, ivalid) in enumerate(kfold.split(X_train, Y_train)):
    print ('\t\033[94m iteration {}\033[0m'.format(iteration))

    # Split the training set into this fold's train and validation parts.
    xi_train, xi_valid = X_train[itrain], X_train[ivalid]
    yi_train, yi_valid = Y_train[itrain], Y_train[ivalid]
    print ('\t\t train size: input= {} | output= {}'.format(xi_train.shape, yi_train.shape))
    print ('\t\t valid size: input= {} | output= {}'.format(xi_valid.shape, yi_valid.shape))

    # Train: the same estimator object is re-fitted on every fold.
    linear_svm.fit(xi_train, yi_train)

    # Predict on both parts to compare training and validation performance.
    y_train_predicted = linear_svm.predict(xi_train)
    y_valid_predicted = linear_svm.predict(xi_valid)

    # Report
    print ('\t\t report train:')
    report(y_true=yi_train, y_pred=y_train_predicted, prefix='\t\t\t')
    print ('\t\t report valid:')
    report(y_true=yi_valid, y_pred=y_valid_predicted, prefix='\t\t\t')

    # Persist the model fitted on this fold.
    with open('models/svm/linear_svm_{}'.format(iteration), 'wb') as f:
        pickle.dump(linear_svm, f)
    
    

==> training Linear SVM:
	[94m iteration 0[0m
		 train size: input= (59605, 143) | output= (59605,)
		 valid size: input= (19869, 143) | output= (19869,)
		 report train:
			 Accuracy :0.8861169364986159
			 AP:       0.11388306350138411
			 F1-score :0.0
			 Log-Loss :3.9333816654417206
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
		 report valid:
			 Accuracy :0.8861039810760482
			 AP:       0.11389601892395189
			 F1-score :0.0
			 Log-Loss :3.93382912988489
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
	[94m iteration 1[0m
		 train size: input= (59605, 143) | output= (59605,)
		 valid size: input= (19869, 143) | output= (19869,)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


		 report train:
			 Accuracy :0.8861169364986159
			 AP:       0.11388306350138411
			 F1-score :0.0
			 Log-Loss :3.93338166544172
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
		 report valid:
			 Accuracy :0.8861039810760482
			 AP:       0.11389601892395189
			 F1-score :0.0
			 Log-Loss :3.93382912988489
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
	[94m iteration 2[0m
		 train size: input= (59605, 143) | output= (59605,)
		 valid size: input= (19869, 143) | output= (19869,)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


		 report train:
			 Accuracy :0.8861169364986159
			 AP:       0.11388306350138411
			 F1-score :0.0
			 Log-Loss :3.93338166544172
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
		 report valid:
			 Accuracy :0.8861039810760482
			 AP:       0.11389601892395189
			 F1-score :0.0
			 Log-Loss :3.93382912988489
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
	[94m iteration 3[0m
		 train size: input= (59607, 143) | output= (59607,)
		 valid size: input= (19867, 143) | output= (19867,)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


		 report train:
			 Accuracy :0.8861039810760482
			 AP:       0.11389601892395189
			 F1-score :0.0
			 Log-Loss :3.93382912988489
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5
		 report valid:
			 Accuracy :0.8861428499521821
			 AP:       0.11385715004781799
			 F1-score :0.0
			 Log-Loss :3.93248664646338
			 Precision:0.0
			 Recall   :0.0
			 AUC      :0.5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### test

# Sigmoid SVM

### training

### test

# Poly SVM

### training

### test

# RBF SVM

### training

### test