In [3]:
import pandas as pd
import numpy as np
import itertools 
import warnings
from sklearn.model_selection import KFold
import sklearn.metrics as metric
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from scipy.io import arff
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression

In [2]:
data = arff.loadarff('TimeBasedFeatures-Dataset-15s-VPN.arff')
df = pd.DataFrame(data[0])
df = shuffle(df)
df.head()
y =df['class1'].replace({b'VPN': 1,b'Non-VPN': 0})
X = df.drop(["class1"],axis=1)
X_A, X_B, y_A, y_B = train_test_split(X,y, test_size=0.7, random_state=3)
kfold= KFold(n_splits=10)

In [7]:
clf1 = LogisticRegression()
import warnings
warnings.filterwarnings('ignore')
c=[x*0.1 for x in range(1, 10)]
c.append(0.01)
params={ 'C':c,'penalty': ['l1', 'l2', 'elasticnet','none'],
        'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
         'class_weight': [{0: 0.01}, {1: 1},'none','balanced']}
from sklearn.model_selection import RandomizedSearchCV
for train_index, val_index in kfold.split(X_A):
        output_val=[]
        output_train=[]

        X_train, X_val = X_A.iloc[train_index], X_A.iloc[val_index]
        y_train, y_val = y_A.iloc[train_index], y_A.iloc[val_index]  

        randomclf = RandomizedSearchCV(estimator=clf1, param_distributions=params,
                               n_iter=len(params), cv=4, verbose=0, random_state=42,
                               n_jobs=1)

        search=randomclf.fit(X_train, y_train)
        print(search.best_params_)

{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'liblinear', 'penalty': 'l1', 'class_weight': {1: 1}, 'C': 0.2}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}
{'solver': 'newton-cg', 'penalty': 'none', 'class_weight': {1: 1}, 'C': 0.6000000000000001}


In [8]:
cf = LogisticRegression(solver= 'newton-cg', penalty = 'none', class_weight= {1: 1}, C= 0.6000000000000001)

In [9]:
X_A, X_B, y_A, y_B = train_test_split(X,y, test_size=0.7, random_state=3)
kfold= KFold(n_splits=10)
AccuracyValidation = []
AccuracyTraining = []
precisionValidation = []
precisionTraining = []
recallValidation = []
recallTraining = []
f1scoreValidation = []
f1scoreTraining = []
sensitivityValidation = []
sensitivityTraining = []
specificityValidation = []
specificityTraining = []

for train_index, val_index in kfold.split(X_A):
        output_val=[]
        output_train=[]

        X_train, X_val = X_A.iloc[train_index], X_A.iloc[val_index]
        y_train, y_val = y_A.iloc[train_index], y_A.iloc[val_index]  
        cf.fit(X_train,y_train)
        output_train =  cf.predict(X_train)
        output_val =  cf.predict(X_val)
        
        AccuracyTraining.append((accuracy_score(y_train,output_train)))
        AccuracyValidation.append((accuracy_score(y_val,output_val)))
        precisionTraining.append(precision_score(y_train,output_train))
        precisionValidation.append(precision_score(y_val,output_val))
        recallTraining.append(recall_score(y_train,output_train))
        recallValidation.append(recall_score(y_val,output_val))
        cmVal = confusion_matrix(y_val,output_val)
        cmTrain= confusion_matrix(y_train,output_train)
        sensitivityValidation.append(cmVal[0,0]/(cmVal[0,0]+cmVal[0,1]))
        sensitivityTraining.append(cmTrain[0,0]/(cmTrain[0,0]+cmTrain[0,1]))
        specificityValidation.append( cmVal[1,1]/(cmVal[1,0]+cmVal[1,1]))
        specificityTraining.append( cmTrain[1,1]/(cmTrain[1,0]+cmTrain[1,1]))
        f1scoreValidation.append( (2 * precision_score(y_val,output_val) * recall_score(y_val,output_val)) / (precision_score(y_val,output_val) + recall_score(y_val,output_val)))
        f1scoreTraining.append( (2 * precision_score(y_train,output_train) * recall_score(y_train,output_train)) / (precision_score(y_train,output_train) + recall_score(y_train,output_train)))

print("Accuracy of training for all folds:{}%".format((sum(AccuracyTraining)/10)))
print("Accuracy of validation for all folds:{}%".format((sum(AccuracyValidation)/10)))

print("Precision of training for all folds:{}%".format((sum(precisionTraining)/10)))
print("Precision of validation for all folds:{}%".format((sum(precisionValidation)/10)))

print("Recall of training for all folds:{}%".format((sum(recallTraining)/10)))
print("Recall of validation for all folds:{}%".format((sum(recallValidation)/10)))

print("sensitivity of training for all folds:{}%".format((sum(sensitivityTraining)/10)))
print("sensitivity of validation for all folds:{}%".format((sum(sensitivityValidation)/10)))

print("specificity of training for all folds:{}%".format((sum(specificityTraining)/10)))
print("specificity of validation for all folds:{}%".format((sum(specificityValidation)/10)))

print("F1 Score of training for all folds:{}%".format((sum(f1scoreTraining)/10)))
print("F1 Score of validation for all folds:{}%".format((sum(f1scoreValidation)/10)))

output_test =  cf.predict(X_B)

accuracy_test = accuracy_score(y_B,output_test)
precison_test = precision_score(y_B,output_test)
recall_test = recall_score(y_B,output_test)
cmTest = confusion_matrix(y_B,output_test)
sensitivityTest = cmTest[0,0]/(cmTest[0,0]+cmTest[0,1])
specificityTest = cmTest[1,1]/(cmTest[1,0]+cmTest[1,1])
f1_test = ( (2 * precison_test* recall_test) / (precison_test + recall_test))


Accuracy of training for all folds:0.6401872147080061%
Accuracy of validation for all folds:0.6339124416098304%
Precision of training for all folds:0.6256372575744614%
Precision of validation for all folds:0.6202628545428063%
Recall of training for all folds:0.7619150380070817%
Recall of validation for all folds:0.7587940761601126%
sensitivity of training for all folds:0.5091197264586225%
sensitivity of validation for all folds:0.4998490818066143%
specificity of training for all folds:0.7619150380070817%
specificity of validation for all folds:0.7587940761601126%
F1 Score of training for all folds:0.6870049191682733%
F1 Score of validation for all folds:0.6822874278607778%


In [10]:
table_f1 = pd.DataFrame({'Training f1': f1scoreTraining, 'Validation f1 ':  f1scoreValidation})
table_f1

Unnamed: 0,Training f1,Validation f1
0,0.691312,0.664625
1,0.679088,0.690852
2,0.679081,0.670807
3,0.692438,0.670866
4,0.691996,0.697108
5,0.682258,0.660606
6,0.688417,0.686709
7,0.690102,0.689864
8,0.687942,0.681957
9,0.687415,0.70948


In [11]:
table_precision= pd.DataFrame({'Training precision': precisionTraining, 'Validation precision ':  precisionValidation})
table_precision

Unnamed: 0,Training precision,Validation precision
0,0.626041,0.594521
1,0.625401,0.640351
2,0.62512,0.613636
3,0.624847,0.612069
4,0.623175,0.612299
5,0.630689,0.615819
6,0.631612,0.618234
7,0.624652,0.621253
8,0.626453,0.635328
9,0.618381,0.639118


In [12]:
table_Recall= pd.DataFrame({'Training Recall': recallTraining, 'Validation Recall':  recallValidation})
table_Recall

Unnamed: 0,Training Recall,Validation Recall
0,0.771776,0.753472
1,0.742857,0.75
2,0.743238,0.739726
3,0.776426,0.74216
4,0.777904,0.809187
5,0.74301,0.712418
6,0.756449,0.772242
7,0.770873,0.77551
8,0.762816,0.735974
9,0.7738,0.797251


In [13]:
table_Sensitivity= pd.DataFrame({'Training Sensitivity': sensitivityTraining, 'Validation Sensitivity':  sensitivityValidation})
table_Sensitivity

Unnamed: 0,Training Sensitivity,Validation Sensitivity
0,0.502259,0.461818
1,0.521115,0.546125
2,0.520295,0.498155
3,0.496302,0.51087
4,0.490123,0.482143
5,0.536894,0.470817
6,0.521005,0.524823
7,0.502457,0.481343
8,0.514892,0.505792
9,0.485855,0.516605


In [14]:
table_specificity= pd.DataFrame({'Training specificity': specificityTraining, 'Validation specificity':  specificityValidation})
table_specificity

Unnamed: 0,Training specificity,Validation specificity
0,0.771776,0.753472
1,0.742857,0.75
2,0.743238,0.739726
3,0.776426,0.74216
4,0.777904,0.809187
5,0.74301,0.712418
6,0.756449,0.772242
7,0.770873,0.77551
8,0.762816,0.735974
9,0.7738,0.797251


In [15]:
table_accuracy = pd.DataFrame({'Training Accuracy': AccuracyTraining, 'Validation Accuracy ':  AccuracyValidation})
table_accuracy

Unnamed: 0,Training Accuracy,Validation Accuracy
0,0.64218,0.611012
1,0.636058,0.651865
2,0.635861,0.623446
3,0.641785,0.628774
4,0.63981,0.646536
5,0.643167,0.602131
6,0.643562,0.648313
7,0.641461,0.635231
8,0.642843,0.629893
9,0.635143,0.661922


In [16]:
print(accuracy_test)
print(precison_test )
print(recall_test)
print(cmTest )
print(sensitivityTest )
print(specificityTest )
print(f1_test )


0.6237148732008225
0.6146191209572326
0.7545084351367074
[[3002 3253]
 [1688 5188]]
0.47993605115907273
0.7545084351367074
0.6774172488085134
