In [2]:
import geopy.distance
import pandas as pd
import os
import datetime

In [23]:
data_in_out = pd.read_csv('../ionosphere_dataset/NOAA/NOAA_datasets_for_ML/datasets_LR_model/flattened_ds_sondes_in_out_7days.csv')
data_before_after = pd.read_csv('../ionosphere_dataset/NOAA/NOAA_datasets_for_ML/datasets_LR_model/flattened_ds_y_ago_y_after_7days.csv')

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

def prepare_dataset(df):
    ds_data = df.drop(columns = ['res']).to_numpy()
    ds_target = df['res'].to_numpy()
    x_train, x_test, y_train, y_test = train_test_split(ds_data, ds_target, test_size=0.2, random_state=42)
    sc = StandardScaler()
    X_train = sc.fit_transform(x_train)
    X_test = sc.transform(x_test)
    return X_train, X_test, y_train, y_test

def calculate_metrix(x_test ,y_test, model, average='weighted'):
    print('==================================')
    y_pred = model.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('TN=', tn, '\n',
          'FP=', fp, '\n',
          'FN=', fn, '\n',
          'TP=', tp, '\n')
    
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=average)
    print('precision=', p, '\n',
          'recall=', r, '\n',
          'F score=', f1, '\n')

def compare_models(df):
    x_train, x_test, y_train, y_test = prepare_dataset(df)
    
    # model 1
    model_log_reg= LogisticRegression(max_iter=10000, tol=0.1)
    acc_log_reg = []

    # model 2
    pca = PCA(n_components=64)
    logistic = LogisticRegression( max_iter=10000,tol=0.1)
    pipe_pca = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
    acc_pca = []

    # model 3
    model_log_reg_l2 = LogisticRegression(penalty='l2', C=100, solver='liblinear',  max_iter=10000, tol=0.1)
    acc_l2 = []
    
    # cross-fold validation
    
    kf = KFold(n_splits=10)
    kf.get_n_splits(x_train)
    for train_index, test_index in kf.split(x_train):
        X_train, X_test = x_train[train_index], x_train[test_index]
        Y_train, Y_test = y_train[train_index], y_train[test_index]
        model_log_reg.fit(X_train, Y_train)
        pipe_pca.fit(X_train, Y_train)
        model_log_reg_l2.fit(X_train, Y_train)
        acc_log_reg.append(model_log_reg.score(X_test, Y_test))
        acc_pca.append(pipe_pca.score(X_test, Y_test))
        acc_l2.append(model_log_reg_l2.score(X_test, Y_test))
    
    print('Log reg test acc= ', accuracy_score(y_test, model_log_reg.predict(x_test)))
    print('PCA test acc = ', accuracy_score(y_test, pipe_pca.predict(x_test)))
    print('L2 test acc = ', accuracy_score(y_test, model_log_reg_l2.predict(x_test)))
    print('\n')
    print('Log reg test f1= ', f1_score(y_test, model_log_reg.predict(x_test), average='weighted'))
    print('PCA test f1 = ', f1_score(y_test, pipe_pca.predict(x_test), average='weighted'))
    print('L2 test f1 = ', f1_score(y_test, model_log_reg_l2.predict(x_test), average='weighted'))
    
    print('\n')
    
    print('Model Log reg')
    calculate_metrix(x_test, y_test, model_log_reg)
    print('\n')
    print('Model PCA + Log reg')
    calculate_metrix(x_test, y_test, pipe_pca)
    print('\n')
    print('Model Log reg + L2')
    calculate_metrix(x_test, y_test, model_log_reg_l2)
    
    return acc_log_reg, acc_pca, acc_l2


In [20]:
from scipy.stats import ttest_ind
import numpy as np
from scipy import stats

def compare_samples_t_test(s1, s2):
    m1_mean = np.mean(s1)
    m2_mean = np.mean(s2)
    print("model1 mean value:",m1_mean)
    print("model2 mean value:",m2_mean)
    m1_std = np.std(s1)
    m2_std = np.std(s2)
    print("model1 std value:", m1_std)
    print("model2 std value:", m2_std)
    print('\n')
    ttest,pval = ttest_ind(s1, s2)
    # ttest,pval = stats.ttest_rel(test_acc,test_acc_pca)
    print("p-value",pval)
    if pval < 0.05:
        print("we reject null hypothesis")
    else:
        print("we accept null hypothesis")

In [24]:
acc_log_reg1, acc_pca1, acc_l2_1 = compare_models(data_before_after)

Log reg test acc=  0.7716701902748414
PCA test acc =  0.828752642706131
L2 test acc =  0.7801268498942917


Log reg test f1=  0.774305872897615
PCA test f1 =  0.8238797148693985
L2 test f1 =  0.7814530495890912


Model Log reg
TN= 274 
 FP= 60 
 FN= 48 
 TP= 91 

precision= 0.7779691163736858 
 recall= 0.7716701902748414 
 F score= 0.774305872897615 



Model PCA + Log reg
TN= 305 
 FP= 29 
 FN= 52 
 TP= 87 

precision= 0.8236789430359881 
 recall= 0.828752642706131 
 F score= 0.8238797148693985 



Model Log reg + L2
TN= 279 
 FP= 55 
 FN= 49 
 TP= 90 

precision= 0.7830433839440822 
 recall= 0.7801268498942917 
 F score= 0.7814530495890912 



In [21]:
compare_samples_t_test(acc_log_reg1, acc_pca1)

model1 mean value: 0.8092795593368237
model2 mean value: 0.8426347076788832
model1 std value: 0.027943270603251302
model2 std value: 0.02061824336024713


p-value 0.009933816672920258
we reject null hypothesis


In [22]:
acc_log_reg2, acc_pca2, acc_l2_2 =  compare_models(data_in_out)
compare_samples_t_test(acc_log_reg1, acc_pca1)

Log reg test acc=  0.8767772511848341
PCA test acc =  0.8957345971563981
L2 test acc =  0.8483412322274881


Log reg test f1=  0.877727652887954
PCA test f1 =  0.8948147386916852
L2 test f1 =  0.8500361327246394


Model Log reg
TN= 55 
 FP= 11 
 FN= 15 
 TP= 130 

precision= 0.8793605977230058 
 recall= 0.8767772511848341 
 F score= 0.877727652887954 



Model PCA + Log reg
TN= 53 
 FP= 13 
 FN= 9 
 TP= 136 

precision= 0.8946367209824657 
 recall= 0.8957345971563981 
 F score= 0.8948147386916852 



Model Log reg + L2
TN= 53 
 FP= 13 
 FN= 19 
 TP= 126 

precision= 0.8531856978871879 
 recall= 0.8483412322274881 
 F score= 0.8500361327246394 

model1 mean value: 0.8092795593368237
model2 mean value: 0.8426347076788832
model1 std value: 0.027943270603251302
model2 std value: 0.02061824336024713


p-value 0.009933816672920258
we reject null hypothesis
