# Testing the logistic regression using various real-world data sets.

## Author: Bojian Xu, bojianxu@ewu.edu

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logistic_regression as logic
import sys
sys.path.append('..')
from code_misc.utils import MyUtils

In [2]:
# Name of the directory that holds the CSV files of the data set under test.
data_set = 'ionosphere'

# Quick check of how a path inside the data set directory is assembled.
print('/'.join((data_set, 'hello')))

ionosphere/hello


In [3]:
# Load the train/test splits of the selected data set. The CSV files are read
# with header=None, i.e. every line is treated as a data row.
df_X_train = pd.read_csv(data_set+'/'+'X_train.csv', header=None)
df_y_train = pd.read_csv(data_set+'/'+'y_train.csv', header=None)
df_X_test = pd.read_csv(data_set+'/'+'X_test.csv', header=None)
df_y_test = pd.read_csv(data_set+'/'+'y_test.csv', header=None)

# Convert to numpy arrays, passing each array through MyUtils.normalize_0_1.
# The labels go through the same call (the test labels print as 0./1. in the
# per-sample report at the end of the notebook).
# NOTE(review): X_train and X_test are scaled independently here, before the
# joint rescaling below -- confirm that this per-split step is intended.
X_train = MyUtils.normalize_0_1(df_X_train.to_numpy())
y_train = MyUtils.normalize_0_1(df_y_train.to_numpy())
X_test = MyUtils.normalize_0_1(df_X_test.to_numpy())
y_test = MyUtils.normalize_0_1(df_y_test.to_numpy())

# Remember where the training rows end so the stacked matrix can be split again.
n_train = X_train.shape[0]

# Rescale the features of both splits together so they share one scaling:
# [-1,1] for ionosphere, [0,1] for every other data set. Without the fallback
# branch X_all would be undefined (NameError) for any non-ionosphere data set.
X_stacked = np.concatenate((X_train, X_test), axis=0)
if data_set == 'ionosphere':
    X_all = MyUtils.normalize_neg1_pos1(X_stacked)
else:
    X_all = MyUtils.normalize_0_1(X_stacked)

# Split the jointly scaled matrix back into its training and test rows.
X_train = X_all[:n_train]
X_test = X_all[n_train:]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(280, 34)
(280, 1)
(71, 34)
(71, 1)


In [4]:
#print(y_train[-10:])

In [5]:
# Build the model: instantiate LogisticRegression from the local
# logistic_regression module (imported as `logic`) with no constructor
# arguments. The hyper-parameters are supplied later, in the call to fit().
log = logic.LogisticRegression()

In [6]:
# Train the model. The settings for this run are collected in one place and
# unpacked into fit() as keyword arguments.
fit_settings = dict(
    lam=0.1,
    eta=0.01,
    iterations=10000,
    SGD=False,
    mini_batch_size=20,
    degree=3,
)
# Alternative run kept for reference:
#   lam=0, eta=0.1, iterations=50000, SGD=True, mini_batch_size=20, degree=3
log.fit(X_train, y_train, **fit_settings)

# Show the weight vector stored on the model after training.
print(log.w)

[[ 0.42981108]
 [ 0.42981108]
 [ 0.        ]
 ...
 [-0.00114915]
 [-0.00142066]
 [-0.00307851]]


In [7]:
# Report log.error(...) divided by the number of samples, for the training
# split first and then the held-out split.
# NOTE(review): the printed value is a ratio, not a percentage, assuming
# log.error returns a count of misclassified samples -- confirm.
for split_name, X_split, y_split in (
    ('training', X_train, y_train),
    ('validation', X_test, y_test),
):
    error_ratio = log.error(X_split, y_split)/X_split.shape[0]
    print('misclassfied percentage from ' + split_name + ': ', error_ratio)

misclassfied percentage from training:  0.31785714285714284
misclassfied percentage from validation:  0.36619718309859156


In [8]:
# Run the trained model on the held-out test features and show the raw
# outputs. The per-sample report further down prints these values as the
# predicted probability of the +1 class.
preds = log.predict(X_test)
print(preds)

[[1.        ]
 [0.9999881 ]
 [0.90501521]
 [0.98174357]
 [0.99995874]
 [1.        ]
 [0.99999978]
 [0.99993037]
 [0.9060508 ]
 [0.93344188]
 [0.99999109]
 [1.        ]
 [0.9770123 ]
 [1.        ]
 [1.        ]
 [0.41116895]
 [0.99999517]
 [0.9017764 ]
 [0.99877627]
 [0.99976421]
 [0.84948193]
 [0.61436632]
 [0.99999003]
 [0.99999997]
 [0.99998798]
 [0.99986372]
 [0.99989326]
 [0.99997583]
 [0.99999766]
 [0.99999871]
 [0.72463747]
 [0.99997569]
 [0.99839502]
 [1.        ]
 [1.        ]
 [1.        ]
 [0.99999996]
 [1.        ]
 [0.84263677]
 [0.99999998]
 [0.83965089]
 [1.        ]
 [0.92076586]
 [0.99979136]
 [0.9999712 ]
 [0.99709958]
 [0.99998689]
 [0.99998258]
 [0.99996524]
 [0.85482856]
 [0.99997705]
 [0.99998246]
 [0.98396321]
 [0.99999887]
 [0.9998857 ]
 [0.99999941]
 [0.96669597]
 [0.99929008]
 [0.99999546]
 [0.9999999 ]
 [0.99989333]
 [1.        ]
 [0.99786212]
 [0.99998655]
 [0.51573976]
 [0.99999894]
 [1.        ]
 [0.99984217]
 [1.        ]
 [0.98479323]
 [0.97763361]]


In [9]:
# Per-sample report on the test set: print each prediction next to its label
# and count how many samples are misclassified.
#
# Both the prediction and the label are compared against the 0.5 threshold.
# The previous check, np.sign(preds[i]-0.5) != y_test[i], produced -1/0/+1 on
# the left while the labels printed by this notebook are 0./1., so a correct
# negative prediction (-1 vs 0) was counted as an error. Thresholding both
# sides is correct for 0/1 labels and also for -1/+1 labels.
# A prediction of exactly 0.5 is treated as the negative class.
predicted_positive = np.ravel(preds) > 0.5
actual_positive = np.ravel(y_test) > 0.5
is_misclassified = predicted_positive != actual_positive

m_count = 0  # number of misclassified test samples
count = 0    # number of test samples visited
for i in range(y_test.shape[0]):
    print('test sample ', i)
    if is_misclassified[i]:
        print('misclassified!!')
        m_count += 1
    print('predicted probablity of being +1 is: ', preds[i])
    count += 1
    print('label is', y_test[i])
    print('\n')

print("\n", m_count, "\n", count)

test sample  0
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  1
predicted probablity of being +1 is:  [0.9999881]
label is [1.]


test sample  2
misclassified!!
predicted probablity of being +1 is:  [0.90501521]
label is [0.]


test sample  3
predicted probablity of being +1 is:  [0.98174357]
label is [1.]


test sample  4
predicted probablity of being +1 is:  [0.99995874]
label is [1.]


test sample  5
predicted probablity of being +1 is:  [1.]
label is [1.]


test sample  6
predicted probablity of being +1 is:  [0.99999978]
label is [1.]


test sample  7
predicted probablity of being +1 is:  [0.99993037]
label is [1.]


test sample  8
misclassified!!
predicted probablity of being +1 is:  [0.9060508]
label is [0.]


test sample  9
misclassified!!
predicted probablity of being +1 is:  [0.93344188]
label is [0.]


test sample  10
predicted probablity of being +1 is:  [0.99999109]
label is [1.]


test sample  11
predicted probablity of being +1 is:  [1.]
label is