## Lab 4 - Logistic Regression


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from random import shuffle
import random
random.seed(123)

X, y = datasets.load_breast_cancer(return_X_y=True)
print("#samples: {}, #features: {}".format(X.shape[0], X.shape[1]))


print(X, '\n', type(X))
print('\n',y, '\n', type(y))

#samples: 569, #features: 30
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]] 
 <class 'numpy.ndarray'>

 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0

In [24]:
# practice slicing to remember: 

G = X.copy()
print(id(G), id(X))
print(id(G) == id(X)) # False 

print(G.shape)
print(G[1:2, 3:5]) # should be seeing row 1-2 and col 3-5
print(G[0:2, 2:5]) # forgot that python grabs the range specified -1 ... as the start is inclusive and the end is exclusive.. so its [2,5) .. werid but gotta remember.. otherwise the sytax is easy enough 

2179548934384 2179528304336
False
(569, 30)
[[1.326e+03 8.474e-02]]
[[1.228e+02 1.001e+03 1.184e-01]
 [1.329e+02 1.326e+03 8.474e-02]]


## Split and Normalize Data

In [10]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.12, random_state=0)

print("train_val: {}, test: {}".format(X_train_val.shape[0], X_test.shape[0]))

normalizer = StandardScaler()
X_train_val = normalizer.fit_transform(X_train_val)
X_test = normalizer.transform(X_test)

print(type(X_train_val), X_train_val.shape[:]) # colon to get rows and cols. 0 for rows 1 for cols 

train_val: 500, test: 69
<class 'numpy.ndarray'> (500, 30)


## Train log reg model and select hyper parameter

In [32]:
# here we use 5-fold cross-validation
folds = 5

# get the number of samples in the training and validation set
num_train_val = X_train_val.shape[0] 

# shuffle the index of samples in the train_val set
index_of_samples = np.arange(num_train_val) 
shuffle(index_of_samples)

print('index_of_samples shape', index_of_samples.shape)

# split the index of the train_valid set into 5 folds
index_of_folds = index_of_samples.reshape(folds, -1) # supplying '-1' infers the value of the second argument based on the length of the array and the remaining dimensions.
print('index of folds', index_of_folds)
print('index of folds shape', index_of_folds.shape)

# index_of_foldsT = index_of_samples.reshape(folds, 100) # same output ... just not using inferred input
# print('index of foldsT', index_of_foldsT)
# print('index of folds shapeT', index_of_foldsT.shape)


# potential hyperparameters. 
#These hyperparameters are just used for illustration. 
#You should try more hyperparameters to get a good model.
#The hyperparameters must be nonnegative!
regularization_coefficient = [0.1, 0.5, 1.0, 5.0, 10.0]

best_acc = 0.0
best_reg = 0.0

for reg in regularization_coefficient:
    
    # 5-fold cross-validation
    sum_acc = 0.0
    for fold in range(folds):
        
        index_of_folds_temp = index_of_folds.copy()
        
        valid_index = index_of_folds_temp[fold,:].reshape(-1) #get the index of the validation set
        train_index = np.delete(index_of_folds_temp, fold, 0).reshape(-1) #get the index of the training set
     
        # training set
        X_train = X_train_val[train_index]
        y_train = y_train_val[train_index]
        
        # validation set
        X_valid = X_train_val[valid_index]
        y_valid = y_train_val[valid_index]
                
        # build the model with different hyperparameters
        clf = LogisticRegression(penalty='l2', C=reg, solver='lbfgs')
        
        #train the model with the training set
        clf.fit(X_train, y_train)
        
        y_valid_pred = clf.predict(X_valid)
        acc = accuracy_score(y_valid, y_valid_pred)
        
        sum_acc += acc
    
    cur_acc = sum_acc / folds
    
    print("reg_coeff: {}, acc: {:.3f}".format(1.0/reg, cur_acc))
    
    # store the best hyperparameter
    if cur_acc > best_acc:
        best_acc = cur_acc
        best_reg = reg
        
print('best_acc', best_acc, 'best_reg', best_reg) # this value is actually changing from one run to the next, suggesting more fine reg terms may be necessary to converge on one best value ... also suggests there is some inherent noise in the model which comes through in results.. it's not always producing the same accuracy measures and differs by as much as 3% I think 

index_of_samples shape (500,)
index of folds [[424 400 382 148  19 441 463 338 427 301  11 211  12 286 124 437 221 432
  245 202 215 414  45 139 214 307  67 254 282 125  85  43 443  83 248 439
  114 342 285 310 403 175 428 363 418 345 482 355 259 480 151 456 243 425
  109 454 318 343 244  20 337 476 247  28 378  18 198 118  84 130 293 108
  113  86 371 162 216 313 302 354  56 314  98 249 423 341 140  10 419 288
  184 100 209 223 227 381 344  66  35 326]
 [411  96 389 450   8 466  73 468 191 388 491  95 421 104  57 217 264 387
  212 165 379  91 188 396 335 292  89  61  16  88 496 194  93   3 452 290
  250 412 474 393  17 306 449 295 361  44 116 279 347 119 230  72 445 204
   13  49   5 246 272 263 377  29 495  94  23 166 150  87 121 224   6 172
  232 107  54  42 304 384 241 176  74 117 126 174 208 275 484 179 461 284
  251 228 218 258 333 105 145 252 328 143]
 [156  34 386  78 296 311   2 160 317 135 442 187 406  82 436 351 475 334
  240 277 190 141 226 270 299 265 269 487 368 390 231 3

## Evaluate the learned model

In [33]:
# retrain the model
clf = LogisticRegression(penalty='l2', C=best_reg, solver='lbfgs')
clf.fit(X_train_val, y_train_val)

# evaluate the model on the testing set
y_test_pred = clf.predict(X_test)


acc = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)


print("accuracy: {:.3f}, recall: {:.3f}, precision: {:.3f}, f1: {:.3f},".format(acc, recall, precision, f1)) # a perfect fucking model. 

accuracy: 1.000, recall: 1.000, precision: 1.000, f1: 1.000,
