# **LIBRARIES**

In [None]:
import os

import random
import cv2
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

import warnings
warnings.filterwarnings('ignore')

In [None]:
def random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, TensorFlow
    and Keras, and exports the seed as ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed forwarded to each generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    keras.utils.set_random_seed(seed)

    # Export the seed that was actually requested; this used to be pinned to
    # "42" regardless of the argument.
    # NOTE(review): PYTHONHASHSEED is documented as being read at interpreter
    # start-up, so this presumably only affects processes launched afterwards
    # -- confirm if hash ordering matters for this notebook.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
# Seed all generators once, before any data is loaded or any model is fitted.
random_seed(42)

# **METRICS**

In [None]:
def fdr(y_true, y_pred):
    """Return the false discovery rate, FP / (FP + TP), for 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive); any
            array-like of equal length to ``y_pred``.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        The share of positive predictions that are wrong, or 0 when nothing
        was predicted positive.
    """
    # Coerce to ndarrays first: with plain lists ``y_true == 0`` collapses to
    # a single bool and every count silently comes out as 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    FP = np.sum((y_true == 0) & (y_pred == 1))
    TP = np.sum((y_true == 1) & (y_pred == 1))

    predicted_positive = FP + TP

    return FP / predicted_positive if predicted_positive > 0 else 0

In [None]:
def fnr(y_true, y_pred):
    """Return the false negative rate, FN / (FN + TP), for 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive); any
            array-like of equal length to ``y_pred``.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        The share of actual positives that were missed, or 0 when there are
        no actual positives.
    """
    # Coerce to ndarrays first: with plain lists ``y_true == 1`` collapses to
    # a single bool and every count silently comes out as 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    FN = np.sum((y_true == 1) & (y_pred == 0))
    TP = np.sum((y_true == 1) & (y_pred == 1))

    actual_positive = FN + TP

    return FN / actual_positive if actual_positive > 0 else 0

In [None]:
def specificity(y_true, y_pred):
    """Return the specificity (true negative rate), TN / (TN + FP).

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive); any
            array-like of equal length to ``y_pred``.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        The share of actual negatives predicted as negative, or 0 when there
        are no actual negatives.
    """
    # Coerce to ndarrays first: with plain lists ``y_true == 0`` collapses to
    # a single bool and every count silently comes out as 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))

    actual_negative = TN + FP

    return TN / actual_negative if actual_negative > 0 else 0

In [None]:
def npv(y_true, y_pred):
    """Return the negative predictive value, TN / (TN + FN), for 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive); any
            array-like of equal length to ``y_pred``.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        The share of negative predictions that are correct, or 0 when nothing
        was predicted negative.
    """
    # Coerce to ndarrays first: with plain lists ``y_true == 0`` collapses to
    # a single bool and every count silently comes out as 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    TN = np.sum((y_true == 0) & (y_pred == 0))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    predicted_negative = TN + FN

    return TN / predicted_negative if predicted_negative > 0 else 0

# **DATASET**

In [None]:
# Every feature table lives under the same Kaggle dataset; keep its location
# in one place so it can be re-pointed without editing each read.
DATA_DIR = '/kaggle/input/pneumonia-detection-features-datasets'


def _read_features(split, name):
    """Read ``<DATA_DIR>/<split>/<name>.csv`` into a DataFrame.

    Args:
        split: Sub-directory of ``DATA_DIR`` ('train' or 'test').
        name: CSV file name without the '.csv' extension.
    """
    return pd.read_csv(os.path.join(DATA_DIR, split, name + '.csv'))


# Three feature tables per split: cxr, segment (ch0) and
# segment_with_convexhull (ch1).
train_cxr = _read_features('train', 'cxr')
train_ch0 = _read_features('train', 'segment')
train_ch1 = _read_features('train', 'segment_with_convexhull')

test_cxr = _read_features('test', 'cxr')
test_ch0 = _read_features('test', 'segment')
test_ch1 = _read_features('test', 'segment_with_convexhull')

In [None]:
def _features_and_labels(frame):
    """Split a feature table into (feature matrix, 'class' label vector)."""
    return frame.drop(['class'], axis=1).to_numpy(), frame['class'].to_numpy()


x_train_cxr, y_train = _features_and_labels(train_cxr)
x_train_ch0, _y_train_ch0 = _features_and_labels(train_ch0)
x_train_ch1, _y_train_ch1 = _features_and_labels(train_ch1)

x_test_cxr, y_test = _features_and_labels(test_cxr)
x_test_ch0, _y_test_ch0 = _features_and_labels(test_ch0)
x_test_ch1, _y_test_ch1 = _features_and_labels(test_ch1)

# y_train / y_test come from the cxr tables but are reused with the ch0 and
# ch1 features below, so the three tables must list samples in the same order.
# Fail loudly here rather than train on misaligned labels.
for _other in (_y_train_ch0, _y_train_ch1):
    assert np.array_equal(y_train, _other), \
        "train 'class' column differs between the cxr / ch0 / ch1 tables"
for _other in (_y_test_ch0, _y_test_ch1):
    assert np.array_equal(y_test, _other), \
        "test 'class' column differs between the cxr / ch0 / ch1 tables"

# Same eight shape lines, in the same order, as before.
for _arr in (x_train_cxr, x_train_ch0, x_train_ch1, y_train,
             x_test_cxr, x_test_ch0, x_test_ch1, y_test):
    print(np.shape(_arr))

In [None]:
# "balanced" weights computed from the training labels, so that every label
# value maps to the weight handed to the classifiers below.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    y=y_train,
    classes=classes,
    class_weight="balanced",
)

# label -> weight lookup in the form LogisticRegression(class_weight=...) takes
class_weight_dict = dict(zip(classes, class_weights))

print(class_weight_dict)

# **LOGISTIC REGRESSION**

## **CXR**

In [None]:
random_seed(42)

param_grid = {
    'max_iter': [50, 100, 150, 200, 250, 300],
    'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

# Kept for any later cell that inspects the candidate list.
param_list = list(ParameterGrid(param_grid))

# Pick hyperparameters by 5-fold cross-validation on the TRAINING data only.
# The earlier loop scored each candidate on x_test_cxr / y_test, which leaks
# the test set into model selection and inflates every test metric reported
# afterwards.
search = GridSearchCV(
    LogisticRegression(class_weight=class_weight_dict, random_state=42),
    param_grid,
    scoring='accuracy',  # same criterion as clf.score
    cv=5,
)
search.fit(x_train_cxr, y_train)

# One line per candidate: mean cross-validated accuracy.
for params, score in zip(search.cv_results_['params'],
                         search.cv_results_['mean_test_score']):
    print('{}: {}'.format(params, score))

best_score = search.best_score_
best_params = search.best_params_
# Refit on the full training set with the winning parameters.
clf = search.best_estimator_

print("Best Parameters:", best_params)

In [None]:
# NOTE(review): these hyperparameters are hard-coded -- presumably the best
# setting reported by the search above; re-check them if that search changes.
clf = LogisticRegression(
    C=0.05,
    max_iter=50,
    penalty='l2',
    solver='saga',
    class_weight=class_weight_dict,
    random_state=42
)

clf.fit(x_train_cxr, y_train)

# Hard 0/1 predictions feed the threshold-based metrics.
y_pred = clf.predict(x_test_cxr)
# ROC AUC needs a continuous score: passing y_pred evaluates a single
# operating point instead of the whole curve. Column 1 of predict_proba is the
# probability of the larger label, i.e. the class treated as positive.
y_score = clf.predict_proba(x_test_cxr)[:, 1]

print('accuracy = {}'.format(accuracy_score(y_test, y_pred)))
print('precision = {}'.format(precision_score(y_test, y_pred)))
print('FDR = {}'.format(fdr(y_test, y_pred)))
print('recall = {}'.format(recall_score(y_test, y_pred)))
print('FNR = {}'.format(fnr(y_test, y_pred)))
print('specificity = {}'.format(specificity(y_test, y_pred)))
print('NPV = {}'.format(npv(y_test, y_pred)))
print('f1-score = {}'.format(f1_score(y_test, y_pred)))
print('AUC = {}'.format(roc_auc_score(y_test, y_score)))
print('MCC = {}'.format(matthews_corrcoef(y_test, y_pred)))

## **CH0**

In [None]:
random_seed(42)

param_grid = {
    'max_iter': [50, 100, 150, 200, 250, 300],
    'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

# Kept for any later cell that inspects the candidate list.
param_list = list(ParameterGrid(param_grid))

# Pick hyperparameters by 5-fold cross-validation on the TRAINING data only.
# The earlier loop scored each candidate on x_test_ch0 / y_test, which leaks
# the test set into model selection and inflates every test metric reported
# afterwards.
search = GridSearchCV(
    LogisticRegression(class_weight=class_weight_dict, random_state=42),
    param_grid,
    scoring='accuracy',  # same criterion as clf.score
    cv=5,
)
search.fit(x_train_ch0, y_train)

# One line per candidate: mean cross-validated accuracy.
for params, score in zip(search.cv_results_['params'],
                         search.cv_results_['mean_test_score']):
    print('{}: {}'.format(params, score))

best_score = search.best_score_
best_params = search.best_params_
# Refit on the full training set with the winning parameters.
clf = search.best_estimator_

print("Best Parameters:", best_params)

In [None]:
# NOTE(review): these hyperparameters are hard-coded -- presumably the best
# setting reported by the search above; re-check them if that search changes.
clf = LogisticRegression(
    C=0.5,
    max_iter=200,
    penalty='l1',
    solver='saga',
    class_weight=class_weight_dict,
    random_state=42
)

clf.fit(x_train_ch0, y_train)

# Hard 0/1 predictions feed the threshold-based metrics.
y_pred = clf.predict(x_test_ch0)
# ROC AUC needs a continuous score: passing y_pred evaluates a single
# operating point instead of the whole curve. Column 1 of predict_proba is the
# probability of the larger label, i.e. the class treated as positive.
y_score = clf.predict_proba(x_test_ch0)[:, 1]

print('accuracy = {}'.format(accuracy_score(y_test, y_pred)))
print('precision = {}'.format(precision_score(y_test, y_pred)))
print('FDR = {}'.format(fdr(y_test, y_pred)))
print('recall = {}'.format(recall_score(y_test, y_pred)))
print('FNR = {}'.format(fnr(y_test, y_pred)))
print('specificity = {}'.format(specificity(y_test, y_pred)))
print('NPV = {}'.format(npv(y_test, y_pred)))
print('f1-score = {}'.format(f1_score(y_test, y_pred)))
print('AUC = {}'.format(roc_auc_score(y_test, y_score)))
print('MCC = {}'.format(matthews_corrcoef(y_test, y_pred)))

## **CH1**

In [None]:
random_seed(42)

param_grid = {
    'max_iter': [50, 100, 150, 200, 250, 300],
    'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

# Kept for any later cell that inspects the candidate list.
param_list = list(ParameterGrid(param_grid))

# Pick hyperparameters by 5-fold cross-validation on the TRAINING data only.
# The earlier loop scored each candidate on x_test_ch1 / y_test, which leaks
# the test set into model selection and inflates every test metric reported
# afterwards.
search = GridSearchCV(
    LogisticRegression(class_weight=class_weight_dict, random_state=42),
    param_grid,
    scoring='accuracy',  # same criterion as clf.score
    cv=5,
)
search.fit(x_train_ch1, y_train)

# One line per candidate: mean cross-validated accuracy.
for params, score in zip(search.cv_results_['params'],
                         search.cv_results_['mean_test_score']):
    print('{}: {}'.format(params, score))

best_score = search.best_score_
best_params = search.best_params_
# Refit on the full training set with the winning parameters.
clf = search.best_estimator_

print("Best Parameters:", best_params)

In [None]:
# NOTE(review): these hyperparameters are hard-coded -- presumably the best
# setting reported by the search above; re-check them if that search changes.
clf = LogisticRegression(
    C=0.1,
    max_iter=50,
    penalty='l2',
    solver='saga',
    class_weight=class_weight_dict,
    random_state=42
)

clf.fit(x_train_ch1, y_train)

# Hard 0/1 predictions feed the threshold-based metrics.
y_pred = clf.predict(x_test_ch1)
# ROC AUC needs a continuous score: passing y_pred evaluates a single
# operating point instead of the whole curve. Column 1 of predict_proba is the
# probability of the larger label, i.e. the class treated as positive.
y_score = clf.predict_proba(x_test_ch1)[:, 1]

print('accuracy = {}'.format(accuracy_score(y_test, y_pred)))
print('precision = {}'.format(precision_score(y_test, y_pred)))
print('FDR = {}'.format(fdr(y_test, y_pred)))
print('recall = {}'.format(recall_score(y_test, y_pred)))
print('FNR = {}'.format(fnr(y_test, y_pred)))
print('specificity = {}'.format(specificity(y_test, y_pred)))
print('NPV = {}'.format(npv(y_test, y_pred)))
print('f1-score = {}'.format(f1_score(y_test, y_pred)))
print('AUC = {}'.format(roc_auc_score(y_test, y_score)))
print('MCC = {}'.format(matthews_corrcoef(y_test, y_pred)))