<center>

### COSC2753 - Machine Learning

# **Logistic Regression**

<center>────────────────────────────</center>
&nbsp;


# I. Global Configuration

In [41]:
import sys
import importlib
import tabulate
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import sklearn
import statsmodels
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import RandomizedSearchCV
from sklearn.exceptions import FitFailedWarning
import warnings
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Reload modules
sys.path.append("../../")  # Root directory
modules_to_reload = [
    "scripts.styler",
    "scripts.neko",
    "scripts.utils",
]

# Reload modules if they have been modified
missing_modules = []

for module_name in modules_to_reload:
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])
    else:
        missing_modules.append(module_name)

# Recache missing modules
if missing_modules:
    print(f"Modules {missing_modules} not found. \nRecaching...")

# Import user-defined scripts
from scripts.styler import Styler
from scripts.neko import Neko
from scripts.utils import Utils


# Initialize styler
styler = Styler()  # Text Styler

# Check package versions
styler.draw_box("Checking Package Versions...")

try:
    with open("../../requirements.txt", "r") as file:
        requirements = file.readlines()
except FileNotFoundError:
    print(f"File '../../requirements.txt' not found.")

packages_to_check = [np, pd, sns, matplotlib, tabulate, sklearn, statsmodels, imblearn]

for package in packages_to_check:
    Utils.version_check(package, requirements=requirements)

styled_text = styler.style(
    "\nDone checking packages version...\n", bold=True, italic=True
)
print(styled_text)

# Initialize objects
styler.draw_box("Initializing Project...")
neko = Neko()  # Panda extension
bullet = ">>>"  # Bullet point
plt = matplotlib.pyplot  # Matplotlib

# Configuration
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 3)

styled_text = styler.style("Done initializing project...", bold=True, italic=True)
print(styled_text)

┌────────────────────────────────┐
│  Checking Package Versions...  │
└────────────────────────────────┘
>>> numpy is up to date: 1.26.4
>>> pandas is up to date: 2.2.1
>>> seaborn is up to date: 0.13.2
>>> matplotlib is up to date: 3.8.3
>>> tabulate is up to date: 0.9.0
>>> sklearn is up to date: 1.4.1.post1
>>> statsmodels is up to date: 0.14.1
>>> imblearn is up to date: 0.12.2
[1m[3m
Done checking packages version...
[0m
┌───────────────────────────┐
│  Initializing Project...  │
└───────────────────────────┘

    /\_____/\
   /  x   o  \
  ( ==  ^  == )       Neko has arrived!
   )         (        An data visualizing extension for analyzing DataFrames.
  (           )       Art: https://www.asciiart.eu/animals/cats.
 ( (  )   (  ) )
(__(__)___(__)__)

[1m[3mDone initializing project...[0m


# II. Data Loading

In [33]:
try:
    # Load data
    df_train = pd.read_csv("../../data/processed/data_train_processed.csv")
    df_test = pd.read_csv("../../data/test/data_test.csv")

    styler.draw_box("Data Loaded Successfully")

except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
except Exception as e:
    print("An error occurred:", e)

┌────────────────────────────┐
│  Data Loaded Successfully  │
└────────────────────────────┘


In [23]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333056 entries, 0 to 333055
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HighBP                333056 non-null  int64  
 1   HighChol              333056 non-null  int64  
 2   CholCheck             333056 non-null  int64  
 3   BMI                   333056 non-null  float64
 4   Smoker                333056 non-null  int64  
 5   Stroke                333056 non-null  int64  
 6   HeartDiseaseorAttack  333056 non-null  int64  
 7   PhysActivity          333056 non-null  int64  
 8   Fruits                333056 non-null  int64  
 9   Veggies               333056 non-null  int64  
 10  HvyAlcoholConsump     333056 non-null  int64  
 11  AnyHealthcare         333056 non-null  int64  
 12  NoDocbcCost           333056 non-null  int64  
 13  GenHlth               333056 non-null  int64  
 14  MentHlth              333056 non-null  int64  
 15  

In [24]:
df_train.head(10)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
0,0,0,1,24.0,1,0,0,1,0,1,0,1,0,2,0,0.0,0,0,8,4,5,60.0,0.0,0
1,0,0,1,28.0,0,0,0,1,1,1,0,1,0,1,1,0.0,0,0,2,6,8,0.0,-64.0,0
2,0,0,1,36.0,1,0,0,1,1,0,0,1,1,3,5,7.5,1,0,3,2,1,-46.0,0.0,0
3,0,1,1,35.0,0,0,0,1,1,1,0,1,0,3,0,0.0,0,0,8,6,8,-83.0,-7.566,0
4,0,1,1,27.0,0,0,0,1,0,1,0,1,0,3,0,0.0,0,0,9,5,4,-58.0,0.0,0
5,1,1,1,26.0,1,0,1,1,0,1,0,1,1,4,0,0.0,0,0,12,4,6,-14.0,53.0,0
6,0,0,0,34.0,0,0,0,1,1,1,0,1,0,2,0,0.0,0,0,5,4,5,0.0,-2.0,0
7,0,0,1,28.0,0,0,0,1,1,1,0,1,0,2,2,5.0,0,0,6,6,8,-61.0,-98.0,0
8,0,1,1,33.0,1,0,0,1,1,1,0,1,0,3,0,0.0,0,0,8,5,7,70.0,0.0,0
9,0,0,1,28.0,0,0,0,1,1,1,0,1,0,2,0,0.0,0,1,7,5,8,-12.0,0.0,0


# III. Model Development

## 1. Feature Scaling

### Scaling Techniques for Logistic Regression Models

Logistic regression models often benefit from scaled features to enhance accuracy. When dealing with varied range numerical data, two common scaling techniques are **standardization** (using a *StandardScaler*) and **Normalisation** (using a *MinMaxScaler*).

**Limitations of Log Transformation and StandardScaler:** 
The *StandardScaler*, which typically involves log transformation followed by standardization, might not be suitable for this specific case. This method can encounter issues when the data contains zeros or negative values, potentially leading to inaccurate transformations.

Therefore, the *MinMaxScaler* is a more appropriate choice for this scenario. This technique effectively handles skewed data by rescaling each feature to a specific range (often 0 to 1). This ensures that all features contribute equally to the logistic regression model, improving its ability to learn the underlying relationships between features and the target variable.

In [34]:
# Scale features using min-max scaler (Normalization)
df_train = neko.scale_feature(df_train, "norm")

df_train.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
count,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0
mean,0.459,0.421,0.967,0.555,0.379,0.029,0.079,0.654,0.542,0.74,0.035,0.942,0.058,0.403,0.235,0.29,0.171,0.367,0.608,0.771,0.672,0.572,0.574,0.5
std,0.498,0.494,0.179,0.206,0.485,0.167,0.269,0.476,0.498,0.439,0.183,0.234,0.234,0.261,0.39,0.408,0.376,0.482,0.236,0.201,0.304,0.277,0.274,0.5
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.6,0.429,0.461,0.464,0.0
50%,0.0,0.0,1.0,0.518,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.667,0.8,0.714,0.5,0.503,0.5
75%,1.0,1.0,1.0,0.687,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.5,0.4,0.667,0.0,1.0,0.75,1.0,1.0,0.812,0.809,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2. Model Training

In [35]:
# Split the data into training and testing sets
X = df_train.drop(columns=["Status"], axis=1)
y = df_train["Status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 2.1 Model Evaluation (First Look)

Evaluation within the data preprocessing notebook indicates that the features **AnyHealthcare** and **MentHlth** do not exhibit a statistically significant correlation with the target variable **Status**. Consequently, these two features will be excluded from the dataset, and the model will be retrained to assess the impact of this refinement.

In [36]:
# Traing the model (With All Features)
styler.draw_box("Training the model (With All Features)")
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Classification report
report = classification_report(y_test, y_pred)
print(report)

# Traing the model (With reduced Features)
styler.draw_box("Training the model (With Reduced Features)")
model = LogisticRegression(max_iter=1000, random_state=42)

# Drop the specified columns from X_train and X_test
X_train_reduced = X_train.drop(columns=["AnyHealthcare", "MentHlth"])
X_test_reduced = X_test.drop(columns=["AnyHealthcare", "MentHlth"])

model.fit(X_train_reduced, y_train)

# Make predictions
y_pred = model.predict(X_test_reduced)

# Classification report
report = classification_report(y_test, y_pred)
print(report)

┌──────────────────────────────────────────┐
│  Training the model (With All Features)  │
└──────────────────────────────────────────┘
              precision    recall  f1-score   support

         0.0       0.79      0.80      0.80     33403
         1.0       0.80      0.79      0.79     33439

    accuracy                           0.80     66842
   macro avg       0.80      0.80      0.80     66842
weighted avg       0.80      0.80      0.80     66842

┌──────────────────────────────────────────────┐
│  Training the model (With Reduced Features)  │
└──────────────────────────────────────────────┘
              precision    recall  f1-score   support

         0.0       0.79      0.80      0.80     33403
         1.0       0.80      0.79      0.79     33439

    accuracy                           0.79     66842
   macro avg       0.79      0.79      0.79     66842
weighted avg       0.79      0.79      0.79     66842



**Evaluation Metrics after Feature Selection**

After applying feature selection, the evaluation metrics reveal a decline in model accuracy. This suggests that the removed features may have contained information critical to the model's performance.

→ *Therefore, to ensure optimal performance, all features in the current dataset will be retained for further analysis.*

### 2.2 Feature Selection

This section delves deeper into feature selection by employing a **Wrapper Method** known as **Recursive Feature Elimination with Cross-Validation (REFCV)**. This technique iteratively evaluates the performance of a chosen machine learning model on progressively smaller subsets of features. Features deemed least impactful on the model's performance are eliminated in each step. The RECFCV process continues until the optimal feature combination is identified, resulting in the model achieving its highest accuracy.

While sequential feature selection methods, such as forward selection or backward selection, might be considered for this task, they are often susceptible to instability and prone to overfitting. Due to these potential drawbacks, this approach is not employed in this instance. RECFCV, on the other hand, offers a more robust and reliable method for identifying the optimal feature subset that maximizes model performance.


In [28]:
styler.draw_box("Feature Selection (Wrapper Method - Recursive Feature Elimination with Cross-Validation (RFECV))")

# Create Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Create RFECV with Logistic Regression model
selector = RFECV(estimator=logistic_model, cv=5, scoring="accuracy")
selector.fit(X_train, y_train)

# Get the selected features
selected_features = X_train.columns[selector.support_]

print("Selected features using Logistic Regression and RFECV:", selected_features)
print("Size of selected features:", len(selected_features))

┌────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  Feature Selection (Wrapper Method - Recursive Feature Elimination with Cross-Validation (RFECV))  │
└────────────────────────────────────────────────────────────────────────────────────────────────────┘
Selected features using Logistic Regression and RFECV: Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
       'ExtraMedTest', 'ExtraAlcoholTest'],
      dtype='object')
Size of selected features: 23


### 2.3 Model Evaluation (Second Look)

In [42]:
# Traing the model
styler.draw_box("Training the model (Without Hyperparameter Tuning)")
model = LogisticRegression(max_iter=1000, random_state=42)

neko.evaluate_model(model, X_train[selected_features], y_train, X_test[selected_features], y_test)

┌──────────────────────────────────────────────────────┐
│  Training the model (Without Hyperparameter Tuning)  │
└──────────────────────────────────────────────────────┘
Classification Report for Training Data:
               precision    recall  f1-score   support

         0.0       0.79      0.80      0.80    133702
         1.0       0.80      0.79      0.79    133666

    accuracy                           0.79    267368
   macro avg       0.79      0.79      0.79    267368
weighted avg       0.79      0.79      0.79    267368

Classification Report for Testing Data:
               precision    recall  f1-score   support

         0.0       0.79      0.80      0.80     33403
         1.0       0.80      0.79      0.79     33439

    accuracy                           0.80     66842
   macro avg       0.80      0.80      0.80     66842
weighted avg       0.80      0.80      0.80     66842

Accuracy on Training Data: 0.7948221178301068
Accuracy on Testing Data: 0.7952036144938811



The feature selection process yielded a noteworthy result: all features were retained for model training. This outcome suggests that each feature contributes meaningfully to the model's final performance. Consequently, the entire feature set will be utilized for training. This decision is further supported by the observed improvements in accuracy and F1-score, which are critical metrics for assessing model effectiveness.

Furthermore, the close similarity between accuracy and F1-score on both the training and testing datasets indicates strong model generalizability. In simpler terms, the model performs well not only on the data it was trained on but also on unseen data, demonstrating its ability to learn underlying patterns and make accurate predictions on new information.

### 2.4 Model Tuning (Hyperparameter Optimization)

In [30]:
# Warning suppression
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Hyperparameter Tuning
param_grid = {
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "C": np.logspace(-4, 4, 50),
    "solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
    "max_iter": [100, 1000, 2500, 5000],
    "l1_ratio": np.linspace(0, 1, 10),
}

Based on the analysis, there is a strong likelihood of achieving an optimal or near-optimal solution by setting the number of iterations to `60`. To further improve the probability of success, I propose increasing the number of iterations (n_iter) to `100`. This adjustment will allow the model to explore a broader range of hyperparameters, potentially identifying an even more optimal solution.

https://web.archive.org/web/20160701182750/http://blog.dato.com/how-to-evaluate-machine-learning-models-part-4-hyperparameter-tuning

In [31]:
# Polynomial degrees
degrees = range(1, 3)  # 1 to 2 (because of computational cost for higher degrees)

# Store best models
best_models = {}

# Loop through each degree
for degree in degrees:
    print(">>> Hyperparameter Tuning For Polynomial Degree:", degree)

    poly_features = PolynomialFeatures(degree=degree)
    X_train_poly = poly_features.fit_transform(X_train[selected_features])
    X_test_poly = poly_features.transform(X_test[selected_features])

    # Grid search
    logistic_regression = LogisticRegression(random_state=42)
    randon_search = RandomizedSearchCV(
        logistic_regression,
        param_distributions=param_grid,
        cv=5,
        scoring="f1_weighted",
        n_jobs=6,
        n_iter=100,
    )
    randon_search.fit(X_train_poly, y_train)

    # Store the best model and its parameters
    best_models[degree] = {
        "best_model": randon_search.best_estimator_,
        "best_params": randon_search.best_params_,
    }

    styler.draw_box(f"Best parameters for degree {degree}")
    print(randon_search.best_params_)

    # Evaluate model with best parameters
    best_logistic = randon_search.best_estimator_

    # Show classification report
    y_pred = best_logistic.predict(X_test_poly)

    print(f"\nClassification Report for degree {degree}:")
    print(classification_report(y_test, y_pred))

>>> Hyperparameter Tuning For Polynomial Degree: 1
┌────────────────────────────────┐
│  Best parameters for degree 1  │
└────────────────────────────────┘
{'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': 0.3333333333333333, 'C': 0.3906939937054613}

Classification Report for degree 1:
              precision    recall  f1-score   support

         0.0       0.75      0.75      0.75     33416
         1.0       0.74      0.74      0.74     33196

    accuracy                           0.75     66612
   macro avg       0.75      0.75      0.75     66612
weighted avg       0.75      0.75      0.75     66612

>>> Hyperparameter Tuning For Polynomial Degree: 2


KeyboardInterrupt: 