<center>

### COSC2753 - Machine Learning

# **Logistic Regression**

<center>────────────────────────────</center>
&nbsp;


# I. Global Configuration

In [9]:
import sys
import importlib
import tabulate
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import sklearn
import statsmodels
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import RandomizedSearchCV
from sklearn.exceptions import FitFailedWarning
import warnings
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Make the project root importable so the `scripts` package can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
root_dir = "../../"  # Root directory
if root_dir not in sys.path:
    sys.path.append(root_dir)

# User-defined modules that are edited alongside this notebook
modules_to_reload = [
    "scripts.styler",
    "scripts.neko",
    "scripts.utils",
]

# Reload the modules that are already loaded so that edits made on disk are
# picked up without restarting the kernel; remember the ones not loaded yet.
missing_modules = []

for module_name in modules_to_reload:
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])
    else:
        missing_modules.append(module_name)

# Modules that were not loaded yet are imported (and thereby cached in
# sys.modules) by the import statements below.
if missing_modules:
    print(f"Modules {missing_modules} not found. \nRecaching...")

# Import user-defined scripts
from scripts.styler import Styler
from scripts.neko import Neko
from scripts.utils import Utils


# Initialize styler
styler = Styler()  # Text Styler

# Check package versions
styler.draw_box("Checking Package Versions...")

# Read the pinned requirements. If the file is missing, `requirements` is set
# to None so the check below is skipped; previously the name was left unbound
# and the loop failed with a NameError.
try:
    with open("../../requirements.txt", "r") as file:
        requirements = file.readlines()
except FileNotFoundError:
    requirements = None
    print("File '../../requirements.txt' not found.")
    print("Skipping package version check.")

packages_to_check = [np, pd, sns, matplotlib, tabulate, sklearn, statsmodels, imblearn]

# Compare each imported package against the requirement lines read above
if requirements is not None:
    for package in packages_to_check:
        Utils.version_check(package, requirements=requirements)

styled_text = styler.style(
    "\nDone checking packages version...\n", bold=True, italic=True
)
print(styled_text)

# Initialize objects
styler.draw_box("Initializing Project...")
neko = Neko()  # Panda extension
bullet = ">>>"  # Bullet point

# Import the pyplot submodule explicitly. `import matplotlib` alone does not
# load it, so the attribute access `matplotlib.pyplot` only worked when some
# other import had already pulled the submodule in.
import matplotlib.pyplot as plt  # Matplotlib

# Configuration: show every column and row of a DataFrame, 3 decimal places
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 3)

styled_text = styler.style("Done initializing project...", bold=True, italic=True)
print(styled_text)

┌────────────────────────────────┐
│  Checking Package Versions...  │
└────────────────────────────────┘
>>> numpy is up to date: 1.26.4
>>> pandas is up to date: 2.2.1
>>> seaborn is up to date: 0.13.2
>>> matplotlib is up to date: 3.8.3
>>> tabulate is up to date: 0.9.0
>>> sklearn is up to date: 1.4.1.post1
>>> statsmodels is up to date: 0.14.1
>>> imblearn is up to date: 0.12.2
[1m[3m
Done checking packages version...
[0m
┌───────────────────────────┐
│  Initializing Project...  │
└───────────────────────────┘

    /\_____/\
   /  x   o  \
  ( ==  ^  == )       Neko has arrived!
   )         (        An data visualizing extension for analyzing DataFrames.
  (           )       Art: https://www.asciiart.eu/animals/cats.
 ( (  )   (  ) )
(__(__)___(__)__)

[1m[3mDone initializing project...[0m


# II. Data Loading

In [10]:
# Load data
# Read the processed training set; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../../data/processed/data_train_processed.csv")

# Print column names, non-null counts and dtypes as a quick sanity check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334210 entries, 0 to 334209
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HighBP                334210 non-null  int64  
 1   HighChol              334210 non-null  int64  
 2   CholCheck             334210 non-null  int64  
 3   BMI                   334210 non-null  float64
 4   Smoker                334210 non-null  int64  
 5   Stroke                334210 non-null  int64  
 6   HeartDiseaseorAttack  334210 non-null  int64  
 7   PhysActivity          334210 non-null  int64  
 8   Fruits                334210 non-null  int64  
 9   Veggies               334210 non-null  int64  
 10  HvyAlcoholConsump     334210 non-null  int64  
 11  AnyHealthcare         334210 non-null  int64  
 12  NoDocbcCost           334210 non-null  int64  
 13  GenHlth               334210 non-null  int64  
 14  MentHlth              334210 non-null  int64  
 15  

In [11]:
# Preview the first 10 rows of the loaded frame
df.head(10)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
0,0,0,1,24.0,1,0,0,1,0,1,0,1,0,2,0,0.0,0,0,8,4,5,60.0,0.0,0
1,0,0,1,28.0,0,0,0,1,1,1,0,1,0,1,1,0.0,0,0,2,6,8,0.0,-64.0,0
2,0,0,1,36.0,1,0,0,1,1,0,0,1,1,3,5,7.5,1,0,3,2,1,-46.0,0.0,0
3,0,1,1,35.0,0,0,0,1,1,1,0,1,0,3,0,0.0,0,0,8,6,8,-83.0,-7.566,0
4,0,1,1,27.0,0,0,0,1,0,1,0,1,0,3,0,0.0,0,0,9,5,4,-58.0,0.0,0
5,1,1,1,26.0,1,0,1,1,0,1,0,1,1,4,0,0.0,0,0,12,4,6,-14.0,53.0,0
6,0,0,0,34.0,0,0,0,1,1,1,0,1,0,2,0,0.0,0,0,5,4,5,0.0,-2.0,0
7,0,0,1,28.0,0,0,0,1,1,1,0,1,0,2,2,5.0,0,0,6,6,8,-61.0,-98.0,0
8,0,1,1,33.0,1,0,0,1,1,1,0,1,0,3,0,0.0,0,0,8,5,7,70.0,0.0,0
9,0,0,1,28.0,0,0,0,1,1,1,0,1,0,2,0,0.0,0,1,7,5,8,-12.0,0.0,0


# III. Model Development

## 1. Feature Scaling

In [12]:
# Scale features using min-max scaler (Normalization)
# NOTE(review): the whole frame is passed in, so the "Status" target column is
# scaled together with the features, and this runs before the train/test split.
# Presumably the scaling statistics are therefore computed over rows that later
# end up in the test set -- confirm against Neko.scale_feature and consider
# fitting the scaler on the training split only.
df = neko.scale_feature(df, "norm")

# Summary statistics of the scaled frame (every column should span 0 to 1)
df.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,ExtraMedTest,ExtraAlcoholTest,Status
count,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0,334210.0
mean,0.459,0.421,0.967,0.555,0.379,0.029,0.079,0.654,0.542,0.74,0.035,0.942,0.058,0.403,0.235,0.29,0.171,0.367,0.608,0.771,0.672,0.572,0.574,0.5
std,0.498,0.494,0.179,0.206,0.485,0.167,0.269,0.476,0.498,0.439,0.183,0.234,0.234,0.261,0.39,0.408,0.376,0.482,0.236,0.201,0.304,0.277,0.274,0.5
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.5,0.6,0.429,0.461,0.464,0.0
50%,0.0,0.0,1.0,0.518,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.667,0.8,0.714,0.5,0.503,0.5
75%,1.0,1.0,1.0,0.687,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.5,0.4,0.667,0.0,1.0,0.75,1.0,1.0,0.812,0.809,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 2. Model Training

In [13]:
# Separate the target column from the predictor columns
y = df["Status"]
X = df.drop(columns="Status")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### 2.1 Feature Selection

In [14]:
# # Forward feature selection
# forward_selector = SequentialFeatureSelector(
#     LogisticRegression(),
#     n_features_to_select="auto",
#     direction="forward",
#     cv=5,
#     n_jobs=6,
# )
# forward_selector.fit(X_train, y_train)
# forward_selected_features = X.columns[forward_selector.support_]

# # Backward feature selection
# backward_selector = SequentialFeatureSelector(
#     LogisticRegression(),
#     n_features_to_select="auto",
#     direction="backward",
#     cv=5,
#     n_jobs=6,
# )
# backward_selector.fit(X_train, y_train)
# backward_selected_features = X.columns[backward_selector.support_]

# # Combine selected features
# selected_features = list(
#     set(forward_selected_features) | set(backward_selected_features)
# )

# print("Combined selected features:", selected_features)
# models = [
#     ("Logistic Regression", LogisticRegression()),
#     ("Random Forest", RandomForestClassifier()),
# ]

# # Create RFE with GridSearchCV for each model
# selected_features_per_model = {}
# for name, model in models:
#     rfe_selector = RFECV(model, cv=5, scoring="f1")
#     param_grid = {"cv": [3, 5, 10], "step": [1, 2]}
#     grid_search = GridSearchCV(rfe_selector, param_grid, cv=5, n_jobs=-1)
#     grid_search.fit(X_train, y_train)
#     best_rfe = grid_search.best_estimator_
#     selected_features_per_model[name] = X.columns[best_rfe.support_]

# # Print selected features for each model
# for name, selected_features in selected_features_per_model.items():
#     print(f"Selected features for {name} using optimized RFECV:", selected_features)

# # Create the Random Forest estimator used by RFECV below
# estimator = RandomForestClassifier()

# # Create RFECV with estimator
# selector = RFECV(
#     estimator=estimator, cv=5, min_features_to_select=5, n_jobs=-1, scoring="accuracy"
# )  # Adjust the minimum number of features as needed
# selector.fit(X_train, y_train)

# # Get the selected features
# selected_features = X_train.columns[selector.support_]

# print("Selected features:", selected_features)
# print("Size of selected features:", len(selected_features))

In [15]:
# Train the model
# Baseline logistic regression on all features; max_iter is raised to 10000
# and the seed is fixed at 42.
model = LogisticRegression(max_iter=10000, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred = model.predict(X_test)

# Classification report (per-class precision, recall, f1-score and support)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.79      0.80      0.80     33403
         1.0       0.80      0.79      0.79     33439

    accuracy                           0.80     66842
   macro avg       0.80      0.80      0.80     66842
weighted avg       0.80      0.80      0.80     66842



In [16]:
# warnings.filterwarnings("ignore", category=FitFailedWarning)

# # Hyperparameter tuning
# param_dist = {
#     "poly__degree": [1],
#     "logistic__penalty": ["l1", "l2", "elasticnet", None],
#     "logistic__C": np.logspace(-4, 4, 20),
#     "logistic__solver": ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
#     "logistic__max_iter": [1000, 2500, 5000, 10000],
# }

# # Create pipeline
# pipeline = Pipeline(
#     [
#         ("poly", PolynomialFeatures()),
#         ("logistic", LogisticRegression()),
#     ]
# )

# # Random search
# random_search = RandomizedSearchCV(
#     pipeline,
#     param_distributions=param_dist,
#     n_iter=100,
#     cv=5,
#     scoring="accuracy",
#     n_jobs=6,
# )
# random_search.fit(X_train[selected_features], y_train)

# print("Best parameters:", random_search.best_params_)

# # Evaluate model with best parameters
# best_pipeline = random_search.best_estimator_

# # Show classification report
# y_pred = best_pipeline.predict(X_test[selected_features])

# print("Classification Report:")
# print(classification_report(y_test, y_pred))