# Smoker Status Prediction - Logistic Regression


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

sns.set_theme(style="whitegrid")
warnings.filterwarnings('ignore')

print("Libraries loaded successfully.")

Libraries loaded successfully.


## Loading and Preprocessing Data

In [3]:
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
if train_df.duplicated().sum() > 0:
    train_df = train_df.drop_duplicates()


X = train_df.drop('smoking', axis=1)
y = train_df['smoking']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X.columns)
test_scaled = pd.DataFrame(test_scaled, columns=test_df.columns)

print("Preprocessing complete.")

Train dataset shape: (38984, 23)
Test dataset shape: (16708, 22)
Preprocessing complete.


## Logistic Regression Model implementations



In [None]:
#standard LR model 
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)

y_pred = lr_model.predict(X_val_scaled)

accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.5f}")

print(classification_report(y_val, y_pred))

Validation Accuracy: 0.71826
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=100,
    random_state=42
)

lr_model.fit(X_train_scaled, y_train)

y_pred = lr_model.predict(X_val_scaled)

print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.5f}")
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.71826
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=100,
    random_state=42
)

lr_model.fit(X_train_scaled, y_train)

y_pred = lr_model.predict(X_val_scaled)

print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.5f}")
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.71826
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

penalties = ["l1", "l2"]

for p in penalties:
    print(f"\n----- Testing penalty = {p} (liblinear solver) -----")

    lr_model = LogisticRegression(
        penalty=p,
        solver='liblinear',
        C=1.0,              
        max_iter=100,
        random_state=42
    )
    
    lr_model.fit(X_train_scaled, y_train)
    y_pred = lr_model.predict(X_val_scaled)

    print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.5f}")
    print(classification_report(y_val, y_pred))



----- Testing penalty = l1 (liblinear solver) -----
Validation Accuracy: 0.71840
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694


----- Testing penalty = l2 (liblinear solver) -----
Validation Accuracy: 0.71840
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



##  Hyperparameter Tuning for Logistic Regression
we saw that solver liblinear and lbfgs gives good accuracy from previous analysis, so now we'll be doing hyperparameter tuning using GridSearchCV and Optuna to get the optimal accuracies and parameter set.

In [6]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 200, 500]
}

grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.5f}")

best_lr = grid_search.best_estimator_
y_pred_tuned = best_lr.predict(X_val_scaled)
print(f"Tuned Validation Accuracy: {accuracy_score(y_val, y_pred_tuned):.5f}")
print(classification_report(y_val, y_pred_tuned))

Best Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'}
Best Cross-Validation Score: 0.72383
Tuned Validation Accuracy: 0.71855
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



more separated values 

In [7]:
param_grid = {
    'C': [0.005, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 200, 300, 500]
}

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.5f}")

best_lr = grid_search.best_estimator_

from sklearn.metrics import accuracy_score, classification_report

y_pred_tuned = best_lr.predict(X_val_scaled)

print(f"Tuned Validation Accuracy: {accuracy_score(y_val, y_pred_tuned):.5f}")
print(classification_report(y_val, y_pred_tuned))


Best Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.72383
Tuned Validation Accuracy: 0.71855
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



a centered GridSearchCV based on the optimal parameters found from last 2 GridSearchCV searches to go towards maximum optimality

In [8]:
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'max_iter': [100, 200, 500]
    },
    
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'max_iter': [100, 200, 500]
    }
]

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation Score: {grid_search.best_score_:.5f}")

best_lr = grid_search.best_estimator_

y_pred_tuned = best_lr.predict(X_val_scaled)

from sklearn.metrics import accuracy_score, classification_report
print(f"Tuned Validation Accuracy: {accuracy_score(y_val, y_pred_tuned):.5f}")
print(classification_report(y_val, y_pred_tuned))


Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.72383
Tuned Validation Accuracy: 0.71855
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



adding parameter class_weight since non-smokers dominate the smokers so giving more weightage to smokers can lead to good results(a hunch)

In [9]:
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 500]
    },
    
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 500]
    }
]

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.5f}")

best_lr = grid_search.best_estimator_

from sklearn.metrics import accuracy_score, classification_report

y_pred_tuned = best_lr.predict(X_val_scaled)
print(f"Tuned Validation Accuracy: {accuracy_score(y_val, y_pred_tuned):.5f}")
print(classification_report(y_val, y_pred_tuned))


Best Parameters: {'C': 0.1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.72383
Tuned Validation Accuracy: 0.71855
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



introducing tol(tolerance for stopping criteria) for faster convergence and moving to the optimal parameter early

In [19]:
param_grid = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 500],
        'tol': [1e-4, 1e-3, 1e-2]
    },
    
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [100, 200, 500],
        'tol': [1e-4, 1e-3, 1e-2]
    }
]

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.5f}")

best_lr = grid_search.best_estimator_

from sklearn.metrics import accuracy_score, classification_report

y_pred_tuned = best_lr.predict(X_val_scaled)

print(f"Tuned Validation Accuracy: {accuracy_score(y_val, y_pred_tuned):.5f}")
print(classification_report(y_val, y_pred_tuned))


Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best Parameters: {'C': 1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}
Best Cross-Validation Accuracy: 0.72409
Tuned Validation Accuracy: 0.71826
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      4242
           1       0.63      0.56      0.59      2452

    accuracy                           0.72      6694
   macro avg       0.70      0.68      0.69      6694
weighted avg       0.71      0.72      0.71      6694



after several tries with no major improvement, based on above search results, i did another GridSearchCV search to find the best parameters for the model, also i included the logarithmic parameter search range for parameter C to ensure the best possible value for it

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'C': np.logspace(-5, 3, 15),  
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [200, 500],
        'tol': [1e-4, 1e-3]
    },
    {
        'C': np.logspace(-5, 3, 15),
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [200, 500],
        'tol': [1e-4, 1e-3]
    }
]

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best Parameters: {'C': np.float64(0.3727593720314938), 'class_weight': None, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}
Best CV Accuracy: 0.7241246402682602


significant improvement, so i did another search with more wide range

In [10]:
param_grid = [
    {
        'C': np.logspace(-6, 4, 25),
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [200, 500, 800],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2]
    },
    {
        'C': np.logspace(-6, 4, 25),
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'class_weight': [None, 'balanced'],
        'max_iter': [200, 500, 800],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2]
    }
]

grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


Exception ignored in: <function ResourceTracker.__del__ at 0x731a70d86020>
Traceback (most recent call last):
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7c14f2d86020>
Traceback (most recent call last):
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ 

Best Parameters: {'C': np.float64(0.26101572156825387), 'class_weight': None, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.001}
Best CV Accuracy: 0.7241619885409025


since GridSearchCV takes a lot of training time, it is insignificant to try again and again, so i went for the optuna search whose parameter range searches were inspired from the previous GridSearchCV searches

In [11]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [12]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

def objective(trial):
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    if solver == "lbfgs":
        penalty = "l2"
    else:
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    
    C = trial.suggest_float("C", 1e-6, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    max_iter = trial.suggest_categorical("max_iter", [200, 500, 800])
    tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)
    
    model = LogisticRegression(
        C=C,
        solver=solver,
        penalty=penalty,
        class_weight=class_weight,
        max_iter=max_iter,
        tol=tol,
        random_state=42
    )
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1
    )
    
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150, show_progress_bar=True)

print("Best Parameters:", study.best_params)
print("Best CV Accuracy:", study.best_value)

best_params = study.best_params

final_lr = LogisticRegression(
    C=best_params["C"],
    solver=best_params["solver"],
    penalty=best_params.get("penalty", "l2"),
    class_weight=best_params["class_weight"],
    max_iter=best_params["max_iter"],
    tol=best_params["tol"],
    random_state=42
)

final_lr.fit(X_train_scaled, y_train)

from sklearn.metrics import accuracy_score, classification_report

y_pred = final_lr.predict(X_val_scaled)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


[I 2025-12-08 14:28:46,336] A new study created in memory with name: no-name-3bd037c5-386d-4a77-a0af-fbd2bbdd6365


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-12-08 14:28:48,028] Trial 0 finished with value: 0.6337354794987351 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 1.2231324526805997e-05, 'class_weight': 'balanced', 'max_iter': 800, 'tol': 0.001811386042496901}. Best is trial 0 with value: 0.6337354794987351.
[I 2025-12-08 14:28:49,254] Trial 1 finished with value: 0.7214358366930386 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.3955743168180872, 'class_weight': 'balanced', 'max_iter': 500, 'tol': 0.0008528845744752513}. Best is trial 1 with value: 0.7214358366930386.
[I 2025-12-08 14:28:49,460] Trial 2 finished with value: 0.6337354794987351 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 2.9043992669400886e-06, 'class_weight': 'balanced', 'max_iter': 200, 'tol': 0.0037562999819829115}. Best is trial 1 with value: 0.7214358366930386.
[I 2025-12-08 14:28:49,655] Trial 3 finished with value: 0.7131067882264349 and parameters: {'solver': 'lbfgs', 'C': 0.0005272977270803079, 'class_

Exception ignored in: <function ResourceTracker.__del__ at 0x7e8b01782020>
Traceback (most recent call last):
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7a6b0a38e020>
Traceback (most recent call last):
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/iiitb/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ 

no major increment in accuracy lead me to rethink about everything and i did some analysis of my EDA and found out that many of the features in smoker dataset are right skewed, so i applied log transformation to them in order to make them more normally distributed 

## handling outliers using log transformation of skewed features

In [13]:
import numpy as np

skewed_cols = [
    'triglyceride', 'LDL', 'Gtp',
    'AST', 'ALT', 'serum creatinine',
    'fasting blood sugar'
]

for col in skewed_cols:
    X_train[col] = np.log1p(X_train[col])
    X_val[col] = np.log1p(X_val[col])


In [14]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def objective(trial):
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    if solver == "lbfgs":
        penalty = "l2"
    else:
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    
    C = trial.suggest_float("C", 1e-6, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    max_iter = trial.suggest_categorical("max_iter", [200, 500, 800])
    tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)
    
    model = LogisticRegression(
        C=C,
        solver=solver,
        penalty=penalty,
        class_weight=class_weight,
        max_iter=max_iter,
        tol=tol,
        random_state=42
    )
    
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', model)
    ])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=1
    )
    
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150, show_progress_bar=True)

print("Best Parameters:", study.best_params)
print("Best CV Accuracy:", study.best_value)


[I 2025-12-08 14:34:48,672] A new study created in memory with name: no-name-53f9f37b-2193-4ff5-b5d4-02a9e523797b


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-12-08 14:34:49,496] Trial 0 finished with value: 0.7375715809920033 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 18.070665903501506, 'class_weight': None, 'max_iter': 800, 'tol': 0.0003395206591377154}. Best is trial 0 with value: 0.7375715809920033.
[I 2025-12-08 14:34:49,722] Trial 1 finished with value: 0.6337354794987351 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 1.0166563368946883e-05, 'class_weight': None, 'max_iter': 800, 'tol': 4.02406158668248e-05}. Best is trial 0 with value: 0.7375715809920033.
[I 2025-12-08 14:34:50,368] Trial 2 finished with value: 0.7377583293309854 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.10434835235730122, 'class_weight': None, 'max_iter': 200, 'tol': 0.0014925931866630819}. Best is trial 2 with value: 0.7377583293309854.
[I 2025-12-08 14:34:50,736] Trial 3 finished with value: 0.7314460108536005 and parameters: {'solver': 'lbfgs', 'C': 0.750360544024799, 'class_weight': 'balanced', 'max

In [None]:
best_params = study.best_params

final_lr = LogisticRegression(
    C=best_params["C"],
    solver=best_params["solver"],
    penalty=best_params.get("penalty", "l2"),
    class_weight=best_params["class_weight"],
    max_iter=best_params["max_iter"],
    tol=best_params["tol"],
    random_state=42
)

final_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', final_lr)
])

final_pipe.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report

y_pred = final_pipe.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.7327457424559307
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      4242
           1       0.65      0.59      0.62      2452

    accuracy                           0.73      6694
   macro avg       0.71      0.70      0.71      6694
weighted avg       0.73      0.73      0.73      6694



## handling outliers using log transformation of skewed features, also using robust scaling 
since the smoker dataset has significant amount of outliers, and standard scaling is based on mean and stand deviation to which outliers are highly sensitive, so i though of using robustscaler which is based on median and can help us here

In [23]:
import numpy as np

skewed_cols = [
    'triglyceride', 'LDL', 'Gtp',
    'AST', 'ALT', 'serum creatinine',
    'fasting blood sugar'
]

for col in skewed_cols:
    X_train[col] = np.log1p(X_train[col])
    X_val[col] = np.log1p(X_val[col])


In [24]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

def objective(trial):
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    if solver == "lbfgs":
        penalty = "l2"
    else:
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    
    C = trial.suggest_float("C", 1e-6, 1e4, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    max_iter = trial.suggest_categorical("max_iter", [200, 500, 800])
    tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)
    
    model = LogisticRegression(
        C=C,
        solver=solver,
        penalty=penalty,
        class_weight=class_weight,
        max_iter=max_iter,
        tol=tol,
        random_state=42
    )
    
    pipe = Pipeline([
        ('scaler', RobustScaler()),
        ('clf', model)
    ])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=1
    )
    
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150, show_progress_bar=True)

print("Best Parameters:", study.best_params)
print("Best CV Accuracy:", study.best_value)


[I 2025-12-08 15:17:24,004] A new study created in memory with name: no-name-82b5c040-c47f-427b-95b0-e3b0049b3e28


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-12-08 15:17:24,871] Trial 0 finished with value: 0.7374595152467661 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 266.7161096965639, 'class_weight': None, 'max_iter': 800, 'tol': 1.3532206021384135e-05}. Best is trial 0 with value: 0.7374595152467661.
[I 2025-12-08 15:17:25,134] Trial 1 finished with value: 0.7239760075366218 and parameters: {'solver': 'lbfgs', 'C': 0.0013877764505842046, 'class_weight': 'balanced', 'max_iter': 800, 'tol': 7.142755220737724e-05}. Best is trial 0 with value: 0.7374595152467661.
[I 2025-12-08 15:17:25,430] Trial 2 finished with value: 0.7004074756536907 and parameters: {'solver': 'lbfgs', 'C': 0.00024322293970936113, 'class_weight': None, 'max_iter': 500, 'tol': 0.006516093593567409}. Best is trial 0 with value: 0.7374595152467661.
[I 2025-12-08 15:17:25,743] Trial 3 finished with value: 0.7313338823264333 and parameters: {'solver': 'lbfgs', 'C': 0.034573628525443295, 'class_weight': 'balanced', 'max_iter': 500, 'tol': 1.140151284

In [25]:
best_params = study.best_params

final_lr = LogisticRegression(
    C=best_params["C"],
    solver=best_params["solver"],
    penalty=best_params.get("penalty", "l2"),
    class_weight=best_params["class_weight"],
    max_iter=best_params["max_iter"],
    tol=best_params["tol"],
    random_state=42
)

final_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('clf', final_lr)
])

final_pipe.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report

y_pred = final_pipe.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.7349865551239916
              precision    recall  f1-score   support

           0       0.78      0.81      0.80      4242
           1       0.65      0.60      0.62      2452

    accuracy                           0.73      6694
   macro avg       0.71      0.71      0.71      6694
weighted avg       0.73      0.73      0.73      6694



### extensive optuna search with ranges based on above results 

In [26]:
def objective(trial):
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
    if solver == "lbfgs":
        penalty = "l2"
        dual = False
    else:
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        dual = trial.suggest_categorical("dual", [False])  # liblinear safe
    

    C = trial.suggest_float("C", 1e-8, 1e6, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    max_iter = trial.suggest_categorical("max_iter", [100, 200, 400, 800, 1500])
    tol = trial.suggest_float("tol", 1e-6, 1e-1, log=True)
    fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])
    
    model = LogisticRegression(
        C=C,
        solver=solver,
        penalty=penalty,
        dual=dual,
        class_weight=class_weight,
        max_iter=max_iter,
        tol=tol,
        fit_intercept=fit_intercept,
        random_state=42
    )
    
    pipe = Pipeline([
        ("scaler", RobustScaler()),
        ("clf", model)
    ])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=1
    )
    
    return scores.mean()

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=300, show_progress_bar=True)

print("Best Parameters:", study.best_params)
print("Best CV Accuracy:", study.best_value)

[I 2025-12-08 15:27:05,623] A new study created in memory with name: no-name-d20dc2a8-87bf-4214-a938-eac92d7b29a7


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-12-08 15:27:05,960] Trial 0 finished with value: 0.6337354794987351 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'dual': False, 'C': 1.5284842437384096e-06, 'class_weight': None, 'max_iter': 1500, 'tol': 0.014528246637516038, 'fit_intercept': True}. Best is trial 0 with value: 0.6337354794987351.
[I 2025-12-08 15:27:06,231] Trial 1 finished with value: 0.6337354794987351 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'dual': False, 'C': 0.0001194559984944381, 'class_weight': None, 'max_iter': 800, 'tol': 0.0003725393839578887, 'fit_intercept': True}. Best is trial 0 with value: 0.6337354794987351.
[I 2025-12-08 15:27:06,467] Trial 2 finished with value: 0.6936839704129689 and parameters: {'solver': 'lbfgs', 'C': 8.141835092682802e-08, 'class_weight': 'balanced', 'max_iter': 100, 'tol': 4.075596440072871e-06, 'fit_intercept': True}. Best is trial 2 with value: 0.6936839704129689.
[I 2025-12-08 15:27:06,707] Trial 3 finished with value: 0.7200912151686725 an

In [None]:
best_params = study.best_params

final_lr = LogisticRegression(
    C=best_params["C"],
    solver=best_params["solver"],
    penalty=best_params.get("penalty", "l2"),
    dual=best_params.get("dual", False),
    class_weight=best_params["class_weight"],
    max_iter=best_params["max_iter"],
    tol=best_params["tol"],
    fit_intercept=best_params["fit_intercept"],
    random_state=42
)

final_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("clf", final_lr)
])

final_pipe.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report

y_pred = final_pipe.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.7337914550343592
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      4242
           1       0.65      0.60      0.62      2452

    accuracy                           0.73      6694
   macro avg       0.71      0.71      0.71      6694
weighted avg       0.73      0.73      0.73      6694



diminishing results show that i've reached the threshold using Logistic Regression model