# Smoker Status Prediction - Neural Network


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

sns.set_theme(style="whitegrid")
warnings.filterwarnings('ignore')

print("Libraries loaded successfully.")

Libraries loaded successfully.


## Loading and Preprocessing Data

In [3]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

if train_df.duplicated().sum() > 0:
    train_df = train_df.drop_duplicates()

skewed_cols = [
    'triglyceride', 'LDL', 'Gtp',
    'AST', 'ALT', 'serum creatinine',
    'fasting blood sugar'
]

for col in skewed_cols:
    train_df[col] = np.log1p(train_df[col])
    test_df[col] = np.log1p(test_df[col])

X = train_df.drop('smoking', axis=1)
y = train_df['smoking']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X.columns)
test_scaled = pd.DataFrame(test_scaled, columns=test_df.columns)

print("Preprocessing complete with RobustScaler.")


Train dataset shape: (38984, 23)
Test dataset shape: (16708, 22)
Preprocessing complete with RobustScaler.


In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

nn_model = MLPClassifier(
    hidden_layer_sizes=(128),   
    activation='logistic',          
    solver='adam',               
    learning_rate='constant',
    max_iter=100,               
    random_state=42
)

nn_model.fit(X_train_scaled, y_train)

y_pred_nn = nn_model.predict(X_val_scaled)

print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.75560
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      4242
           1       0.66      0.67      0.67      2452

    accuracy                           0.76      6694
   macro avg       0.74      0.74      0.74      6694
weighted avg       0.76      0.76      0.76      6694



i tried multiple max iterations and 100 gave me the best accuracy as it's more or less significant number of iterations, after some manual tunings i found out that adam and sgd gave the best solver's accuracy, so i though of running them both for better accuracy, also i saw that adam solver with logistic or sigmoid activation function was giving the best accuracy
adding more layers in Neural Network to increase the accuracy

In [9]:
nn_model = MLPClassifier(
    hidden_layer_sizes=(128,64,32),   
    activation='logistic',          
    solver='adam',               
    learning_rate='constant',
    max_iter=100,               
    random_state=42
)

nn_model.fit(X_train_scaled, y_train)

y_pred_nn = nn_model.predict(X_val_scaled)

print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.75426
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      4242
           1       0.65      0.71      0.68      2452

    accuracy                           0.75      6694
   macro avg       0.74      0.75      0.74      6694
weighted avg       0.76      0.75      0.76      6694



In [10]:
nn_model = MLPClassifier(
    hidden_layer_sizes=(128,64),   
    activation='logistic',          
    solver='adam',               
    learning_rate='constant',
    max_iter=100,               
    random_state=42
)

nn_model.fit(X_train_scaled, y_train)

y_pred_nn = nn_model.predict(X_val_scaled)

print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.75710
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      4242
           1       0.67      0.68      0.67      2452

    accuracy                           0.76      6694
   macro avg       0.74      0.74      0.74      6694
weighted avg       0.76      0.76      0.76      6694



we can see that increasing layers does not increase the accuracy everytime, as the dataset is small and not complex enough to require multiple layers and so deeper model overfit quickly.
Based on above observations, i'll do an optuna seatch in order to tune the best hyperparameters and to get the best accuracy.

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
nn_model = MLPClassifier(
    hidden_layer_sizes=(256,128),   
    activation='logistic',          
    solver='adam',               
    learning_rate='constant',
    max_iter=100,               
    random_state=42,
    alpha=1e-4
)

nn_model.fit(X_train_scaled, y_train)

y_pred_nn = nn_model.predict(X_val_scaled)

print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.75471
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      4242
           1       0.65      0.71      0.68      2452

    accuracy                           0.75      6694
   macro avg       0.74      0.75      0.74      6694
weighted avg       0.76      0.75      0.76      6694



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

train_df = pd.read_csv("train_dataset.csv")

train_df["BMI"] = train_df["weight(kg)"] / ((train_df["height(cm)"] / 100) ** 2)
train_df["chol_ratio"] = train_df["LDL"] / train_df["HDL"]
train_df["liver_ratio"] = train_df["Gtp"] / train_df["ALT"]
train_df["sugar_liver"] = train_df["fasting blood sugar"] / train_df["Gtp"]
train_df["age_group"] = pd.cut(train_df["age"], bins=[0,30,40,50,60,100], labels=[1,2,3,4,5])

X = train_df.drop("smoking", axis=1)
y = train_df["smoking"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    hidden_layer_sizes=(128,64),
    activation='logistic',
    solver='adam',
    learning_rate='constant',
    max_iter=1000,
    random_state=42
)

nn_model.fit(X_train_scaled, y_train)
pred = nn_model.predict(X_val_scaled)

print("Validation Accuracy:", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))


Validation Accuracy: 0.7463126843657817
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      4933
           1       0.65      0.66      0.66      2864

    accuracy                           0.75      7797
   macro avg       0.73      0.73      0.73      7797
weighted avg       0.75      0.75      0.75      7797



