# Smoker Status Prediction - Support Vector Machine (SVM)


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Apply seaborn's "whitegrid" theme; set_theme changes global plotting
# defaults, so it affects every figure drawn after this cell.
sns.set_theme(style="whitegrid")
# NOTE(review): with no category/module argument this installs an "ignore"
# filter for *all* warnings, which also hides pandas and scikit-learn
# diagnostics in later cells. Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Visual confirmation that the import/setup cell ran to completion.
print("Libraries loaded successfully.")

Libraries loaded successfully.


## Loading and Preprocessing Data

In [6]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# --- Load --------------------------------------------------------------------
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Shapes are reported for the raw files, i.e. before de-duplication.
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

# --- Clean -------------------------------------------------------------------
# drop_duplicates() is already a no-op when nothing is duplicated, so no guard
# is needed. The explicit .copy() gives an independent frame, so the column
# assignment below never writes into a frame pandas considers derived from
# another one (the source of SettingWithCopyWarning).
train_df = train_df.drop_duplicates().copy()

# Columns that get a log transform before scaling.
# NOTE(review): the list name implies these are right-skewed; the skew itself
# is not checked in this cell.
skewed_cols = [
    'triglyceride', 'LDL', 'Gtp',
    'AST', 'ALT', 'serum creatinine',
    'fasting blood sugar'
]

# log1p(x) = log(1 + x). It is stateless, so applying it to both splits before
# the train/validation split does not leak information.
train_df[skewed_cols] = np.log1p(train_df[skewed_cols])
test_df[skewed_cols] = np.log1p(test_df[skewed_cols])

# --- Split -------------------------------------------------------------------
X = train_df.drop('smoking', axis=1)
y = train_df['smoking']

# 80/20 hold-out, stratified on the target so both parts keep its class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale -------------------------------------------------------------------
# The scaler is fitted on the training part only; validation and test data are
# transformed with those training statistics.
scaler = RobustScaler()

# Re-wrap the returned arrays as DataFrames and carry over the original row
# index. Without `index=` the frames would get a fresh RangeIndex while
# y_train / y_val keep their shuffled labels, so any later label-aligned pandas
# operation (concat(axis=1), boolean masks built from y) would mismatch rows.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val), columns=X.columns, index=X_val.index
)

# Select the test columns by name, in training order, so the scaler always sees
# the features in the order it was fitted on; a missing feature fails loudly
# with a KeyError.
# NOTE(review): assumes the test file holds exactly the training features minus
# the target (the printed shapes, 22 vs 23 columns, are consistent with that).
test_scaled = pd.DataFrame(
    scaler.transform(test_df[X.columns]), columns=X.columns, index=test_df.index
)

print("Preprocessing complete with RobustScaler.")


Train dataset shape: (38984, 23)
Test dataset shape: (16708, 22)
Preprocessing complete with RobustScaler.


### Basic SVM Model

In [7]:
# Baseline: an SVC with library-default hyperparameters, trained on the scaled
# training features. fit() returns the estimator itself, so the call can be
# chained onto the constructor.
svm_model = SVC(random_state=42).fit(X_train_scaled, y_train)

# Score the baseline on the held-out validation split.
y_pred = svm_model.predict(X_val_scaled)

# Headline accuracy first, then the per-class precision / recall / F1 table.
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred):.5f}")
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.74335
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      4242
           1       0.65      0.65      0.65      2452

    accuracy                           0.74      6694
   macro avg       0.72      0.72      0.72      6694
weighted avg       0.74      0.74      0.74      6694

