In [10]:
from __future__ import annotations
from pathlib import Path
from typing import Dict, List, Tuple
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, RobustScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin

pd.set_option('display.max_columns', None)
sns.set_theme(style='whitegrid')

def find_project_root() -> Path:
    current = Path.cwd().resolve()
    for candidate in [current, *current.parents]:
        if (candidate / "Data").exists():
            return candidate
    raise RuntimeError("Nevar atrast projektu ar mapi 'Data'")

REPO_ROOT = find_project_root()
DATA_PATH = Path("/Users/enijabrakse/datizrace-proj-1/Data/03 smoking and drinking/smoking_driking_dataset_Ver01.csv")
TARGET = 'SMK_stat_type_cd'



In [11]:
df = pd.read_csv(DATA_PATH)

df[TARGET] = df[TARGET].astype(int).astype(str)
df["DRK_YN"] = df["DRK_YN"].astype(str)
df['urine_protein'] = df['urine_protein'].astype(int).astype(str)
df['hear_left'] = df['hear_left'].astype(int).astype(str)
df['hear_right'] = df['hear_right'].astype(int).astype(str)
df["sex"] = df["sex"].astype(str)

X = df.drop(columns=[TARGET])
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_train.shape, X_test.shape

((693942, 23), (297404, 23))

In [12]:
numeric_cols = [col for col in X_train.select_dtypes(include=['int64', 'float64']).columns
                if col not in {'hear_left', 'hear_right', 'urine_protein', 'DRK_YN'}]
skewed_cols = ['triglyceride', 'waistline', 'HDL_chole', 'LDL_chole', 'SGOT_AST', 'SGOT_ALT']
pure_numeric = [col for col in numeric_cols if col not in skewed_cols]
categorical_cols = ['sex', 'DRK_YN', 'urine_protein', 'hear_left', 'hear_right']

(pure_numeric, skewed_cols, categorical_cols)

(['age',
  'height',
  'weight',
  'sight_left',
  'sight_right',
  'SBP',
  'DBP',
  'BLDS',
  'tot_chole',
  'hemoglobin',
  'serum_creatinine',
  'gamma_GTP'],
 ['triglyceride',
  'waistline',
  'HDL_chole',
  'LDL_chole',
  'SGOT_AST',
  'SGOT_ALT'],
 ['sex', 'DRK_YN', 'urine_protein', 'hear_left', 'hear_right'])

In [13]:
class QuantileClipper(BaseEstimator, TransformerMixin):
    def __init__(self, lower=0.005, upper=0.995):
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        X_np = np.asarray(X, dtype=float)
        self.lower_bounds_ = np.nanquantile(X_np, self.lower, axis=0)
        self.upper_bounds_ = np.nanquantile(X_np, self.upper, axis=0)
        return self

    def transform(self, X):
        X_np = np.asarray(X, dtype=float)
        return np.clip(X_np, self.lower_bounds_, self.upper_bounds_)

In [14]:
numeric_pipeline = Pipeline([
    ('clip', QuantileClipper()),
    ('scale', RobustScaler())
])

skew_pipeline = Pipeline([
    ('clip', QuantileClipper()),
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scale', RobustScaler())
])

categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, pure_numeric),
    ('skew', skew_pipeline, skewed_cols),
    ('cat', categorical_pipeline, categorical_cols)
])
preprocessor

0,1,2
,transformers,"[('num', ...), ('skew', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,lower,0.005
,upper,0.995

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,lower,0.005
,upper,0.995

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [15]:
models = {
    'logreg': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'rf': RandomForestClassifier(random_state=42),
    'gb': GradientBoostingClassifier(random_state=42)
}

param_grid = {
    'logreg': {
        'model__C': [0.1, 1.0],
        'model__solver': ['lbfgs', 'saga']
    },
    'rf': {
        'model__n_estimators': [200, 300],
        'model__max_depth': [15, 25]
    },
    'gb': {
        'model__learning_rate': [0.05, 0.1],
        'model__n_estimators': [150, 250]
    }
}
models

{'logreg': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'gb': GradientBoostingClassifier(random_state=42)}

In [17]:
results = []
for name, model in models.items():
    pipeline = Pipeline([('preprocess', preprocessor), ('model', model)])
    grid = GridSearchCV(
        pipeline,
        param_grid[name],
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    y_proba = grid.predict_proba(X_test)
    class_order = grid.best_estimator_.named_steps["model"].classes_
    class_to_index = {label: idx for idx, label in enumerate(class_order)}
    y_test_encoded = y_test.map(class_to_index).to_numpy(dtype=int)
    metrics_dict = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred, average="macro"),
        "roc_auc": roc_auc_score(y_test_encoded, y_proba, multi_class="ovr"),
    }
    results.append((name, metrics_dict, grid))
    print(f"Model {name}: {metrics_dict}")
results


Model logreg: {'accuracy': 0.6786290702209788, 'f1': 0.6142378562000202, 'roc_auc': 0.8399509658136615}
Model rf: {'accuracy': 0.7053166736156877, 'f1': 0.591042869746757, 'roc_auc': 0.8506854943614911}
Model gb: {'accuracy': 0.7082924237737219, 'f1': 0.6048225030117678, 'roc_auc': 0.8529281629762937}


[('logreg',
  {'accuracy': 0.6786290702209788,
   'f1': 0.6142378562000202,
   'roc_auc': 0.8399509658136615},
  GridSearchCV(cv=3,
               estimator=Pipeline(steps=[('preprocess',
                                          ColumnTransformer(transformers=[('num',
                                                                           Pipeline(steps=[('clip',
                                                                                            QuantileClipper()),
                                                                                           ('scale',
                                                                                            RobustScaler())]),
                                                                           ['age',
                                                                            'height',
                                                                            'weight',
                                                   

In [18]:
results_sorted = sorted(results, key=lambda x: x[1]['accuracy'], reverse=True)
best_name, best_metrics, best_grid = results_sorted[0]
print("Best model:", best_name, best_metrics)

Best model: gb {'accuracy': 0.7082924237737219, 'f1': 0.6048225030117678, 'roc_auc': 0.8529281629762937}
