In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Let wide frames show up to 50 columns before pandas truncates the display.
pd.options.display.max_columns = 50

In [6]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Load dataset

In [7]:
# Read the training table from the repository-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(n=5)

(159256, 24)


Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,87,94,172,300,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,83,147,194,55,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,75,79,178,197,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,88,91,180,203,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,76,91,155,87,44,93,15.4,1,0.8,19,13,17,0,1


In [8]:
# Every column except the row identifier and the target is treated as a feature.
features = df.columns.drop(['id', 'smoking'])
print(features)
# Number of feature columns.
print(len(features))

Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries'],
      dtype='object')
22


## Try models

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

# Separate the target from the predictors; `id` is dropped together with the label.
y = df['smoking']
X = df.drop(columns=['id', 'smoking'])

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Candidate classifiers, keyed by the label used in the results table.
# Only the logistic regression is wrapped in a pipeline with feature scaling.
models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression()),
    ]),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit each model and score it with ROC AUC on the hold-out split.
results = {}
for model_name, model in models.items():
    # Gradient Boosting is fitted on the first 10000 training rows only;
    # every other model is fitted on the full training split.
    # NOTE(review): presumably a runtime shortcut — confirm, since it makes the comparison uneven.
    fit_rows = slice(10000) if model_name == 'Gradient Boosting' else slice(None)
    model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

    # Column 1 of predict_proba is the probability of the second class (smoking == 1 here).
    y_pred = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    results[model_name] = round(auc, 4)

results

{'Logistic Regression': 0.8314,
 'Random Forest': 0.8549,
 'Gradient Boosting': 0.8524}

In [12]:
# Reference score per the linked Kaggle notebook; it is a hard-coded literal, not computed here:
# https://www.kaggle.com/code/arunklenin/ps3e24-smoking-cessation-prediction-binary/notebook
results.update({'Best possible result (Ensembles)': 0.8788})

In [13]:
# Display the current contents of the `results` score dictionary.
results

{'Logistic Regression': 0.8314,
 'Random Forest': 0.8549,
 'Gradient Boosting': 0.8524,
 'Best possible result (Ensembles)': 0.8788}

In [17]:
# Render the scores as a one-column table: one row per entry in `results`, column label 0.
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
Logistic Regression,0.8314
Random Forest,0.8549
Gradient Boosting,0.8524
Best possible result (Ensembles),0.8788
