In [4]:
import numpy as np
import pandas as pd
from catboost import CatBoost, Pool , CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Reading data

In [14]:
# Load the train and test splits from the local Data directory (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./Data/train.csv', './Data/test.csv')
)

In [15]:
# Summary statistics of the training frame as it stands when this cell runs.
# `display` is used because a bare expression is only rendered when it is the
# last statement of a cell; previously this result was computed and discarded.
display(train_data.describe())

# Columns removed from both frames before modelling. Kept in one place so the
# train and test frames are always stripped of exactly the same columns.
DROPPED_COLUMNS = ["ADM_RNO1", "Province", "Health_region_ grouped", "Edu_level", "Aboriginal_identity"]

# Re-assign instead of mutating in place, and tolerate already-missing columns,
# so re-running this cell does not raise KeyError once the columns are gone.
train_data = train_data.drop(columns=DROPPED_COLUMNS, errors="ignore")
test_data = test_data.drop(columns=DROPPED_COLUMNS, errors="ignore")

# Preprocess

In [16]:
# Names of the test-set columns that contain at least one missing value.
test_data.isna().any().loc[lambda has_missing: has_missing].index.tolist()

['Smoked']

In [17]:
# Missing-value handling.
# Training rows containing any missing value are discarded.
train_data = train_data.dropna()

# Test rows are kept, so the missing 'Smoked' values are imputed with the
# median of that column. Only 'Smoked' is filled: a frame-wide fillna would
# push the 'Smoked' median into any other column that happened to contain NaN.
# NOTE(review): values such as 996 and 999 occur in this column; they look like
# survey sentinel codes rather than real counts - confirm, since they would
# skew the median used here.
ss = test_data['Smoked'].median()
test_data['Smoked'] = test_data['Smoked'].fillna(ss)
test_data['Smoked'].unique()

array([996.,  15.,  12.,  25., 999.,   5.,   1.,   2.,  20.,   4.,   3.,
         8.,   6.,  30.,  17.,   9.,   7.,  11.,  13.,  10.,  18.,  16.,
        50.,  37.,  40.,  27.,  14.,  45.,  35.,  22.,  23.,  24.,  36.])

In [18]:
# Per-column missing-value counts, displayed as a (train, test) pair.
tuple(frame.isnull().sum() for frame in (train_data, test_data))

(Gender                        0
 Marital_status                0
 Household                     0
 Age                           0
 Worked_job_business           0
 Gen_health_state              0
 Life_satisfaction             0
 Mental_health_state           0
 Stress_level                  0
 Work_stress                   0
 Sense_belonging               0
 Weight_state                  0
 BMI_12_17                     0
 BMI_18_above                  0
 Sleep_apnea                   0
 High_BP                       0
 High_cholestrol               0
 Diabetic                      0
 Fatigue_syndrome              0
 Mood_disorder                 0
 Anxiety_disorder              0
 Respiratory_chronic_con       0
 Musculoskeletal_con           0
 Cardiovascular_con            0
 Health_utility_indx           0
 Pain_status                   0
 Act_improve_health            0
 Fruit_veg_con                 0
 Smoked                        0
 Tobaco_use                    0
 weekly_al

In [19]:
# Separate the 'Diabetic' target from the predictor columns.
y = train_data["Diabetic"].to_numpy()
x = train_data.drop(columns='Diabetic')

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the same
# transformation to the training, validation and test frames.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, test_data)
)

# Model Config 

In [23]:
# Candidate classifiers with hand-picked hyper-parameters.
xgb_model = XGBClassifier(subsample=1.0, reg_lambda=0.4, reg_alpha=0.1, n_estimators=300, min_child_weight=5, max_depth=3, learning_rate=0.1, gamma=0.4,
                          colsample_bytree=0.9, random_state=42, eval_metric='logloss')

rf_model = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_leaf_nodes=20, max_features=None, max_depth=6, criterion='gini',
                                  class_weight=None, bootstrap=True, random_state=42)
log_model = LogisticRegression(max_iter = 1000)

cat_model = CatBoostClassifier(iterations=1000,learning_rate=0.03,depth=6,loss_function='Logloss',eval_metric='AUC' , silent=True)

# XGBoost is fitted on labels shifted down by one; its predictions are shifted
# back up before scoring. Presumably the target is coded from 1 upward while
# XGBoost expects class labels starting at 0 - confirm against the data.
xgb_model.fit(X_train_scaled, y_train-1)
rf_model.fit(X_train_scaled, y_train)
log_model.fit(X_train_scaled, y_train)
cat_model.fit(X_train_scaled, y_train)

# Predict inside this cell so the scores below describe the models fitted
# above rather than predictions left in the kernel by another cell.
y_pred = rf_model.predict(X_val_scaled)
y_pred_log = log_model.predict(X_val_scaled)
y_pred_xgb = xgb_model.predict(X_val_scaled)
y_pred_cat = cat_model.predict(X_val_scaled)

# Validation F1 as a percentage. Scaling before rounding avoids float
# artefacts such as 65.60000000000001 in the printed output.
score = round(f1_score(y_val, y_pred) * 100, 1)
score_log = round(f1_score(y_val, y_pred_log) * 100, 1)
score_xgb = round(f1_score(y_val, y_pred_xgb+1) * 100, 1)
score_cat = round(f1_score(y_val, y_pred_cat) * 100, 1)
print(score , score_log , score_xgb , score_cat)

64.7 61.6 65.60000000000001 64.8


In [21]:
def predict_validation(features):
    """Predict `features` with the four fitted models.

    Returns a tuple ``(rf, logistic, xgb, catboost)`` of predictions. The
    XGBoost predictions are returned exactly as the model emits them, i.e. in
    the shifted label space it was fitted on (``y_train - 1``); callers add 1
    before comparing against the true labels.
    """
    return (
        rf_model.predict(features),
        log_model.predict(features),
        xgb_model.predict(features),
        cat_model.predict(features),
    )


def f1_percent(y_true, predicted):
    """Return the F1 score as a percentage rounded to one decimal place.

    Scaling before rounding avoids float artefacts such as
    65.60000000000001 in the printed output.
    """
    return round(f1_score(y_true, predicted) * 100, 1)


# --- Baseline: models fitted on every feature ------------------------------
y_pred, y_pred_log, y_pred_xgb, y_pred_cat = predict_validation(X_val_scaled)

score = f1_percent(y_val, y_pred)
score_log = f1_percent(y_val, y_pred_log)
score_xgb = f1_percent(y_val, y_pred_xgb+1)
score_cat = f1_percent(y_val, y_pred_cat)
print(score , score_log , score_xgb , score_cat)


# --- Feature selection from the random-forest importances ------------------
importances = rf_model.feature_importances_
# The importances are labelled with the columns of `x`, so the forest must
# have been fitted on all of them. That stops being true once this cell has
# refitted the forest on the reduced feature set, so fail with a clear message.
assert len(importances) == x.shape[1], (
    "rf_model was not fitted on all columns of `x`; "
    "re-run the split/scale and model-config cells before this one"
)
feature_importances = pd.Series(importances, index=x.columns).sort_values(ascending=False)
top_n = 25  # number of highest-importance features kept for the reduced models
important_features = feature_importances.head(top_n).index


# --- Re-split and re-scale using only the selected features ----------------
# Same test_size and seed as the full-feature split.
X_reduced = x[important_features]
X_train, X_val, y_train, y_val = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


# --- Refit every model on the reduced feature set --------------------------
rf_model.fit(X_train_scaled, y_train)
log_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train-1)
cat_model.fit(X_train_scaled, y_train)


y_pred, y_pred_log, y_pred_xgb, y_pred_cat = predict_validation(X_val_scaled)

score = f1_percent(y_val, y_pred)
score_log = f1_percent(y_val, y_pred_log)
score_xgb = f1_percent(y_val, y_pred_xgb+1)
score_cat = f1_percent(y_val, y_pred_cat)
print(score , score_log , score_xgb , score_cat)


64.7 61.7 65.9 66.10000000000001
0:	total: 14.1ms	remaining: 14.1s
1:	total: 34.2ms	remaining: 17.1s
2:	total: 53.9ms	remaining: 17.9s
3:	total: 73ms	remaining: 18.2s
4:	total: 85.8ms	remaining: 17.1s
5:	total: 104ms	remaining: 17.3s
6:	total: 117ms	remaining: 16.6s
7:	total: 130ms	remaining: 16.1s
8:	total: 149ms	remaining: 16.4s
9:	total: 162ms	remaining: 16s
10:	total: 181ms	remaining: 16.3s
11:	total: 194ms	remaining: 15.9s
12:	total: 211ms	remaining: 16s
13:	total: 232ms	remaining: 16.3s
14:	total: 247ms	remaining: 16.3s
15:	total: 255ms	remaining: 15.7s
16:	total: 275ms	remaining: 15.9s
17:	total: 288ms	remaining: 15.7s
18:	total: 301ms	remaining: 15.5s
19:	total: 314ms	remaining: 15.4s
20:	total: 327ms	remaining: 15.2s
21:	total: 340ms	remaining: 15.1s
22:	total: 358ms	remaining: 15.2s
23:	total: 373ms	remaining: 15.2s
24:	total: 390ms	remaining: 15.2s
25:	total: 397ms	remaining: 14.9s
26:	total: 410ms	remaining: 14.8s
27:	total: 424ms	remaining: 14.7s
28:	total: 463ms	remaining

In [96]:
# Score the test set with the XGBoost model, restricted to the selected features
# and scaled with the scaler currently bound to `scaler`.
x_test = test_data.loc[:, important_features]
test_scaled = scaler.transform(x_test)

# Add one to the raw predictions, mirroring the `- 1` applied to the labels
# when the XGBoost model was fitted.
predict = 1 + xgb_model.predict(test_scaled)

# Single-column frame holding the predicted label for each test row.
submission = pd.DataFrame(data=predict, columns=['diabet'])
submission

Unnamed: 0,diabet
0,1
1,1
2,1
3,2
4,2
...,...
5793,1
5794,2
5795,2
5796,2
