# TODO
## Feature engineering 
### Add a new feature based on BMI using following data sourced from CDC website
* If your BMI is less than 18.5, it falls within the underweight range.
* If your BMI is 18.5 to <25, it falls within the healthy weight range.
* If your BMI is 25.0 to <30, it falls within the overweight range.
* If your BMI is 30.0 or higher, it falls within the obesity range.

Obesity types:

Obesity is frequently subdivided into categories:
* Class 1: BMI of 30 to < 35
* Class 2: BMI of 35 to < 40
* Class 3: BMI of 40 or higher. Class 3 obesity is sometimes categorized as “severe” obesity.

### New feature based on average glucose level, which will indicate diabetes
using the chart at https://www.medicalnewstoday.com/articles/a1c-chart-diabetes-numbers#how-it-works
or the one at https://www.cdc.gov/diabetes/basics/getting-tested.html
The two sources give different ranges, so we'll try both.

### Let's try dropping the avg_glucose_level and bmi columns after we add the new features to see what it does

In [None]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Install a blanket "ignore" filter so no warnings show up in later cell output.
# NOTE: this also hides deprecation warnings from pandas / xgboost / sklearn.
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition CSV files.
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

# Read both splits the same way, discarding the "id" column from each.
train, test = (
    pd.read_csv(BASE_DIR / csv_name).drop(columns="id")
    for csv_name in ("train.csv", "test.csv")
)
train.head()

In [None]:
# Stack the train features (target column removed) on top of the test rows,
# renumbering the rows 0..n-1 in the combined frame.
df = pd.concat([train.drop(columns=["stroke"]), test], axis=0, ignore_index=True)
df

# Utils

In [None]:
def plot_feature_importances(cols, feat_imps):
    """Draw a bar chart of feature importances, sorted from largest to smallest.

    Args:
        cols: Feature names, in the same order as ``feat_imps``.
        feat_imps: Importance scores, one per feature.
    """
    # Scale the figure height with the number of features so every bar stays legible.
    plt.figure(figsize=(15, 0.35 * len(feat_imps)))

    ranked = pd.DataFrame(
        data=zip(cols, feat_imps),
        columns=["feature", "importance"],
    )
    ranked = ranked.sort_values(by="importance", ascending=False)

    plt.title('Feature importances', size=25, y=1.05)
    sns.barplot(data=ranked, x='importance', y='feature')
    plt.show()

# Some feature engineering :D 

Before we do that, let's train an xgboost model and see how it performs, then add features and see if they improve anything!

# Preprocessing

In [None]:
# One-hot encode the categorical (non-numeric) columns of the combined frame.
df = pd.get_dummies(df)
df.head()

In [None]:
# df holds the train rows first and the test rows last, so split it back by position.
n_test = len(test)

X = df.iloc[:-n_test]
test_new = df.iloc[-n_test:]

# The target comes from the raw, unprocessed train frame loaded earlier.
y = train["stroke"]

In [None]:
def cross_validate(X, y, model):
    """Score ``model`` with 5-fold stratified cross-validation using ROC AUC.

    The model is refit in place on every fold, so after the call it holds the
    fit from the last fold only.

    Args:
        X: Feature DataFrame; rows are selected positionally via ``.iloc``.
        y: Target Series aligned row-for-row with ``X``.
        model: Classifier exposing ``fit(X, y, verbose=...)`` and
            ``predict_proba``.

    Returns:
        The mean ROC AUC over the folds. Per-fold scores and the mean are also
        printed.
    """
    # Fixed seed so every experiment is scored on the same folds.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # training
        model.fit(X_train, y_train, verbose=0)

        # AUC needs scores rather than hard labels, so take the probability
        # column at index 1 (presumably the positive class for a 0/1 target).
        y_pred = model.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)

        print(f"Fold: {fold} \t auc: {auc}")

        cv_scores.append(auc)

    avg_auc = np.mean(cv_scores)
    print(f"Avg AUC: {avg_auc}")

    # Hand the score back so experiments can be compared programmatically.
    return avg_auc

In [None]:
# Baseline classifier using XGBoost's default hyperparameters.
xgb_model = xgb.XGBClassifier()

In [None]:
# Baseline CV score on the one-hot encoded features, before adding any new features.
cross_validate(X, y, xgb_model)

In [None]:
# Train on all of the training data (no hold-out) with n_estimators=300,
# replacing the model used for cross-validation above.
xgb_model = xgb.XGBClassifier(n_estimators=300)
xgb_model.fit(X, y)

In [None]:
# Sanity check: there should be exactly one importance score per feature column.
len(X.columns) == len(xgb_model.feature_importances_)

#### Let's plot feature importances

In [None]:
# Pair each feature name with its importance score, then show them from least to most important.
feat_imp_df = pd.DataFrame(data = zip(X.columns, xgb_model.feature_importances_), columns=["feature", "importance"])
feat_imp_df.sort_values(by="importance")

In [None]:
# Same scores as the table above, drawn as a bar chart.
plot_feature_importances(X.columns, xgb_model.feature_importances_)

In [None]:
# Let's first drop the non-important features and see if that improves anything!
# "Non-important" here means the model gave the feature an importance of 0 (or less).
non_imp_feats = feat_imp_df[feat_imp_df.importance <= 0.0].feature
non_imp_feats

In [None]:
# Copy of the encoded frame without the zero-importance columns.
df_2 = df.drop(columns=non_imp_feats)

# Keep the training rows only: everything except the last len(test) rows.
X_2 = df_2.iloc[:-len(test), :]

In [None]:
# Sanity check: dropping the zero-importance features should leave fewer columns than before.
len(df_2.columns) < len(df.columns)

In [None]:
# Fresh default model, scored on the reduced feature set.
xgb_model = xgb.XGBClassifier()
cross_validate(X_2, y, xgb_model)

## INSIGHTS: Nay, nothing changed, the score remains the same!

# Let's add BMI-based features and see if they improve anything!

## Feature engineering : add a new feature based on BMI using following data sourced from CDC website
* If your BMI is less than 18.5, it falls within the underweight range.
* If your BMI is 18.5 to <25, it falls within the healthy weight range.
* If your BMI is 25.0 to <30, it falls within the overweight range.
* If your BMI is 30.0 or higher, it falls within the obesity range.

Obesity types:

Obesity is frequently subdivided into categories:
* Class 1: BMI of 30 to < 35
* Class 2: BMI of 35 to < 40
* Class 3: BMI of 40 or higher. Class 3 obesity is sometimes categorized as “severe” obesity.

In [None]:
def bmi_level(bmi):
    """Map a BMI value to its weight-status category.

    Returns one of "underweight", "healthy", "overweight", "obese_class1",
    "obese_class2" or "obese_class3". A value that compares false against
    every threshold (e.g. NaN) yields None.
    """
    # (exclusive upper bound, label) pairs in ascending order; the lower bound
    # of each band is implied by the previous pairs not having matched.
    bands = (
        (18.5, "underweight"),
        (25.0, "healthy"),
        (30.0, "overweight"),
        (35.0, "obese_class1"),
        (40.0, "obese_class2"),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label

    # Explicit comparison (rather than a bare fallback) so NaN maps to None.
    return "obese_class3" if bmi >= 40.0 else None

In [None]:
# Add the categorical BMI band as a new column, computed row by row from bmi.
df["bmi_level"] = df.bmi.map(bmi_level)
df.head(3)

### Let's one hot encode the bmi_level column

In [None]:
# Re-run one-hot encoding so the new string-valued bmi_level column is expanded into indicator columns.
df = pd.get_dummies(df)
df.head()

In [None]:
# Training rows only: everything except the last len(test) rows.
X = df.iloc[:-len(test), :]

In [None]:
# Fresh default model for this experiment.
xgb_model = xgb.XGBClassifier()

In [None]:
# CV score with the BMI band features included.
cross_validate(X, y, xgb_model)

In [None]:
# Importances are read from the model as cross_validate left it, i.e. fitted on the
# last fold's training split rather than on all of the training data.
feat_imp_df = pd.DataFrame(data=zip(X.columns, xgb_model.feature_importances_), columns=["feature", "importance"])
feat_imp_df.sort_values(by="importance")

In [None]:
# plot_feature_importances takes the feature names and the scores as two separate arguments.
plot_feature_importances(X.columns, xgb_model.feature_importances_)

### INSIGHTS: Some of the new features are important!

# Let's add diabetes features based on glucose levels

In [None]:
def diabetes_indicator(avg_glucose_level):
    """Bin an average glucose reading into a coarse diabetes category.

    Bands (contiguous, so fractional readings never fall between two bands):
      * below 100        -> "normal"
      * 100 to below 126 -> "prediabetic"
      * 126 to 200       -> "type1"
      * above 200        -> "type2"

    Args:
        avg_glucose_level: Numeric glucose reading (units presumably mg/dL —
            TODO confirm against the dataset description).

    Returns:
        The band label, or None when the reading is missing (NaN/None), so a
        missing value is never labelled as a diabetic band.
    """
    # NaN compares false against every threshold and would otherwise fall
    # through to the last band.
    if pd.isna(avg_glucose_level):
        return None

    if avg_glucose_level < 100:
        return "normal"
    if avg_glucose_level < 126:
        return "prediabetic"
    if avg_glucose_level <= 200:
        return "type1"
    return "type2"

# Preview how many rows land in each glucose band before adding the column.
df.avg_glucose_level.map(diabetes_indicator).value_counts()

In [None]:
# Add the glucose band as a new categorical column, computed row by row from avg_glucose_level.
df["diabetes"] = df.avg_glucose_level.map(diabetes_indicator)
df.head(3)

### One hot encoding the diabetes feature

In [None]:
# # drop previous diabetes columns
# diabetes_cols = [col for col in df.columns if col.startswith("diabetes")]
# diabetes_cols

In [None]:
# df.drop(columns=diabetes_cols, axis=1, inplace=True)

In [None]:
# Re-run one-hot encoding so the new string-valued diabetes column is expanded into indicator columns.
df = pd.get_dummies(df)
df.head(3)

In [None]:
# Training rows only: everything except the last len(test) rows.
X = df.iloc[:-len(test), :]

In [None]:
# Fresh default model for this experiment.
xgb_model = xgb.XGBClassifier()

In [None]:
# CV score with both the BMI band and the glucose band features included.
cross_validate(X, y, xgb_model)

In [None]:
# Importances come from the model as cross_validate left it (fitted on the last fold's training split).
plot_feature_importances(X.columns, xgb_model.feature_importances_)

### INSIGHTS: Some of the new diabetes features, like prediabetic and type1, seem to be important

# Dropping the bmi and glucose columns

In [None]:
# Copy of the frame without the two raw numeric columns the band features were derived from.
df_x = df.drop(columns=["bmi", "avg_glucose_level"])

In [None]:
# Training rows only, taken from the frame without the raw bmi / avg_glucose_level columns.
X = df_x.iloc[:-len(test), :]

# Fresh default model so the score is comparable with the earlier experiments.
xgb_model = xgb.XGBClassifier()

cross_validate(X, y, xgb_model)

### INSIGHTS: So dropping these two worsens the performance

# SIDE NOTES:
### Feature engineering glucose levels
1. ##### First using the ranges from CDC  
    * First we binned glucose levels into type1 and type2 diabetes, in addition to normal and prediabetic, and the score was **0.8625**
    * Then we binned type1 and type2 into single diabetic column and score was **0.8611**
2. #### Tried the other range linked above and it worsened the score to **0.8585**