# Model training: Diabetes Prediction Challenge

## 1. Imports 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

import optuna

## 2. Helper functions

In [None]:
def create_my_balanced_accuracy(weights):
    """
    Build a scoring callable for ``cross_val_score`` that applies class weights.

    Args:
        weights: Mapping from class label to the weight given to every
            sample of that class.

    Returns:
        A function ``(est, X, y) -> score`` that predicts with ``est`` on
        ``X`` and returns ``balanced_accuracy_score`` of those predictions
        against ``y``, with per-sample weights looked up in ``weights``.
    """
    def my_scoring(est, X, y):
        predicted = est.predict(X)
        # Expand the per-class weights into one weight per sample of ``y``.
        per_sample_weights = np.asarray([weights[label] for label in y])
        return balanced_accuracy_score(
            y, predicted, sample_weight=per_sample_weights
        )

    return my_scoring


In [None]:
def calculate_weights(y: pd.Series):
    """
    Derive one weight per class for an imbalanced target.

    The weight of a class is the reciprocal of the number of samples that
    belong to any *other* class. A class with no samples outside of it
    gets weight 0.

    Args:
        y: The target variable

    Returns:
        Dictionary mapping each class label found in ``y`` to its weight.
    """
    class_counts = y.value_counts()
    n_samples = len(y)

    def weight_for(n_in_class):
        n_elsewhere = n_samples - n_in_class
        return 1.0 / n_elsewhere if n_elsewhere > 0 else 0

    return {
        label: weight_for(n_in_class)
        for label, n_in_class in class_counts.items()
    }
        

In [None]:
def create_transformer(scaler_name,quantitative_columns,categorical_columns):
    """
    Build the preprocessing ``ColumnTransformer`` used in front of the model.

    Quantitative columns are median-imputed and then scaled with the chosen
    scaler. Categorical columns are imputed with the most frequent value and
    one-hot encoded (unknown categories are ignored at transform time).
    Columns listed in neither group are passed through unchanged.

    Args:
        scaler_name: One of "StandardScaler", "RobustScaler" or "MinMaxScaler".
        quantitative_columns: Column names routed to the numeric pipeline.
        categorical_columns: Column names routed to the categorical pipeline.

    Returns:
        An unfitted ``ColumnTransformer``.

    Raises:
        ValueError: If ``scaler_name`` is not one of the supported names.
    """
    if scaler_name == "StandardScaler":
        scaler = StandardScaler()
    elif scaler_name == "RobustScaler":
        scaler = RobustScaler()
    elif scaler_name == "MinMaxScaler":
        scaler = MinMaxScaler()
    else:
        # Fail fast with a clear message instead of hitting an
        # UnboundLocalError on ``scaler`` further down.
        raise ValueError(
            f"Unknown scaler_name {scaler_name!r}; expected 'StandardScaler', "
            "'RobustScaler' or 'MinMaxScaler'"
        )

    # NOTE: the step name 'inputer' (sic) is kept as-is because step names
    # are part of the pipeline's parameter names.
    numeric_transformer = Pipeline([
        ('inputer', SimpleImputer(strategy='median')),
        ('scaler', scaler)
    ])

    categorical_transformer = Pipeline([
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, quantitative_columns),
            ('cat', categorical_transformer, categorical_columns)
        ],
        remainder='passthrough',
        n_jobs=-1
    )

    return preprocessor

In [None]:
def create_best_model(X_train,y_train,weights,quantitative_columns,categorical_columns,
                      n_trials=30,random_state=42):
    """
    Run an Optuna search over preprocessing, feature selection and classifier.

    Every trial builds the pipeline
    ``create_transformer(...) -> RFE(SVC-linear | GradientBoosting) -> SVC | KNN``
    and scores it with 5-fold stratified cross-validation using the scorer
    from ``create_my_balanced_accuracy(weights)``. The study maximizes the
    mean fold score.

    Args:
        X_train: Training features.
        y_train: Training target.
        weights: Mapping from class label to weight, passed to the scorer.
        quantitative_columns: Column names for the numeric preprocessing branch.
        categorical_columns: Column names for the categorical preprocessing branch.
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the shuffled CV splitter. With a fixed seed all
            trials are evaluated on the same folds, so their scores are
            comparable. Pass ``None`` for a different split on every trial.

    Returns:
        Tuple ``(best_params, best_value)`` taken from the finished study.
        Conditional parameters ("C", "kernel", "degree", "n_neighbors",
        "weights_knn") are only present when the best trial sampled them.
    """
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
    # The scorer does not depend on the trial, so build it once.
    scorer = create_my_balanced_accuracy(weights)

    def objective(trial:optuna.trial.Trial):
        n_features = trial.suggest_int('n_features',3,X_train.shape[1])
        model_name = trial.suggest_categorical('model_name',["SVC","KNN"])
        estimator_name = trial.suggest_categorical("estimator_name",["SVC","GBC"])

        if model_name == "SVC":
            # ``degree`` is only searched for the polynomial kernel; 3 is the
            # value used otherwise.
            degree = 3
            C = trial.suggest_float("C", 0.001, 100, log=True)
            kernel = trial.suggest_categorical("kernel", ["rbf","poly","linear"])

            if kernel == "poly":
                degree = trial.suggest_int("degree",2,5)

            model = SVC(C=C, kernel=kernel, degree=degree)
        elif model_name == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 3, 16)
            weights_knn = trial.suggest_categorical("weights_knn",["uniform","distance"])

            model = KNeighborsClassifier(n_neighbors=n_neighbors,weights=weights_knn)

        # NOTE: the Optuna parameter is named "preprocessor" (not
        # "preprocessor_name"); that is the key that ends up in best_params.
        preprocessor_name = trial.suggest_categorical("preprocessor",["StandardScaler","RobustScaler","MinMaxScaler"])
        preprocessor = create_transformer(preprocessor_name,quantitative_columns,categorical_columns)

        # Estimator that ranks features inside RFE (separate from the final classifier).
        if estimator_name == "SVC":
            estimator = SVC(kernel="linear")
        elif estimator_name == "GBC":
            estimator = GradientBoostingClassifier()

        pipe = Pipeline([
            ("Preprocessor", preprocessor),
            ("RFE",RFE(estimator=estimator,n_features_to_select=n_features)),
            ("Classifier",model)
        ])

        print("Cross Val score")
        scores = cross_val_score(
            pipe,
            X_train,
            y_train,
            cv=skf,
            scoring=scorer,
            n_jobs=1
        )
        print("END Cross Val Score")
        return scores.mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective,n_trials=n_trials)

    return study.best_params, study.best_value


## 3. Training

In [None]:
# Load the training and test CSV files (train first, then test).
train_data, test_data = map(
    pd.read_csv,
    (
        "../Data/DiabetesPredictionChallenge/train.csv",
        "../Data/DiabetesPredictionChallenge/test.csv",
    ),
)

In [None]:
# Target column for training.
y_train = train_data["diagnosed_diabetes"]

# Feature frames: everything except the "id" column (and the target for train).
X_train = train_data.drop(columns=["id", "diagnosed_diabetes"])
X_test = test_data.drop(columns=["id"])

In [None]:
# Number of missing values per training feature column (shown as the cell output).
X_train.isnull().sum()

In [None]:
# Columns handled by the categorical (impute + one-hot) branch of the preprocessor.
categorical_columns = [
    "gender",
    "ethnicity",
    "education_level",
    "income_level",
    "smoking_status",
    "employment_status",
    "family_history_diabetes",
    "hypertension_history",
    "cardiovascular_history",
]
# Every remaining column except "id" and the target goes to the numeric branch.
quantitative_columns = [
    column
    for column in train_data.columns
    if column not in (*categorical_columns, "id", "diagnosed_diabetes")
]

In [None]:
# preprocessor = create_transformer("StandardScaler",quantitative_columns,categorical_columns)

# preprocessor.fit(X_train,y_train)
# dir(preprocessor)
# data = preprocessor.transform(X_train)

In [None]:
#data[:10]

In [None]:
# Per-class weights for the CV scorer, then the Optuna hyper-parameter search.
weights = calculate_weights(y_train)
best_params, best_score = create_best_model(
    X_train=X_train,
    y_train=y_train,
    weights=weights,
    quantitative_columns=quantitative_columns,
    categorical_columns=categorical_columns,
)

In [None]:
# Rebuild the winning pipeline from the Optuna parameters and fit it on the
# full training set.

# Estimator used by RFE to rank features.
if best_params['estimator_name'] == "SVC":
    estimator = SVC(kernel='linear')
elif best_params['estimator_name'] == "GBC":
    estimator = GradientBoostingClassifier()
else:
    raise ValueError(f"Unexpected estimator_name: {best_params['estimator_name']!r}")

# The scaler choice is stored under the Optuna parameter name "preprocessor",
# and create_transformer takes no imputer-strategy argument.
preprocessor = create_transformer(best_params['preprocessor'],quantitative_columns,categorical_columns)

if best_params['model_name'] == "KNN":
    model = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'],weights=best_params['weights_knn'])
elif best_params['model_name'] == "SVC":
    # "degree" is only sampled for the polynomial kernel; fall back to the
    # same default (3) the search used for the other kernels.
    model = SVC(C=best_params["C"],kernel=best_params['kernel'],degree=best_params.get('degree',3))
else:
    raise ValueError(f"Unexpected model_name: {best_params['model_name']!r}")

pipe = Pipeline([
    ("Preprocessor",preprocessor),
    ("RFE",RFE(estimator=estimator,n_features_to_select=best_params['n_features'])),
    ('Classifier', model)
])

pipe.fit(X_train,y_train)