In [29]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from typing import Optional, Tuple
import os
import logging

# configure our logger: basicConfig sets the root logger to INFO so the
# progress messages emitted by the pipeline functions show up in the cell output

logging.basicConfig(level=logging.INFO)
logger =logging.getLogger(__name__)

In [3]:
# Download latest version
# `path` is the local directory returned by kagglehub; a later cell lists it
# to locate the dataset file.
path = kagglehub.dataset_download("iammustafatz/diabetes-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/iammustafatz/diabetes-prediction-dataset/versions/1


In [17]:
# locate the dataset file inside the downloaded folder.
# os.listdir() returns entries in arbitrary order, so blindly taking element 0
# can pick a different (or non-CSV) file between runs; sort the listing and
# prefer CSV files so the choice is deterministic.
dataset_files = sorted(os.listdir(path))
if not dataset_files:
    raise FileNotFoundError(f"No files found in dataset directory: {path}")
csv_files = [name for name in dataset_files if name.lower().endswith(".csv")]
# fall back to the first (sorted) entry when nothing carries a .csv extension
data_url = os.path.join(path, (csv_files or dataset_files)[0])

# create the pandas dataframe
data = pd.read_csv(data_url)
data.head(5)


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [9]:
# names of the object-dtype (string) columns that will need encoding
data.select_dtypes(include="object").columns.tolist()

['gender', 'smoking_history']

In [19]:
# label encode the categorical variables.


def encode_categoricals(data: pd.DataFrame) -> pd.DataFrame:
    """Label-encode every object-dtype column of ``data``.

    Each object column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone. The fitted encoders are not
    returned, so the codes cannot be mapped back to labels by the caller.

    The input frame is never modified: encoding is performed on a copy, so
    re-running the calling cell is safe and a failure part-way through cannot
    leave the caller with a half-encoded frame.

    Args:
        data: frame whose object-dtype columns should be encoded.

    Returns:
        A new frame with all object columns encoded. If anything goes wrong
        the error is logged and the original, untouched ``data`` is returned.
    """
    try:
        encoded = data.copy()
        categorical_columns = list(encoded.select_dtypes(include="object").columns)
        for column in categorical_columns:
            encoded[column] = LabelEncoder().fit_transform(encoded[column])
        logger.info('Encoded the categorical columns successfully')
    except Exception as err:
        logger.error(f'Encountered some error {err} while encoding')
        # best-effort contract: hand back the input exactly as it was received
        return data

    return encoded

# NOTE: rebinds `data` to the encoded frame, so the string categories shown
# in the earlier preview are replaced by integer codes from here on.
data = encode_categoricals(data=data)
data.head()



INFO:__main__:Encoded the categorical columns successfully


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [21]:
# split the dataset

def split_dataset(data: pd.DataFrame) -> Tuple[
    Optional[pd.DataFrame],
    Optional[pd.DataFrame],
    Optional[pd.Series],
    Optional[pd.Series]]:
    """Split ``data`` into stratified train/test features and labels.

    The ``diabetes`` column is the label; every other column is a feature.
    20% of the rows go to the test set, stratified on the label, with a fixed
    random state of 23.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. If the split fails the error
        is logged and all four entries are ``None``.
    """
    splits = (None, None, None, None)
    try:
        features = data.drop(columns=['diabetes'])
        labels = data['diabetes']
        train_features, test_features, train_labels, test_labels = train_test_split(
            features,
            labels,
            test_size=0.2,
            random_state=23,
            stratify=labels,
        )
        splits = (train_features, test_features, train_labels, test_labels)
        logger.info('Splitted the dataframe successfully')
    except Exception as e:
        logger.error(f'An error occured. Details: {e}')

    return splits


# all four results are None if split_dataset logged an error instead of splitting
X_train, X_test, y_train, y_test = split_dataset(data=data)

INFO:__main__:Splitted the dataframe successfully


In [26]:
# scale the dataset

def scale_dataset(X_train:pd.DataFrame, X_test:pd.DataFrame) -> Tuple[
    Optional[pd.DataFrame],
    Optional[pd.DataFrame]]:
    """Standardise the train/test features with a ``StandardScaler``.

    The scaler is fitted on ``X_train`` only and then applied to both frames,
    so no test-set statistics leak into the transformation.

    Args:
        X_train: training features; used to fit the scaler.
        X_test: test features; transformed with the train-fitted scaler.

    Returns:
        ``(X_train_scaled, X_test_scaled)`` as new DataFrames that keep the
        column names of ``X_train`` and the row index of their respective
        inputs. If scaling fails the error is logged and ``(None, None)`` is
        returned.
    """
    try:
        scaler = StandardScaler()
        columns = X_train.columns
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)
        # keep the original row labels: without index= the frames get a fresh
        # RangeIndex and stop lining up with y_train / y_test by label
        X_train = pd.DataFrame(data=train_scaled, columns=columns, index=X_train.index)
        X_test = pd.DataFrame(data=test_scaled, columns=columns, index=X_test.index)

        logger.info('Completed scaling X_train and X_test')
    except Exception as e:
        X_train, X_test = None, None
        logger.error(f'An error occured. Details: {e}')

    return X_train, X_test

# overwrites X_train / X_test with their scaled versions
# (both become None if scale_dataset logged an error)
X_train, X_test = scale_dataset(X_train, X_test)

INFO:__main__:Completed scaling X_train and X_test


In [30]:
# model training

def model_training(X_train: pd.DataFrame, X_test:pd.DataFrame,
                   y_train:pd.Series, y_test:pd.Series) -> Optional[RandomForestClassifier]:
    """Train the baseline random forest and log its train/test metrics.

    A ``RandomForestClassifier`` (random state 23, otherwise default
    settings) is fitted on the training data. Precision, recall and F1 are
    then computed on both the training and the test set and written to the
    log in a single message.

    Args:
        X_train: training features.
        X_test: test features, used only for evaluation.
        y_train: training labels.
        y_test: test labels, used only for evaluation.

    Returns:
        The fitted classifier, or ``None`` if the model could not be built or
        fitted. If fitting succeeds but evaluation fails, the error is logged
        and the fitted model is still returned.
    """
    model = None
    try:
        classifier = RandomForestClassifier(random_state= 23)
        classifier.fit(X_train, y_train)
        # expose the estimator only once it is fitted, so a failed fit yields
        # None rather than an unfitted classifier
        model = classifier
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)

        logger.info(msg = f"""Training completed with the following metrics.
                    train_precision: {precision_score(y_train, train_preds)},
                    test_precision: {precision_score(y_test, test_preds)},
                    train_recall: {recall_score(y_train, train_preds)},
                    test_recall: {recall_score(y_test, test_preds)},
                    train_f1: {f1_score(y_train, train_preds)},
                    test_f1: {f1_score(y_test, test_preds)}""")
    except Exception as e:
        logger.error(f"An error occured. Details: {e}")

    return model


# fit the baseline model; its train/test metrics are emitted through the logger
model = model_training(X_train, X_test, y_train, y_test)

INFO:__main__:Training completed with the following metrics.
                    train_precision: 0.9994067922289782,
                    test_precision: 0.9518072289156626,
                    train_recall: 0.9910294117647059,
                    test_recall: 0.6970588235294117,
                    train_f1: 0.9952004725688548,
                    test_f1: 0.8047538200339559
