In [None]:
pip install imbalanced-learn

In [None]:
import os
from collections import Counter

import pandas as pd
import torch
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Read the raw CSV; the relative path is resolved against the kernel's
# working directory.
path = "../data/diabetes_prediction_dataset.csv"
df = pd.read_csv(path)

# Number of examples per class: rows with diabetes == 0, then diabetes == 1.
n_negative = int((df["diabetes"] == 0).sum())
n_positive = int((df["diabetes"] == 1).sum())
print(n_negative, n_positive)
df.head()

91500 8500


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [26]:
# Integer-encode every object-dtype (string) column so all features are numeric.
# LabelEncoder assigns codes in sorted label order, so the resulting integers
# carry no meaningful ranking for nominal columns.
cat_cols = df.select_dtypes(include=["object"]).columns

# Encode a copy so `df` keeps the raw values shown earlier and this cell gives
# the same result when re-run. Each fitted encoder is kept, keyed by column
# name, so the codes can later be decoded or applied to new data.
df_encoded = df.copy()
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Features are every column except the target; the target is `diabetes`.
X = df_encoded.drop(columns=["diabetes"])
y = df_encoded["diabetes"]
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155


In [35]:
# Stratified 80 / 10 / 10 split into training, validation and test sets.
holdout_fraction = 0.20  # share of all rows set aside for validation + test
test_fraction = 0.50     # share of the held-out rows that becomes the test set

# Stage 1: carve off the hold-out part, stratifying on y.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=holdout_fraction, stratify=y, random_state=42
)

# Stage 2: halve the hold-out part into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=test_fraction, stratify=y_temp, random_state=42
)

In [27]:
# Fit the min-max scaler on the training split only, then apply the same
# fitted ranges to all three splits so validation/test statistics are never
# used for fitting.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [None]:
# Oversample the minority class on the scaled training split only.
# Plain SMOTE interpolates every column, which invents fractional values for
# the integer-encoded categorical/binary features. SMOTENC interpolates only
# the continuous columns and gives each categorical column the most frequent
# value among the nearest neighbours.
categorical_columns = ["gender", "hypertension", "heart_disease", "smoking_history"]
# X_train_scaled is a plain array, so look up positions on the DataFrame it
# was computed from (column order is unchanged by scaling).
categorical_idx = [X_train.columns.get_loc(name) for name in categorical_columns]

smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Show the per-class label counts of the training split before and after
# oversampling.
for caption, labels in (("Before SMOTE:", y_train), ("After SMOTE :", y_train_smote)):
    print(caption, Counter(labels))

Before SMOTE: Counter({0: 73200, 1: 6800})
After SMOTE : Counter({1: 73200, 0: 73200})


In [37]:
# Feature matrices -> float32 tensors (train uses the oversampled data).
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_smote, X_val_scaled, X_test_scaled)
)

# Label series -> int64 (torch.long) tensors, built from their underlying arrays.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train_smote, y_val, y_test)
)

In [38]:
# Persist the six tensors together in one file inside the processed-data folder.
fold = "../data/processed_data"
file_path = os.path.join(fold, "diabetes_processed.pt")
os.makedirs(fold, exist_ok=True)  # no error if the folder already exists

# Keys are the names used to look the tensors up again after loading.
data = dict(
    X_train=X_train_tensor,
    X_val=X_val_tensor,
    X_test=X_test_tensor,
    y_train=y_train_tensor,
    y_val=y_val_tensor,
    y_test=y_test_tensor,
)

torch.save(data, file_path)

In [None]:
# This is how to read the processed tensors back from the saved file.
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered .pt file cannot execute arbitrary code on load.
# NOTE: the assignments below rebind X_train, y_train, ... (pandas splits in
# the cells above) to the saved tensors; re-run the split cell before
# re-running the scaling or oversampling cells in the same kernel.

data = torch.load("../data/processed_data/diabetes_processed.pt", weights_only=True)
X_train = data["X_train"]
y_train = data["y_train"]
X_val = data["X_val"]
y_val = data["y_val"]
X_test = data["X_test"]
y_test = data["y_test"]