In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load your datasets
pidd_data = pd.read_csv('diabetes.csv')
frankfurt_data = pd.read_csv('diabetes_2.csv')

# Combine datasets for processing
datasets = {
    "PIDD": pidd_data,
    "Frankfurt": frankfurt_data
}

# Imputation methods
imputation_methods = ['no_imputation', 'mean', 'median', 'knn', 'random_sample', 'mice']
scalers = ['standard', 'minmax']
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

In [3]:
def impute_data(method, X):
    
    # Identify columns with 0 as missing values, excluding 'Pregnancies'
    cols_with_zeros = X.columns[(X == 0).any()]
    cols_with_zeros = cols_with_zeros.drop('Pregnancies')
    
    # Count the number of 0s before imputation
    # print(f"Imputation method: {method}")
    # print("Columns to impute (0 values):", list(cols_with_zeros))
    # print("Before imputation, number of 0s in columns to impute:")
    # print(X[cols_with_zeros].apply(lambda col: (col == 0).sum()))
    
    # Convert 0 values to NaN for imputation
    X_imputed = X.copy()
    X_imputed[cols_with_zeros] = X_imputed[cols_with_zeros].replace(0, np.nan)
    
    if method == 'mean':
        imputer = SimpleImputer(strategy='mean')
        X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
    elif method == 'median':
        imputer = SimpleImputer(strategy='median')
        X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
    elif method == 'knn':
        imputer = KNNImputer(n_neighbors=5)
        X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
    elif method == 'random_sample':
        for column in cols_with_zeros:
            population = X_imputed[column][X_imputed[column] != 0]
            if not population.empty:
                X_imputed.loc[X_imputed[column].isna(), column] = random.choices(population.tolist(), k=X_imputed[column].isna().sum())
            else:
                mean_value = X_imputed[column].mean()
                X_imputed.loc[X_imputed[column].isna(), column] = mean_value
    elif method == 'mice':
        imputer = IterativeImputer(max_iter=10, random_state=42)
        X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
    
    # Replace NaN back to 0 (optional, based on your need)
    X_imputed[cols_with_zeros] = X_imputed[cols_with_zeros].fillna(0)
    
    # Count the number of 0s after imputation
    # print("After imputation, number of 0s in columns to impute:")
    # print(X_imputed[cols_with_zeros].apply(lambda col: (col == 0).sum()))
    
    return X_imputed


In [4]:
# # Prepare results storage
# results_list = []

# for dataset_name, dataset in datasets.items():
#     X = dataset.drop('Outcome', axis=1)
#     y = dataset['Outcome']

#     for imputation_method in imputation_methods:
#         # Apply imputation
#         X_imputed = impute_data(imputation_method, X)

#         for scaling_method in scalers:
#             # Apply scaling
#             if scaling_method == 'standard':
#                 scaler = StandardScaler()
#                 X_scaled = scaler.fit_transform(X_imputed)
#             elif scaling_method == 'minmax':
#                 scaler = MinMaxScaler()
#                 X_scaled = scaler.fit_transform(X_imputed)

#             for model_name, model in models.items():
#                 try:
#                     # Train-test split (70% training, 30% testing)
#                     X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

#                     # Fit the model
#                     model.fit(X_train, y_train)

#                     # Make predictions
#                     y_pred = model.predict(X_test)

#                     # Compute metrics
#                     accuracy = accuracy_score(y_test, y_pred)
#                     precision = precision_score(y_test, y_pred, zero_division=0)
#                     recall = recall_score(y_test, y_pred, zero_division=0)
#                     f1 = f1_score(y_test, y_pred, zero_division=0)

#                     # Store the results
#                     results_list.append({
#                         'Dataset': dataset_name,
#                         'Imputation': imputation_method,
#                         'Scaling': scaling_method,
#                         'Model': model_name,
#                         'Accuracy': accuracy,
#                         'Precision': precision,
#                         'Recall': recall,
#                         'F1-score': f1
#                     })
#                 except Exception as e:
#                     print(f"Error processing {dataset_name} with {imputation_method}, {scaling_method}, {model_name}: {e}")



In [8]:
# Define parameter grids for each model

# models = {
#     'Logistic Regression': LogisticRegression(),
#     'SVM': SVC(C=0.85, kernel='rbf', gamma=1),
#     'KNN': KNeighborsClassifier(n_neighbors=4,weights='distance',metric='euclidean'),
#     'Decision Tree': DecisionTreeClassifier(criterion='gini',max_depth=6),
#     'Random Forest': RandomForestClassifier(criterion='gini',max_depth=4,n_estimators=50)
# }

param_grids = {
    'Logistic Regression': {
        # 'C': [0.1, 1],
        'solver': ['lbfgs', 'liblinear']
    },
    'SVM': {
        'C': [0.7, 0.8, 0.9],
        'kernel': ['linear', 'rbf'],
        'gamma': [0.5, 0.65,0.80 ,1]
    },
    'KNN': {
        'n_neighbors': [4, 5, 6],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Decision Tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [4, 5, 6],
        'min_samples_split': [5, 6, 7]
    },
    'Random Forest': {
        'n_estimators': [50, 75],
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 4, 5, 6],
    }
}

# Prepare results storage
results_list = []

for dataset_name, dataset in datasets.items():
    X = dataset.drop('Outcome', axis=1)
    y = dataset['Outcome']

    for imputation_method in imputation_methods:
        # Apply imputation
        X_imputed = impute_data(imputation_method, X)

        for scaling_method in scalers:
            # Apply scaling
            if scaling_method == 'standard':
                scaler = StandardScaler()
                X_scaled = scaler.fit_transform(X_imputed)
            elif scaling_method == 'minmax':
                scaler = MinMaxScaler()
                X_scaled = scaler.fit_transform(X_imputed)

            for model_name, model in models.items():
                try:
                    # Train-test split (70% training, 30% testing)
                    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

                    # GridSearchCV for hyperparameter tuning
                    grid = GridSearchCV(model, param_grids[model_name], scoring='f1', cv=3)
                    #  grid = GridSearchCV(model, param_grids[model_name], scoring='accuracy', cv=5, n_jobs=-1)
                    grid.fit(X_train, y_train)
                    best_model = grid.best_estimator_

                    # Make predictions with the best model
                    y_pred = best_model.predict(X_test)

                    # Compute metrics
                    accuracy = accuracy_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred, zero_division=0)
                    recall = recall_score(y_test, y_pred, zero_division=0)
                    f1 = f1_score(y_test, y_pred, zero_division=0)

                    # Store the results
                    results_list.append({
                        'Dataset': dataset_name,
                        'Imputation': imputation_method,
                        'Scaling': scaling_method,
                        'Model': model_name,
                        'Best Params': grid.best_params_,
                        'Accuracy': accuracy,
                        'Precision': precision,
                        'Recall': recall,
                        'F1-score': f1
                    })

                except Exception as e:
                    print(f"Error processing {dataset_name} with {imputation_method}, {scaling_method}, {model_name}: {e}")

# Convert results to DataFrame
results_df = pd.DataFrame(results_list)

results_df


Unnamed: 0,Dataset,Imputation,Scaling,Model,Best Params,Accuracy,Precision,Recall,F1-score
0,PIDD,no_imputation,standard,Logistic Regression,{'solver': 'liblinear'},0.740260,0.626506,0.641975,0.634146
1,PIDD,no_imputation,standard,SVM,"{'C': 0.7, 'gamma': 0.5, 'kernel': 'linear'}",0.753247,0.650000,0.641975,0.645963
2,PIDD,no_imputation,standard,KNN,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.709957,0.592105,0.555556,0.573248
3,PIDD,no_imputation,standard,Decision Tree,"{'criterion': 'entropy', 'max_depth': 4, 'min_...",0.731602,0.600000,0.703704,0.647727
4,PIDD,no_imputation,standard,Random Forest,"{'criterion': 'entropy', 'max_depth': 6, 'n_es...",0.774892,0.698630,0.629630,0.662338
...,...,...,...,...,...,...,...,...,...
115,Frankfurt,mice,minmax,Logistic Regression,{'solver': 'lbfgs'},0.791667,0.765432,0.587678,0.664879
116,Frankfurt,mice,minmax,SVM,"{'C': 0.9, 'gamma': 1, 'kernel': 'rbf'}",0.798333,0.771084,0.606635,0.679045
117,Frankfurt,mice,minmax,KNN,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.971667,0.940909,0.981043,0.960557
118,Frankfurt,mice,minmax,Decision Tree,"{'criterion': 'gini', 'max_depth': 6, 'min_sam...",0.850000,0.764192,0.829384,0.795455


In [6]:
# # Convert results to DataFrame
# results_df = pd.DataFrame(results_list)

# results_df

In [10]:
# Assuming df is your DataFrame
# results_df.to_csv('result_final_02.csv', index=False)

In [8]:
# X = pidd_data.iloc[:,:9]
# X_imputed = impute_data('mean', X)
# X_imputed

In [9]:
# def impute_data(method, X):
#     # Identify columns with 0 as missing values
#     cols_with_zeros = X.columns[(X == 0).any().drop(column='Pregnancies')]
#     X_imputed = X.copy()
    
#     for column in cols_with_zeros:
#         missing_mask = X_imputed[column] == 0
#         if method == 'mean':
#             imputer = SimpleImputer(strategy='mean')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'median':
#             imputer = SimpleImputer(strategy='median')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'knn':
#             imputer = KNNImputer(n_neighbors=5)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'random_sample':
#             population = X_imputed[column][X_imputed[column] != 0]
#             if not population.empty:
#                 X_imputed.loc[missing_mask, column] = random.choices(population.tolist(), k=missing_mask.sum())
#             else:
#                 mean_value = X_imputed[column].mean()
#                 X_imputed.loc[missing_mask, column] = mean_value
#         elif method == 'mice':
#             imputer = IterativeImputer(max_iter=10, random_state=42)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
    
#     return X_imputed

# def impute_data(method, X):
#     # Identify columns with 0 as missing values, excluding 'Pregnancies'
#     cols_with_zeros = X.columns[(X == 0).any()]
#     cols_with_zeros = cols_with_zeros.drop('Pregnancies')  # Exclude 'Pregnancies'
    
#     X_imputed = X.copy()
    
#     for column in cols_with_zeros:
#         missing_mask = X_imputed[column] == 0
#         if method == 'mean':
#             imputer = SimpleImputer(strategy='mean')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'median':
#             imputer = SimpleImputer(strategy='median')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'knn':
#             imputer = KNNImputer(n_neighbors=5)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'random_sample':
#             population = X_imputed[column][X_imputed[column] != 0]
#             if not population.empty:
#                 X_imputed.loc[missing_mask, column] = random.choices(population.tolist(), k=missing_mask.sum())
#             else:
#                 mean_value = X_imputed[column].mean()
#                 X_imputed.loc[missing_mask, column] = mean_value
#         elif method == 'mice':
#             imputer = IterativeImputer(max_iter=10, random_state=42)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
    
#     return X_imputed

# def impute_data(method, X):
#     # Identify columns with 0 as missing values, excluding 'Pregnancies'
#     cols_with_zeros = X.columns[(X == 0).any()]
#     cols_with_zeros = cols_with_zeros.drop('Pregnancies')  # Exclude 'Pregnancies'

#     X_imputed = X.copy()
    
#     # Print initial state for reference
#     print(f"Imputation method: {method}")
#     print(f"Columns to impute (0 values): {cols_with_zeros.tolist()}")
#     print(f"Before imputation, number of 0s in columns to impute:\n{(X_imputed[cols_with_zeros] == 0).sum()}")

#     for column in cols_with_zeros:
#         missing_mask = X_imputed[column] == 0
        
#         if method == 'mean':
#             imputer = SimpleImputer(strategy='mean')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'median':
#             imputer = SimpleImputer(strategy='median')
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'knn':
#             imputer = KNNImputer(n_neighbors=5)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()
#         elif method == 'random_sample':
#             population = X_imputed[column][X_imputed[column] != 0]
#             if not population.empty:
#                 X_imputed.loc[missing_mask, column] = random.choices(population.tolist(), k=missing_mask.sum())
#             else:
#                 mean_value = X_imputed[column].mean()
#                 X_imputed.loc[missing_mask, column] = mean_value
#         elif method == 'mice':
#             imputer = IterativeImputer(max_iter=10, random_state=42)
#             X_imputed[column] = imputer.fit_transform(X_imputed[column].values.reshape(-1, 1)).ravel()

#     # Print the state after imputation for comparison
#     print(f"After imputation, number of 0s in columns to impute:\n{(X_imputed[cols_with_zeros] == 0).sum()}\n")
    
#     return X_imputed

# def impute_data(method, X):
#     # Identify columns with 0 as missing values
#     cols_with_zeros = X.columns[(X == 0).any()]
#     cols_with_zeros = cols_with_zeros.drop('Pregnancies')  # Exclude 'Pregnancies'
    
#     # Convert 0 values to NaN for imputation
#     X_imputed = X.copy()
#     X_imputed[cols_with_zeros] = X_imputed[cols_with_zeros].replace(0, np.nan)
    
#     if method == 'mean':
#         imputer = SimpleImputer(strategy='mean')
#         X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
#     elif method == 'median':
#         imputer = SimpleImputer(strategy='median')
#         X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
#     elif method == 'knn':
#         imputer = KNNImputer(n_neighbors=5)
#         X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
#     elif method == 'random_sample':
#         for column in cols_with_zeros:
#             population = X_imputed[column][X_imputed[column] != 0]
#             if not population.empty:
#                 X_imputed.loc[X_imputed[column].isna(), column] = random.choices(population.tolist(), k=X_imputed[column].isna().sum())
#             else:
#                 mean_value = X_imputed[column].mean()
#                 X_imputed.loc[X_imputed[column].isna(), column] = mean_value
#     elif method == 'mice':
#         imputer = IterativeImputer(max_iter=10, random_state=42)
#         X_imputed[cols_with_zeros] = imputer.fit_transform(X_imputed[cols_with_zeros])
    
#     # Replace NaN back to 0 (if you want to keep consistency in the data)
#     X_imputed[cols_with_zeros] = X_imputed[cols_with_zeros].fillna(0)
    
#     return X_imputed