In [13]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

scaler = StandardScaler()
X = scaler.fit_transform(X)

dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced']  # Add class weight as an option
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(dt, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

grid_search.fit(X, y)

best_dt = grid_search.best_estimator_

y_pred = best_dt.predict(X)

print("Best Parameters for Decision Tree:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)


print("Decision Tree Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best Parameters for Decision Tree: {'class_weight': 'balanced', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Mean Accuracy with Best Parameters: 0.5561561561561561
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.61      0.67       552
           1       0.48      0.76      0.59       561
           2       0.79      0.49      0.61       552

    accuracy                           0.62      1665
   macro avg       0.68      0.62      0.62      1665
weighted avg       0.68      0.62      0.62      1665

Confusion Matrix:
[[335 199  18]
 [ 81 427  53]
 [ 25 255 272]]


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1665 entries, 0 to 1664
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OHQ845           1665 non-null   float64
 1   OCD150           1665 non-null   float64
 2   HUQ010           1665 non-null   float64
 3   RIDAGEYR         1665 non-null   float64
 4   BPAOCSZ          1665 non-null   float64
 5   BPXOSY2          1665 non-null   float64
 6   BMXWT            1665 non-null   float64
 7   BMXBMI           1665 non-null   float64
 8   BMXARMC          1665 non-null   float64
 9   BMXWAIST         1665 non-null   float64
 10  BMXHIP           1665 non-null   float64
 11  BPQ020           1665 non-null   float64
 12  BPQ101D          1665 non-null   float64
 13  RXQ033           1665 non-null   float64
 14  Diabetes Status  1665 non-null   int64  
dtypes: float64(14), int64(1)
memory usage: 195.2 KB


In [15]:
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data_path = '/workspaces/Final-Year-Project/Cleaned Data/TrainTestData.csv'
df = pd.read_csv(data_path)

# Define features and target directly from the dataset
X = df.drop('Diabetes Status', axis=1)
y = df['Diabetes Status']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Set up the hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200],        # Number of trees in the forest
    'max_depth': [5, 10],               # Maximum depth of the tree
}

# Use GridSearchCV with KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=kf, n_jobs=-1, verbose=1)

# Fit GridSearchCV on the data
grid_search.fit(X, y)

# Get the best model
best_rf = grid_search.best_estimator_

# Evaluate the model with the best parameters
y_pred = best_rf.predict(X)

# Print the best hyperparameters and the mean accuracy from cross-validation
print("Best Parameters for Random Forest:", grid_search.best_params_)
print("Mean Accuracy with Best Parameters:", grid_search.best_score_)

# Classification report and confusion matrix
print("Random Forest Classification Report:")
print(classification_report(y, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters for Random Forest: {'max_depth': 5, 'n_estimators': 100}
Mean Accuracy with Best Parameters: 0.578978978978979
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.71      0.72       552
           1       0.58      0.49      0.53       561
           2       0.66      0.78      0.71       552

    accuracy                           0.66      1665
   macro avg       0.66      0.66      0.66      1665
weighted avg       0.66      0.66      0.65      1665

Confusion Matrix:
[[392 104  56]
 [117 274 170]
 [ 29  91 432]]
