In [1]:
# Libs
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
import json
import os
import joblib

<h1 style='color: gold'>
    Variables
</h1>

In [2]:
# default device name (auto-change when CUDA is available)
device_name = 'cpu'
# Data set name (used to load data and save data name in results)
data_set_name = 'diabetes_prediction_dataset_clean_normalized'
# Model name
model_name = 'Random_forest_v1_normalized'

# Model hyperparameters
hyperparameters = {
    'n_estimators': 100,
    'random_state': 42
}

<h1 style='color: gold'>
    Data Preparation
</h1>

In [3]:
# Checking CUDA availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if str(device) == 'cuda': device_name = torch.cuda.get_device_name(0)

# Loading data
data = pd.read_csv(f'./data/{data_set_name}.csv')

# Splitting into features and labels
X = data.drop('diabetes', axis=1)  # features
y = data['diabetes']  # label

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (optional but recommended for Random Forest)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Moving data to GPU
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

<h1 style='color: gold'>
    Functions for Saving Model and Training History
</h1>

In [4]:
# Function to save results
def save_results(results, model_name, save_dir='results'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    now = datetime.now()
    timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')
    filename = f'{model_name}_{timestamp}.json'
    filepath = os.path.join(save_dir, filename)

    with open(filepath, 'w') as f:
        json.dump(results, f, indent=4)

# Function to save model
def save_model(model, model_name):
    model_dir = './models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    now = datetime.now()
    timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')
    filename = f'{model_name}_{timestamp}.joblib'
    filepath = os.path.join(model_dir, filename)

    joblib.dump(model, filepath)
    print(f'Model saved successfully as {filename}')

<h1 style='color: gold'>
    Random Forest Model Implementation
</h1>

In [5]:
# Training start
start_time = datetime.now()
print(f'Training started at {start_time}')

# Initializing Random Forest model in scikit-learn
rf_model = RandomForestClassifier(**hyperparameters)

# Training model on GPU
rf_model.fit(X_train, y_train)

# Training end
end_time = datetime.now()
print(f'Training finished at {end_time}')
total_time = end_time - start_time
print(f'Training duration: {total_time}')

# Predicting on test set
predictions = rf_model.predict(X_test)

# Evaluating model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

# Classification report
report = classification_report(y_test, predictions, output_dict=True)
print(classification_report(y_test, predictions))

Training started at 2024-06-14 18:05:41.264663
Training finished at 2024-06-14 18:05:44.343189
Training duration: 0:00:03.078526
Accuracy: 0.9603130682267373
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     11285
           1       0.94      0.68      0.79      1364

    accuracy                           0.96     12649
   macro avg       0.95      0.84      0.88     12649
weighted avg       0.96      0.96      0.96     12649



<h1 style='color: gold'>
    Saving Results and Model
</h1>

In [6]:
# Saving training results
results = {
    'model_name': model_name,
    'data_set_name': data_set_name,
    'hyperparameters': hyperparameters,
    'training_start_time': start_time.strftime('%Y-%m-%d %H:%M:%S'),
    'training_end_time': end_time.strftime('%Y-%m-%d %H:%M:%S'),
    'total_training_time': str(total_time),
    'accuracy': accuracy,
    'classification_report': report,
    'device': device_name
}

# Saving model
save_results(results, model_name)
save_model(rf_model, model_name)

Model saved successfully as Random_forest_v1_normalized_2024-06-14_18-05-44.joblib
