In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
import json
import os
import joblib

# Device label stored with the results; nothing in this script changes it
device_name = 'cpu'
# Base name of the CSV under ./data, also recorded in the results file
data_set_name = 'diabetes_prediction_dataset_clean'
# Label used as the prefix of the saved results/model file names
model_name = 'DecisionTree_v1_log_loss'

# Keyword arguments forwarded to DecisionTreeClassifier
hyperparameters = dict(
    random_state=42,
    max_depth=None,
    criterion='log_loss',
)

# No CUDA check is performed (not needed for Decision Trees); fixed to the CPU
device = device_name

# Read the dataset from ./data/<data_set_name>.csv
data = pd.read_csv(f'./data/{data_set_name}.csv')

# The 'diabetes' column is the label; every other column is used as a feature
y = data['diabetes']
X = data.drop(columns='diabetes')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features using statistics from the training split only.
# NOTE: tree-based models generally do not need feature scaling; because the
# classifier below is fitted on the scaled values, this same fitted scaler
# must be applied to any data later passed to it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build the classifier from the `hyperparameters` dict defined above
dt_model = DecisionTreeClassifier(**hyperparameters)

# Record the wall-clock time immediately before fitting and announce it
start_time = datetime.now()
print(f'Training started at {start_time}')

# Fit the tree on the scaled training split
dt_model.fit(X_train, y_train)

# Record the wall-clock time right after fitting; the difference between the
# two timestamps (a timedelta) is reported as the training duration
end_time = datetime.now()
print(f'Training finished at {end_time}')
total_time = end_time - start_time
print(f'Training duration: {total_time}')

# Predict labels for the held-out (scaled) test split
predictions = dt_model.predict(X_test)

# Compute the metrics that are stored in the results file
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
report = classification_report(y_true=y_test, y_pred=predictions, output_dict=True)

# Show the accuracy followed by the human-readable text form of the report
print(f'Accuracy: {accuracy}')
print(classification_report(y_true=y_test, y_pred=predictions))

# Summary of this run; written to disk as JSON by save_results below
results = dict(
    model_name=model_name,
    data_set_name=data_set_name,
    # Full parameter set of the fitted estimator, including defaults
    hyperparameters=dt_model.get_params(),
    training_start_time=start_time.strftime('%Y-%m-%d %H:%M:%S'),
    training_end_time=end_time.strftime('%Y-%m-%d %H:%M:%S'),
    # timedelta is not JSON-serializable, so store its string form
    total_training_time=str(total_time),
    accuracy=accuracy,
    classification_report=report,
    device=device_name,
)

# Function to save results
def save_results(results, model_name, save_dir='results'):
    """Write ``results`` as indented JSON to a timestamped file in ``save_dir``.

    The file is named ``{model_name}_{YYYY-MM-DD_HH-MM-SS}.json`` using the
    local time at which the function is called.

    Args:
        results: JSON-serializable object to store.
        model_name: Prefix of the output file name.
        save_dir: Directory to write into; created (with parents) if missing.

    Returns:
        The path of the file that was written.

    Raises:
        TypeError: If ``results`` contains a value that ``json`` cannot
            encode. No file is created in that case.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs()
    os.makedirs(save_dir, exist_ok=True)

    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filepath = os.path.join(save_dir, f'{model_name}_{timestamp}.json')

    # Serialize before opening the file so that an encoding failure cannot
    # leave a truncated, invalid JSON file behind
    payload = json.dumps(results, indent=4)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(payload)

    return filepath

# Function to save model
def save_model(model, model_name, save_dir='./models'):
    """Persist ``model`` with joblib to a timestamped file in ``save_dir``.

    The file is named ``{model_name}_{YYYY-MM-DD_HH-MM-SS}.joblib`` using the
    local time at which the function is called, and a confirmation line with
    the file name is printed.

    Args:
        model: Object to serialize with ``joblib.dump``.
        model_name: Prefix of the output file name.
        save_dir: Directory to write into; created (with parents) if missing.
            Defaults to ``./models``.

    Returns:
        The path of the file that was written.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs()
    os.makedirs(save_dir, exist_ok=True)

    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = f'{model_name}_{timestamp}.joblib'
    filepath = os.path.join(save_dir, filename)

    joblib.dump(model, filepath)
    print(f'Model saved successfully as {filename}')
    return filepath

# Persist the run summary and the fitted classifier
save_results(results, model_name)
save_model(dt_model, model_name)
# The classifier was fitted on scaler-transformed features, so the fitted
# scaler is saved as well; without it the stored model cannot be applied to
# raw (unscaled) data
save_model(scaler, f'{model_name}_scaler')
