In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Load data
print("Loading data...")
data_path = "dataset/clf_num/jannis.csv"
data = pd.read_csv(data_path)

# 2. Preprocess data
# Assumes the target variable is stored in the last column; every column
# before it is treated as a feature.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
y = target_column.values

# A numeric target is cast to integers so the values act as class labels.
target_is_numeric = np.issubdtype(y.dtype, np.number)
if target_is_numeric:
    y = y.astype(int)

n_instances, n_attributes = X.shape
print(f"Data loaded: {n_instances} instances, {n_attributes} attributes.")

# 3. Parameters
n_experiments = 100
build_times = []      # seconds spent in rf.fit, one entry per experiment
evaluate_times = []   # seconds spent in rf.predict, one entry per experiment
accuracies = []       # accuracy on the held-out split, as a fraction

# 4. Perform the randomized experiments
print(f"\nPerforming {n_experiments} randomized experiments...")
for experiment in range(n_experiments):
    print(f"\nExperiment {experiment + 1} of {n_experiments}")

    # 4.1 Randomly split the data into 66% train and 34% test.
    # The experiment index is the seed, so each run uses a different but
    # reproducible split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.34, random_state=experiment
    )

    # 4.2 Create and train Random Forest model (construction is not timed)
    rf = RandomForestClassifier(
        n_estimators=100, max_depth=None, n_jobs=-1, random_state=experiment
    )

    # perf_counter() is monotonic and high resolution, so the measured
    # interval cannot be distorted by system clock adjustments the way
    # time.time() can.
    start_build = time.perf_counter()
    rf.fit(X_train, y_train)
    end_build = time.perf_counter()
    build_time = end_build - start_build
    build_times.append(build_time)

    # 4.3 Evaluate the model (only the predict call is timed)
    start_evaluate = time.perf_counter()
    y_pred = rf.predict(X_test)
    end_evaluate = time.perf_counter()
    evaluate_time = end_evaluate - start_evaluate
    evaluate_times.append(evaluate_time)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Training time: {build_time:.4f} seconds")
    print(f"Evaluation time: {evaluate_time:.4f} seconds")

# 5. Output Results
# Report the number of results actually collected instead of a hard-coded
# count, so the heading stays correct if the experiment count changes.
print(f"\nResults of {len(accuracies)} Experiments:")

# The loop variables are deliberately not called `time`, which would shadow
# the imported `time` module inside the expressions.
build_times_text = ", ".join(f"{seconds:.4f}" for seconds in build_times)
evaluate_times_text = ", ".join(f"{seconds:.4f}" for seconds in evaluate_times)
accuracies_text = ", ".join(f"{fraction * 100:.2f}" for fraction in accuracies)

print(f"Training times (seconds): {build_times_text}")
print(f"Evaluation times (seconds): {evaluate_times_text}")
print(f"Accuracies (%): {accuracies_text}")

# Calculate and display averages (accuracies are fractions, shown as percent)
print(f"\nAverage Training Time: {np.mean(build_times):.4f} seconds")
print(f"Average Evaluation Time: {np.mean(evaluate_times):.4f} seconds")
print(f"Average Accuracy: {np.mean(accuracies) * 100:.2f}%")


Loading data...
Data loaded: 57580 instances, 54 attributes.

Performing 100 randomized experiments...

Experiment 1 of 100
Accuracy: 78.02%
Training time: 6.6663 seconds
Evaluation time: 0.0899 seconds

Experiment 2 of 100
Accuracy: 78.32%
Training time: 5.0263 seconds
Evaluation time: 0.0885 seconds

Experiment 3 of 100
Accuracy: 78.14%
Training time: 5.1627 seconds
Evaluation time: 0.0671 seconds

Experiment 4 of 100
Accuracy: 78.32%
Training time: 5.2655 seconds
Evaluation time: 0.0863 seconds

Experiment 5 of 100
Accuracy: 78.16%
Training time: 7.1076 seconds
Evaluation time: 0.1678 seconds

Experiment 6 of 100
Accuracy: 78.12%
Training time: 10.2399 seconds
Evaluation time: 0.1610 seconds

Experiment 7 of 100
Accuracy: 78.46%
Training time: 6.4080 seconds
Evaluation time: 0.1093 seconds

Experiment 8 of 100
Accuracy: 78.17%
Training time: 7.7707 seconds
Evaluation time: 0.1800 seconds

Experiment 9 of 100
Accuracy: 78.31%
Training time: 7.3479 seconds
Evaluation time: 0.0830 seco