In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [4]:
# Load the 7-mer feature table and encode the string targets as integer labels.
data = pd.read_parquet('../../data/features/7-mer_remove.parquet')
label_encoder = LabelEncoder()
# The encoder is fitted on every row, so both splits share one label mapping.
data["Label"] = label_encoder.fit_transform(data["Target"])

# Columns that hold the target or the split flag rather than model inputs.
non_feature_columns = ["Target", "Train", "Label"]

# Split the data into training and validation sets using the 'Train' flag.
# NOTE(review): rows with Train == 0 are used for fitting and Train == 1 for
# validation, which reads inverted relative to the column name -- confirm this
# matches how the flag was written when the features were generated.
train_rows = data["Train"] == 0
val_rows = data["Train"] == 1

X_train = data.loc[train_rows].drop(columns=non_feature_columns)
# Selecting the single column via .loc avoids copying the full-width frame.
y_train = data.loc[train_rows, "Label"]
X_val = data.loc[val_rows].drop(columns=non_feature_columns)
y_val = data.loc[val_rows, "Label"]

In [11]:
# Hyper-parameter grid for the random forest; every combination is trained once.
param_grid = {
    'max_depth': [20, 30, 40, None],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}

# One dict of parameters + validation metrics per trained model.
results = []

# Train and score one forest for each combination of parameters in the grid.
for params in ParameterGrid(param_grid):
    print(f"Training Random Forest: {params}")
    # Fixed random_state so the same combination gives the same forest on re-run;
    # n_jobs=-1 lets scikit-learn use all available cores.
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, **params)

    # Fit on the training split, then predict the held-out validation split.
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    # Validation metrics. Macro-F1 weights every class equally, so it is the
    # metric that moves when minority classes are predicted poorly.
    accuracy = accuracy_score(y_val, y_pred)
    micro_f1 = f1_score(y_val, y_pred, average='micro')
    macro_f1 = f1_score(y_val, y_pred, average='macro')

    # Keys are spelled out (rather than unpacking `params`) so the column order
    # stays fixed; rows appended to an existing CSV must line up with its header.
    results.append({
        'max_depth': params['max_depth'],
        'criterion': params['criterion'],
        'class_weight': params['class_weight'],
        'accuracy': accuracy,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1
    })

    # Progress line so partial results are visible while the grid is running.
    print(f"Params: {params}, Accuracy: {accuracy}, Micro F1: {micro_f1}, Macro F1: {macro_f1}")

# Create a DataFrame from the results list (one row per parameter combination).
results_df = pd.DataFrame(results)

# Snapshot of this run only; the file is overwritten on every execution.
results_df.to_csv('grid_search_results.csv', index=False)


# Cumulative log shared across runs: rows are appended, so re-running the cell
# with the same grid adds duplicate rows to this file.
csv_file_path = 'random_forest_grid_search_results.csv'

# Check if the CSV file already exists
file_exists = os.path.isfile(csv_file_path)

# A file that exists but is empty (e.g. left behind by an interrupted run) has
# no header yet, so it must be treated like a new file; otherwise the rows would
# be appended without column names.
has_existing_rows = file_exists and os.path.getsize(csv_file_path) > 0

# Append to a populated file, otherwise (re)create it.
mode = 'a' if has_existing_rows else 'w'

# Write the header only when the file does not already contain one.
results_df.to_csv(csv_file_path, mode=mode, header=not has_existing_rows, index=False)

Training Random Forest: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10}
Params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10}, Accuracy: 0.8898571283818016, Micro F1: 0.8898571283818016, Macro F1: 0.616161554984792
Training Random Forest: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20}
Params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20}, Accuracy: 0.9524774546559935, Micro F1: 0.9524774546559935, Macro F1: 0.8996423960949935
Training Random Forest: {'class_weight': None, 'criterion': 'gini', 'max_depth': 30}
Params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 30}, Accuracy: 0.9637247948120377, Micro F1: 0.9637247948120377, Macro F1: 0.9512074076776604
Training Random Forest: {'class_weight': None, 'criterion': 'gini', 'max_depth': None}
Params: {'class_weight': None, 'criterion': 'gini', 'max_depth': None}, Accuracy: 0.9660553247542811, Micro F1: 0.9660553247542811, Macro F1: 0.9603726066104322
Training Random Fores