In [11]:
import warnings

warnings.filterwarnings('ignore')

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import time
start_time = time.time()

# Load the data
data = pd.read_csv('https://github.com/craft-mini/ML_Finance_Group3/raw/main/GroupProjectDataSet.csv', sep=',')

# Define the features and the target variable
X = data.drop(columns=['Class'])
y = data['Class']

# Define preprocessing steps for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)])

# Define the Random Forest Classifier model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(n_jobs=-1, class_weight='balanced'))])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameters to tune
param_grid = {
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__max_depth': [None, 5, 10, 15, 20],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 6],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__bootstrap': [True, False]
}

# Tune hyperparameters using GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Fit the best model on the training data
best_model.fit(X_train, y_train)

# Predict on the testing data
y_pred = best_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Best Hyperparameters: {'classifier__bootstrap': False, 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.94      0.76      0.84        21
         1.0       0.94      0.96      0.95       194
         2.0       0.83      0.84      0.84        58
         3.0       0.53      0.62      0.57        13
         4.0       0.67      0.33      0.44         6

    accuracy                           0.90       292
   macro avg       0.78      0.70      0.73       292
weighted avg       0.90      0.90      0.90       292

Elapsed time: 5799.51 seconds
