In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve
import sklearn.model_selection as model_selection

In [None]:
train_data_path = 'data/train.csv'
X = pd.read_csv(train_data_path) 

# What is this
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]

print(X.shape)
print(X.columns)
X.head()

In [None]:
labels_data_path = 'data/labels.csv'
y = pd.read_csv(labels_data_path)
y = y.loc[:, ~y.columns.str.contains('^Unnamed')]

y.head()

In [None]:
test_data_path = 'data/test.csv'
test_data = pd.read_csv(test_data_path) 
test_data.shape


# Preprocessing

In [None]:
#TODO: Preprocessing

# 1) One hot encode variables with string values
# 2) Make correlation matrix to eliminate features with no effect

In [None]:
X_train, X_temp, y_train, y_temp = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = model_selection.train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)


In [None]:
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# import numpy as np

# # Separate numerical and categorical columns
# categorical_features = X_train.select_dtypes(include=['object']).columns
# numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# # One-hot encode categorical features
# encoder = OneHotEncoder(handle_unknown='ignore')
# X_train_cat_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
# X_test_cat_encoded = encoder.transform(X_test[categorical_features]).toarray()

# # Normalize numerical features
# scaler = StandardScaler()
# X_train_num_scaled = scaler.fit_transform(X_train[numerical_features])
# X_test_num_scaled = scaler.transform(X_test[numerical_features])

# # Combine processed categorical and numerical features
# X_train_processed = pd.DataFrame(
#     data=np.hstack((X_train_num_scaled, X_train_cat_encoded)),
#     columns=list(numerical_features) + list(encoder.get_feature_names_out(categorical_features))
# )
# X_test_processed = pd.DataFrame(
#     data=np.hstack((X_test_num_scaled, X_test_cat_encoded)),
#     columns=list(numerical_features) + list(encoder.get_feature_names_out(categorical_features))
# )

# Hyperparam sweep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Preprocess categorical data
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),         # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # Encode categorical features
    ]
)

# Create a pipeline with preprocessing and the model
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(random_state=0))
])

# Define the hyperparameter grid for the Random Forest
param_grid = {
    'rf__n_estimators': [200, 300],  # Prefix with 'rf__' for pipeline
    'rf__max_depth': [20, 30],
    'rf__min_samples_split': [2, 5, 6],
    'rf__min_samples_leaf': [2, 4, 6],
}

# Perform grid search
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, 
                           scoring='f1', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Test the model on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)    

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")


# Model evaluation

In [None]:
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Test the model on the test set
# y_pred = grid_search.best_estimator_.predict(X_test)
clf = RandomForestClassifier(n_estimators=best_params['rf__n_estimators'], 
                                min_samples_split=best_params['rf__min_samples_split'], 
                                min_samples_leaf=best_params['rf__min_samples_leaf'],
                                max_depth=best_params['rf__max_depth'])

clf.fit(X_train, y_train)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary') # Adjust 'binary' for your task
f1 = f1_score(y_test, y_pred, average='binary')               # Adjust 'binary' for your task

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

In [44]:
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Train the model using the best parameters
# best_rf = RandomForestClassifier(n_estimators=best_params['rf__n_estimators'], 
#                                 min_samples_split=best_params['rf__min_samples_split'], 
#                                 min_samples_leaf=best_params['rf__min_samples_leaf'],
#                                 max_depth=best_params['rf__max_depth'])
# best_rf.fit(X_train, y_train)

# # Evaluate on the test set
# y_pred = best_rf.predict(X_test)

# # Calculate evaluation metrics
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='binary') # Adjust 'binary' for your task
# f1 = f1_score(y_test, y_pred, average='binary')               # Adjust 'binary' for your task

# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"F1 Score: {f1:.4f}")



# Best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Test the model on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred) # Adjust 'binary' for your task
f1 = f1_score(y_test, y_pred)               # Adjust 'binary' for your task

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Hyperparameters: {'rf__max_depth': 20, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 300}
Accuracy: 0.8640
Precision: 0.5667
F1 Score: 0.1916
