In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

# Load the categorical, numerical, and target CSV files
categorical_data = pd.read_csv("categorical.csv")
numerical_data = pd.read_csv("numerical.csv")
target_data = pd.read_csv("target.csv")

# One-hot encode categorical variables
encoder = OneHotEncoder()
categorical_data_encoded = pd.DataFrame(encoder.fit_transform(categorical_data).toarray())

# Combine numerical and encoded categorical data
data = pd.concat([numerical_data, categorical_data_encoded, target_data], axis=1)

# Split the data into features and target variable
X = data.drop("TARGET_B", axis=1)
y = data["TARGET_B"]

# Convert feature names to strings
X.columns = X.columns.astype(str)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Upscale the data using SMOTE
smote = SMOTE(random_state=42)
X_train_upsampled, y_train_upsampled = smote.fit_resample(X_train, y_train)

# Fit and evaluate Random Forest classifier model
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_upsampled, y_train_upsampled)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest classifier model:", accuracy_rf)


Accuracy of Random Forest classifier model: 0.9941833045118692
