# First Attempt: August 5th, 2025

## Random Classifier Model

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [4]:
encoded_data = pd.read_csv("../data/encoded_df.csv")

In [5]:
encoded_data.head()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,0,42,7,25,117,3,-1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,38,514,18,185,1,-1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,36,602,14,111,2,-1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3,27,34,28,10,2,-1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,26,889,3,902,1,-1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Separate features (X) and target (y)
X = encoded_data.drop('y', axis=1)
y = encoded_data['y']

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


Features (X) shape: (750000, 46)
Target (y) shape: (750000,)

X_train shape: (600000, 46)
X_test shape: (150000, 46)
y_train shape: (600000,)
y_test shape: (150000,)


In [7]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the model
print("\nTraining the Random Forest Classifier...")
model.fit(X_train, y_train)
print("Model training complete.")


Training the Random Forest Classifier...
Model training complete.


In [10]:
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test)
print("Predictions complete.")

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


print("\nTop 10 Feature Importances:")
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances.head(10))


Making predictions on the test set...
Predictions complete.

Model Accuracy: 0.9284

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96    131902
           1       0.76      0.60      0.67     18098

    accuracy                           0.93    150000
   macro avg       0.85      0.79      0.81    150000
weighted avg       0.92      0.93      0.92    150000


Top 10 Feature Importances:
duration            0.490613
balance             0.097815
id                  0.052400
age                 0.050378
day                 0.043892
pdays               0.024953
campaign            0.023956
contact_unknown     0.023686
contact_cellular    0.017787
housing_no          0.017500
dtype: float64
