In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the feature table and the target table from CSV files in the working directory
features_path = r"features.csv"
target_path = r"target.csv"

X = pd.read_csv(features_path)
y = pd.read_csv(target_path)

In [3]:
sample_size = 50  # Number of records to draw for the experiment

# Combine features and target into a single frame.
# Work on a copy: `df = X` would only alias the feature frame, so adding the
# 'deposit' column would also add it to X itself.
df = X.copy()
df['deposit'] = y

# Draw a stratified random sample of `sample_size` records; the remainder of
# the split is discarded.
sample_df, _ = train_test_split(
    df,
    train_size=sample_size,
    stratify=df['deposit'],
    random_state=42
)

sample_df.shape  # Display the shape to confirm the sample has the expected size

(50, 40)

In [4]:
# Relative frequency of each 'deposit' class in the sample (class-balance check)
class_shares = sample_df['deposit'].value_counts(normalize=True)
class_shares

deposit
0    0.52
1    0.48
Name: proportion, dtype: float64

In [5]:
# Separate the sampled frame into the 'deposit' label (y) and the predictors (X)
y = sample_df['deposit']
X = sample_df.drop(columns='deposit')

# Hold out 25% of the sample for testing; the other 75% is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

# Fit a default random forest on the training split only to rank the features
demo_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
feature_importances = demo_model.feature_importances_

# Pair every column name with its importance score
feature_importance_df = pd.DataFrame(
    {"Feature": X.columns, 'Importance': feature_importances}
)

feature_importance_df.head()

Unnamed: 0,Feature,Importance
0,age,0.104708
1,marital,0.037838
2,credit_on_default,0.0
3,annual_balance,0.077423
4,housing_loan,0.097818


In [6]:
# Rank features from most to least important. sort_values returns a new frame,
# so reassigning avoids mutating in place a frame that was already displayed.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Keep the names of the five top-ranked features. `.iloc` makes the positional
# slice explicit: after sorting, the index still carries the original integer
# labels, so a bare `[0:5]` reads as if it might select labels 0..4.
selected_features = feature_importance_df['Feature'].iloc[:5].tolist()

print(selected_features)

feature_importance_df.head()

['contact_duration', 'age', 'housing_loan', 'annual_balance', 'contact_day']


Unnamed: 0,Feature,Importance
7,contact_duration,0.212132
0,age,0.104708
4,housing_loan,0.097818
3,annual_balance,0.077423
6,contact_day,0.072956


In [7]:
# Restrict both splits to the selected feature columns (in ranked order)
X_train, X_test = (split[selected_features] for split in (X_train, X_test))

X_train.head()

Unnamed: 0,contact_duration,age,housing_loan,annual_balance,contact_day
423,829,30,0,5,10
1697,439,26,0,3511,30
9108,91,38,1,-397,21
3405,1093,30,0,3779,13
7387,322,46,1,-29,12


In [8]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: scaling must not change the shape of either split
for original, scaled in ((X_train, X_train_scaled), (X_test, X_test_scaled)):
    assert original.shape == scaled.shape

In [9]:
# Hyperparameter values to try; the grid search evaluates every combination
param_grid = dict(
    n_estimators=[100, 200, 300],        # number of trees in the forest
    max_depth=[None, 10, 20, 30],        # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],        # minimum samples required to split an internal node
    min_samples_leaf=[1, 2, 4],          # minimum samples required at a leaf node
    max_features=['sqrt', 'log2'],       # number of features considered for the best split
    bootstrap=[True, False],             # whether trees are built on bootstrap samples
)

rf_model = RandomForestClassifier(random_state=42)  # Initialize the base estimator to tune

# Exhaustive search scored on accuracy with 2-fold cross-validation
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=2,
)
# Fit on the scaled training split of the small sample to keep the search cheap
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Persist the best estimator found by the search together with the fitted scaler
best_model = grid_search.best_estimator_
joblib.dump(best_model, r"../myapp/ml_models/rf_model.pkl")
joblib.dump(scaler, r"../myapp/ml_models/scaler.pkl")

Fitting 2 folds for each of 432 candidates, totalling 864 fits
Best parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.7309941520467836


['../myapp/ml_models/scaler.pkl']

In [11]:
# Evaluate the tuned model on the held-out (scaled) test split
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Show the overall accuracy, a blank line, then the per-class report
print(f"The model's accuracy is: {round(accuracy, 2)}")
print()
print(report)


The model's accuracy is: 0.54

              precision    recall  f1-score   support

           0       0.56      0.71      0.62         7
           1       0.50      0.33      0.40         6

    accuracy                           0.54        13
   macro avg       0.53      0.52      0.51        13
weighted avg       0.53      0.54      0.52        13

