In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd

In [3]:
# Initialize feature and target datasets
X = pd.read_csv(r"features.csv")

# Load the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn classifiers expect y of shape (n_samples,) and warn when handed
# a column vector. squeeze("columns") only collapses the columns axis, so a
# file with more than one column is returned unchanged.
y = pd.read_csv(r"target.csv").squeeze("columns")

In [4]:
# Split data into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)

# Train demo model to get feature importances.
# The labels are flattened to 1-D before fitting: passing a one-column frame
# makes scikit-learn emit a DataConversionWarning. to_numpy().ravel() works
# whether y_train is a single-column DataFrame or already a Series.
demo_model = RandomForestClassifier(random_state=42)
demo_model.fit(X_train, y_train.to_numpy().ravel())
feature_importances = demo_model.feature_importances_

# Map importances to their features, using the columns of the frame the model
# was actually fitted on so names and importances are guaranteed to line up
feature_importance_df = pd.DataFrame({
    "Feature" : X_train.columns,
    'Importance' : feature_importances
})

feature_importance_df.head()

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,Feature,Importance
0,age,0.078876
1,marital,0.013463
2,credit_on_default,0.001496
3,annual_balance,0.080912
4,housing_loan,0.025059


In [6]:
# Rank the features from most to least important (rebinding the name rather
# than sorting in place)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Names of the five highest-ranked features, as a plain Python list
top_ranked = feature_importance_df['Feature'].iloc[:5]
selected_features = top_ranked.tolist()

print(selected_features)

feature_importance_df.head()

['contact_duration', 'annual_balance', 'age', 'contact_day', 'poutcome']


Unnamed: 0,Feature,Importance
7,contact_duration,0.352638
3,annual_balance,0.080912
0,age,0.078876
6,contact_day,0.070257
11,poutcome,0.044352


In [18]:
# Keep only the columns chosen during feature selection, in both splits
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

X_train.head()

Unnamed: 0,age,contact_month_dec,job_unknown,education_secondary,education_tertiary
8,37,False,False,True,False
3,55,False,False,True,False
6,56,False,False,False,True
41,48,False,False,False,True
46,43,False,False,True,False


In [19]:
# Scale the feature data. The scaler is fitted on the training split only and
# then applied to both splits, so the test split is transformed with the
# statistics learned from the training data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: scaling must leave the number of rows and columns unchanged
assert X_train_scaled.shape == X_train.shape
assert X_test_scaled.shape == X_test.shape

In [22]:
# Hyperparameter grid for the search; GridSearchCV evaluates every combination
param_grid = {
    'n_estimators': [100, 200, 300],          # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],          # Depth of each tree
    'min_samples_split': [2, 5, 10],          # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],            # Minimum samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],         # Number of features to consider for best split
    'bootstrap': [True, False],               # Whether to use bootstrap samples
}

rf_model = RandomForestClassifier(random_state=42)  # Initialize model

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv = 2,
    n_jobs=-1,
    verbose=2
)

# Fit the search on the first N_SEARCH_ROWS training records only, for
# computational reasons. Features and labels must cover the same rows, so the
# labels are sliced positionally (a plain y_train[50] would be a key lookup,
# not a row slice) and flattened to the 1-D shape scikit-learn expects.
N_SEARCH_ROWS = 50
y_search = y_train.iloc[:N_SEARCH_ROWS].to_numpy().ravel()
grid_search.fit(X_train_scaled[:N_SEARCH_ROWS], y_search)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Save the model
# NOTE(review): only the estimator is persisted here. It was trained on
# scaler-transformed inputs, so whatever loads this file has to apply the same
# fitted scaler before predicting -- confirm the scaler is saved elsewhere.
best_model = grid_search.best_estimator_
joblib.dump(best_model, r"../myapp/ml_models/rf_model.pkl")

Fitting 2 folds for each of 432 candidates, totalling 864 fits
Best parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 1.0


  return fit_method(estimator, *args, **kwargs)


['../myapp/ml_models/rf_model.pkl']