In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle


In [2]:
# Load the synthetic data
df = pd.read_csv('synthetic_bank_customers.csv')
df['Target'].count

<bound method Series.count of 0                         Family
1                         Family
2                       Students
3                 Small business
4                 Small business
                 ...            
995                       Family
996               Small business
997                       Family
998               Small business
999    High Networth Individuals
Name: Target, Length: 1000, dtype: object>

In [3]:
# Encode categorical variables
label_encoders = {}
for column in ['Gender', 'Location', 'Online Behavior', 'Interests', 'Values', 'Lifestyle', 'Target']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Scale the data
scaler = StandardScaler()
df[['Age', 'Transaction History']] = scaler.fit_transform(df[['Age', 'Transaction History']])

# Ensure the scaler is fit on all features
feature_columns = ['Age', 'Transaction History', 'Gender', 'Location', 'Online Behavior', 'Interests', 'Values', 'Lifestyle']
scaler.fit(df[feature_columns])

# Split the data
X = df.drop(columns=['Customer ID', 'Name', 'Target'])
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Update the scaler to fit on the entire training set features
scaler.fit(X_train[feature_columns])

In [4]:
# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Predict on the test set
y_pred = clf.predict(X_test)

In [5]:
# Save the model and the encoders
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

with open('label_encoders.pkl', 'wb') as le_file:
    pickle.dump(label_encoders, le_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


# Load the model and other necessary objects
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('label_encoders.pkl', 'rb') as le_file:
    label_encoders = pickle.load(le_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Check the label encoders' unique values
for column, le in label_encoders.items():
    print(f"{column} encoder classes: {le.classes_}")

Gender encoder classes: ['Female' 'Male' 'Non-binary']
Location encoder classes: ['Adelaide' 'Brisbane' 'Hobart' 'Melbourne' 'Perth' 'Sydney' '`']
Online Behavior encoder classes: ['Active' 'Passive']
Interests encoder classes: ['Fashion' 'Finance' 'Music' 'Sports' 'Tech' 'Travel']
Values encoder classes: ['Convenience' 'Customer Service' 'Innovation' 'Low Fees' 'Security']
Lifestyle encoder classes: ['Business Owner' 'Family-Oriented' 'Retired' 'Single' 'Student']
Target encoder classes: ['Family' 'Frequent traveller' 'High Networth Individuals' 'Single'
 'Small business' 'Students' 'Tech savvy customer' 'Young clients']


In [6]:
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

               Feature  Importance
7            Lifestyle    0.324273
3  Transaction History    0.198172
0                  Age    0.157112
5            Interests    0.096772
2             Location    0.076815
6               Values    0.066834
1               Gender    0.049778
4      Online Behavior    0.030244


In [7]:
# Removed Gender, Values & online behavior based on feature importance
# Encode categorical variables 
label_encoders = {}
for column in ['Location',  'Interests', 'Lifestyle', 'Target']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Scale the data
scaler = StandardScaler()
df[['Age', 'Transaction History']] = scaler.fit_transform(df[['Age', 'Transaction History']])

In [8]:

# Ensure the scaler is fit on all features
feature_columns = ['Age', 'Transaction History', 'Location', 'Interests', 'Lifestyle']
scaler.fit(df[feature_columns])

# Split the data
X = df.drop(columns=['Customer ID', 'Name', 'Target'])
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Update the scaler to fit on the entire training set features
scaler.fit(X_train[feature_columns])

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_search.fit(X_train, y_train)

# Best estimator
best_clf = grid_search.best_estimator_


In [10]:
# Predict on the test set using the best estimator
y_pred = best_clf.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.75      0.84        59
           1       0.67      0.50      0.57        12
           2       0.86      0.90      0.88        49
           3       0.49      0.74      0.59        23
           4       0.58      0.88      0.70        33
           5       1.00      0.75      0.86         4
           6       0.60      0.21      0.32        14
           7       1.00      0.17      0.29         6

    accuracy                           0.73       200
   macro avg       0.77      0.61      0.63       200
weighted avg       0.78      0.73      0.73       200



In [11]:
## Can try other techniques Cross-Validation with Stratified K-Fold
## Ensure the training and validation sets maintain the same distribution of target classes.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [12]:
## Ensemble Methods Combine the predictions of multiple models.
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Define the ensemble estimators
estimators = [
    ('rf', best_clf),
    ('xgb', XGBClassifier(random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42))
]

# Initialize and train the StackingClassifier
stack_clf = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier(random_state=42))
stack_clf.fit(X_train, y_train)
y_pred = stack_clf.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 8
[LightGBM] [Info] Start training from score -1.108663
[LightGBM] [Info] Start training from score -2.421932
[LightGBM] [Info] Start training from score -1.566618
[LightGBM] [Info] Start training from score -2.021173
[LightGBM] [Info] Start training from score -1.939680
[LightGBM] [Info] Start training from score -3.688879
[LightGBM] [Info] Start training from score -3.188104
[LightGBM] [Info] Start training from score -3.506558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number

In [None]:
## pickle the model with best estimator has accuracy of 73% as the above Ensemble Methods not converge 
## 'Age', 'Transaction History', 'Location', 'Interests', 'Lifestyle']
# Encode categorical variables

label_encoders_new = {}
for column in ['Location', 'Interests', 'Lifestyle', 'Target']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders_new[column] = le

# Prepare features for scaling
features_for_scaling = df[['Age', 'Transaction History', 'Location', 'Interests', 'Lifestyle']]

# Scale the data
scaler_new = StandardScaler()
scaled_features = scaler_new.fit_transform(features_for_scaling)

# Replace scaled values in the DataFrame
df[['Age', 'Transaction History', 'Location', 'Interests', 'Lifestyle']] = scaled_features

# Splitting into features and target
X = df[['Age', 'Transaction History', 'Location', 'Interests', 'Lifestyle']]
y = df['Target']

# Fit a RandomForestClassifier (Example)
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, error_score='raise')
grid_search.fit(X_train, y_train)

# Best estimator
best_clf = grid_search.best_estimator_

best_clf.fit(X, y)

# Save the model and the encoders
with open('model_new.pkl', 'wb') as model_file:
    pickle.dump(best_clf, model_file)
with open('label_encoders_new.pkl', 'wb') as le_file:
    pickle.dump(label_encoders_new, le_file)
with open('scaler_new.pkl', 'wb') as scaler_file:
    pickle.dump(scaler_new, scaler_file)

# Load the new model and other necessary objects
with open('model_new.pkl', 'rb') as model_file:
    model_new = pickle.load(model_file)
with open('label_encoders_new.pkl', 'rb') as le_file:
    label_encoders_new = pickle.load(le_file)
with open('scaler_new.pkl', 'rb') as scaler_file:
    scaler_new = pickle.load(scaler_file)


