<a href="https://colab.research.google.com/github/KY-2000/AutomatedMLPipeline/blob/main/AutomatedPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm
!pip install tensorboardX
!pip install bayesian-optimization

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2
Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
import csv
import pandas as pd
import lightgbm as lgb
from functools import partial
from tensorboardX import SummaryWriter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization, UtilityFunction
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset read later in this
# notebook lives under this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the TensorBoard notebook extension. This registers the `tensorboard`
# line magic that the final cell of this notebook invokes to display the runs.
%load_ext tensorboard

In [None]:
# ---------------------------------------------------------------------------
# Run-wide state shared with the incremental training loop further down.
# ---------------------------------------------------------------------------

# Rows collected so far across the processed date windows (starts out empty).
cumulative_df = pd.DataFrame()

# Metric name -> container for recorded values (initialised with an empty list).
metrics_data = dict(accuracy=[])

# Writer that produces the TensorBoard summary file (default log location).
writer = SummaryWriter()

# LightGBM hyper-parameter search space as (lower, upper) bounds.
# NOTE(review): `optimize_lgb` carries its own default bounds, so this dict is
# not what the optimiser uses unless it is wired in explicitly.
param_ranges = dict(
    num_leaves=(10, 3000),
    max_depth=(3, 20),
    min_data_in_leaf=(10, 500),
    feature_fraction=(0.0001, 1.0),
    bagging_fraction=(0.0001, 1.0),
    learning_rate=(0.00001, 0.5),
)

# First date window, inclusive on both ends.
start_date = pd.Timestamp('2014-12-03')
end_date = pd.Timestamp('2016-11-30')

# Step index passed to the summary writer; starts at zero.
global_step = 0

# Read the loan dataset. QUOTE_NONE keeps the literal double quotes in the raw
# text, so they are stripped from the header names right after loading.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/loan.csv',
    encoding='latin1',
    quoting=csv.QUOTE_NONE,
)
df.columns = [column_name.replace('"', '') for column_name in df.columns]

# Characters 1..10 of the raw field are kept (skipping the first character)
# and parsed as a datetime.
df['applicationDate'] = pd.to_datetime(df['applicationDate'].str[1:11])

# Loop-control flag for the incremental training loop: it stays False until the
# loop decides that every row of the original DataFrame has been consumed.
all_data_used = False

# NOTE: the triple-quoted string below is a bare expression statement, not live
# code - it holds a disabled draft of `adjust_param_ranges` and is never executed.
# Review notes should it ever be re-enabled:
#   * its docstring lists `score` / `prev_score`, but the signature takes
#     `metrics_data` instead;
#   * it reads metrics_data['accuracy'][-1] and [-2], so it needs a list holding
#     at least two recorded accuracies;
#   * it uses `np`, which the import cell of this notebook does not import;
#   * np.percentile expects q on a 0-100 scale, so with the default
#     top_percentile=0.9 it requests the 0.1th and 0.9th percentiles -
#     presumably 10 and 90 were intended; confirm before use.
"""
def adjust_param_ranges(param_ranges, best_params, metrics_data, top_percentile=0.9):

    Adjusts the parameter ranges for LightGBM based on the previous tuning results
    and the model's performance.

    Args:
        param_ranges (dict): A dictionary containing the current parameter ranges.
        best_params (dict): The best parameters found in the previous tuning iteration.
        score (float): The current model's performance score.
        prev_score (float): The previous model's performance score.
        top_percentile (float): The percentile to consider for adjusting the ranges.

    Returns:
        dict: The updated parameter ranges.

    updated_ranges = param_ranges.copy()

    # Determine if the model's performance improved or deteriorated
    performance_improved = metrics_data['accuracy'][-1] > metrics_data['accuracy'][-2]

    for param, (lower, upper) in param_ranges.items():
        best_value = best_params[param]

        # Calculate the percentile range around the best value
        percentile_range = np.percentile(np.array([lower, best_value, upper]), [1 - top_percentile, top_percentile])

        if performance_improved:
            # Narrow down the range around the best value
            updated_ranges[param] = percentile_range
        else:
            # Widen the range and explore new regions
            lower_bound = min(lower, best_value - (upper - best_value))
            upper_bound = max(upper, best_value + (best_value - lower))
            updated_ranges[param] = (lower_bound, upper_bound)

    return updated_ranges
"""

def lgb_cv(num_leaves, max_depth, min_data_in_leaf, feature_fraction, bagging_fraction, learning_rate, X_train, X_test, y_train, y_test):
    """Fit one LightGBM classifier and return its accuracy on the evaluation split.

    This is the objective evaluated by the Bayesian optimiser, which proposes
    every hyper-parameter as a float; the integer-valued ones are truncated
    with ``int()`` before being handed to LightGBM.

    Args:
        num_leaves: Proposed number of leaves (truncated to int).
        max_depth: Proposed maximum tree depth (truncated to int).
        min_data_in_leaf: Proposed minimum samples per leaf (truncated to int).
        feature_fraction: Proposed feature subsampling ratio, clamped into (0, 1].
        bagging_fraction: Proposed row subsampling ratio, clamped into (0, 1].
        learning_rate: Proposed learning rate, passed through unchanged.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the accuracy is computed on.

    Returns:
        The accuracy of the fitted model on ``X_test`` / ``y_test``.
    """
    # LightGBM requires both fractions to be strictly greater than zero, so the
    # lower clamp is a tiny positive value rather than 0.
    min_fraction = 1e-6

    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
    # effect when bagging_freq is non-zero; bagging_freq is not set here, so
    # this search dimension may currently be inert - confirm.
    model = lgb.LGBMClassifier(
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        min_data_in_leaf=int(min_data_in_leaf),
        feature_fraction=min(max(feature_fraction, min_fraction), 1),
        bagging_fraction=min(max(bagging_fraction, min_fraction), 1),
        learning_rate=learning_rate,
        random_state=42,
        # Silence per-fit info logs; the optimiser calls this function many
        # times and the log volume otherwise buries the optimisation table.
        verbose=-1,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def optimize_lgb(X_train, X_test, y_train, y_test, pbounds=None, init_points=5, n_iter=25):
    """Search LightGBM hyper-parameters with Bayesian optimisation.

    The objective is ``lgb_cv`` with the four data splits bound in, so the
    optimiser only varies the hyper-parameters named in ``pbounds``.

    Args:
        X_train, y_train: Data each candidate model is fitted on.
        X_test, y_test: Data each candidate model is scored on.
        pbounds: Optional mapping of hyper-parameter name to a
            ``(lower, upper)`` tuple. When omitted, the built-in defaults
            below are used. The keys must match ``lgb_cv``'s keyword names.
        init_points: Number of random exploration evaluations.
        n_iter: Number of guided optimisation evaluations.

    Returns:
        The parameter dict of the best evaluation (values are floats as
        proposed by the optimiser; integer-valued ones are not yet truncated).
    """
    if pbounds is None:
        pbounds = {
            'num_leaves': (10, 30000),
            'max_depth': (3, 30),
            'min_data_in_leaf': (10, 5000),
            'feature_fraction': (0.00001, 1.0),
            'bagging_fraction': (0.00001, 1.0),
            'learning_rate': (0.00001, 0.5)
        }

    objective = partial(lgb_cv, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    optimizer = BayesianOptimization(
        f=objective,
        # Copy so the caller's mapping is never shared with the optimiser.
        pbounds=dict(pbounds),
        random_state=42,
        verbose=2
    )
    optimizer.set_gp_params(alpha=1e-3, n_restarts_optimizer=5)
    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    print("Best parameters: ", best['params'])
    print("Maximum accuracy: ", best['target'])
    return best['params']

# Incremental retraining: each pass extends the training window to `end_date`,
# re-tunes LightGBM on everything seen so far, reports hold-out metrics and
# logs the accuracy to TensorBoard, then advances the window by one month.

# `start_date` advances every pass, so remember where the first window began.
first_date = start_date

# The loop ends once the window reaches the newest application in the file.
# Comparing dates (instead of row counts) guarantees termination even when
# some rows have a missing date or one earlier than the first window.
last_date = df['applicationDate'].max()
if pd.isna(last_date):
    raise ValueError("applicationDate contains no valid dates; nothing to train on")

# Columns that are label-encoded to integers before training.
categorical_columns = ['anon_ssn', 'payFrequency', 'applicationDate', 'originated', 'originatedDate', 'approved', 'state', 'leadType', 'fpStatus', 'clarityFraudId']

# Hyper-parameters the optimiser proposes as floats but LightGBM needs as ints.
integer_params = ("max_depth", "min_data_in_leaf", "num_leaves")

try:
    while not all_data_used:
        # Every row from the start of the first window up to the current
        # window end; rebuilt from `df` in one pass rather than grown by concat.
        in_window = (df['applicationDate'] >= first_date) & (df['applicationDate'] <= end_date)
        cumulative_df = df[in_window]

        # This is the last pass when the window already covers the newest row.
        all_data_used = end_date >= last_date

        # Split the data into features (X) and target variable (y)
        X = cumulative_df.drop(columns=['loanStatus', 'loanId'])
        y = cumulative_df['loanStatus']

        # Encode categorical columns (a fresh encoder per column)
        for col in categorical_columns:
            X[col] = LabelEncoder().fit_transform(X[col])

        # Hold out 20% for the final report; the optimiser never sees these rows.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Carve a validation slice out of the training rows for the search, so
        # hyper-parameters are not selected on the data used for the report.
        X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

        # Tune hyper-parameters on the fit/validation split
        best_params = optimize_lgb(X_fit, X_val, y_fit, y_val)
        print(best_params)
        best_params = {
            key: int(value) if key in integer_params else value
            for key, value in best_params.items()
        }

        # Build the final LightGBM model on the full training split, with the
        # same seed the tuning objective uses so the two are comparable.
        model = lgb.LGBMClassifier(**best_params, random_state=42)
        model.fit(X_train, y_train)

        # Evaluation: predictions on the untouched test set
        y_pred = model.predict(X_test)

        # Calculate accuracy and keep the full history (the dict is created
        # with a list, so append rather than overwrite it)
        accuracy = accuracy_score(y_test, y_pred)
        metrics_data['accuracy'].append(accuracy)
        print("\nAccuracy:", accuracy)

        # Classification report; zero_division=0 keeps the same numbers as the
        # default but without one warning per class that has no predictions.
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=0))

        # Confusion matrix
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        # Log the most recent value of every metric for this step
        for metric_name, history in metrics_data.items():
            if history:
                writer.add_scalar(metric_name, history[-1], global_step)
        global_step += 1

        # Update start_date and end_date for the next iteration
        start_date = end_date + pd.DateOffset(days=1)
        end_date = start_date + pd.offsets.MonthEnd()
finally:
    # Close the SummaryWriter even if a pass fails part-way through
    writer.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| [0m29       [0m | [0m0.7921   [0m | [0m0.06551  [0m | [0m0.3181   [0m | [0m0.2165   [0m | [0m29.96    [0m | [0m330.9    [0m | [0m2.089e+04[0m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 313187, number of used features: 17
[LightGBM] [Info] Start training from score -7.207818
[LightGBM] [Info] Start training from score -6.145787
[LightGBM] [Info] Start training from score -7.927168
[LightGBM] [Info] Start training from score -7.019766
[LightGBM] [Info] Start training from score -7.229606
[LightGBM] [Info] Start training from score -12.654556
[LightGBM] [Info] Start training from score -3.681078
[LightGBM] [Info] Start training from sc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| [0m29       [0m | [0m0.5633   [0m | [0m0.06551  [0m | [0m0.3181   [0m | [0m0.2165   [0m | [0m29.96    [0m | [0m330.9    [0m | [0m2.089e+04[0m |
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1543
[LightGBM] [Info] Number of data points in the train set: 356640, number of used features: 17
[LightGBM] [Info] Start training from score -7.142575
[LightGBM] [Info] Start training from score -6.298321
[LightGBM] [Info] Start training from score -8.013798
[LightGBM] [Info] Start training from score -6.778129
[LightGBM] [Info] Start training from score -7.111159
[LightGBM] [Info] Start training from score -12.784482
[LightGBM] [Info] Start training from score -3.688543
[LightGBM] [Info] Start training from score -5.215586
[LightGBM] [Info] Start training from score -5.081

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| [0m29       [0m | [0m0.7471   [0m | [0m0.06551  [0m | [0m0.3181   [0m | [0m0.2165   [0m | [0m29.96    [0m | [0m330.9    [0m | [0m2.089e+04[0m |
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 409194, number of used features: 17
[LightGBM] [Info] Start training from score -7.201633
[LightGBM] [Info] Start training from score -6.388156
[LightGBM] [Info] Start training from score -8.039143
[LightGBM] [Info] Start training from score -6.744001
[LightGBM] [Info] Start training from score -7.107814
[LightGBM] [Info] Start training from score -3.802076
[LightGBM] [Info] Start training from score -4.878281
[LightGBM] [Info] Start training from score -4.672892
[LightGBM] [Info] Start training from score -3.8588

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| [0m29       [0m | [0m0.7648   [0m | [0m0.06551  [0m | [0m0.3181   [0m | [0m0.2165   [0m | [0m29.96    [0m | [0m330.9    [0m | [0m2.089e+04[0m |
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 440354, number of used features: 17
[LightGBM] [Info] Start training from score -7.239592
[LightGBM] [Info] Start training from score -6.381950
[LightGBM] [Info] Start training from score -8.112532
[LightGBM] [Info] Start training from score -12.995334
[LightGBM] [Info] Start training from score -6.782728
[LightGBM] [Info] Start training from score -7.071078
[LightGBM] [Info] Start training from score -3.882056
[LightGBM] [Info] Start training from score -4.715130
[LightGBM] [Info] Start training from score -4.430

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| [0m29       [0m | [0m0.6826   [0m | [0m0.06551  [0m | [0m0.3181   [0m | [0m0.2165   [0m | [0m29.96    [0m | [0m330.9    [0m | [0m2.089e+04[0m |
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 462145, number of used features: 17
[LightGBM] [Info] Start training from score -7.256737
[LightGBM] [Info] Start training from score -6.364035
[LightGBM] [Info] Start training from score -8.160832
[LightGBM] [Info] Start training from score -13.043634
[LightGBM] [Info] Start training from score -6.744685
[LightGBM] [Info] Start training from score -7.077487
[LightGBM] [Info] Start training from score -3.933335
[LightGBM] [Info] Start training from sc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                               precision    recall  f1-score   support

                           ""       0.68      0.35      0.46        65
        "CSR Voided New Loan"       0.38      0.01      0.03       230
       "Charged Off Paid Off"       0.00      0.00      0.00        28
         "Credit Return Void"       0.71      0.81      0.76       160
   "Customer Voided New Loan"       0.43      0.18      0.26       114
  "Customver Voided New Loan"       0.00      0.00      0.00         1
        "External Collection"       0.64      0.62      0.63      2286
        "Internal Collection"       0.57      0.34      0.43      1134
                   "New Loan"       0.52      0.85      0.64      1581
              "Paid Off Loan"       0.59      0.58      0.58      2307
    "Pending Application Fee"       0.00      0.00      0.00         0
        "Pending Application"       0.00      0.00      0.00         1
           "Pending Paid Off"       0.00      0.00      0.00        32
     

In [None]:
tensorboard --logdir=runs