In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6f06ce8d9b0df035ec7f16deabd5efd627f873bd40b117c35f568e675656a9f2
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from xgboost.spark import SparkXGBClassifier
from xgboost.spark import SparkXGBClassifierModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
import time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [None]:
spark = SparkSession.builder \
    .appName("XGBoost Bank Loan") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.driver.maxResultSize", "8g") \
    .getOrCreate()

# **Read train and test data.**

In [None]:
train = spark.read.parquet("/content/drive/MyDrive/Bank Project/final_acc_now_delinq_data", header=True, inferSchema=True)

In [None]:
test = spark.read.parquet("/content/drive/MyDrive/Bank Project/final_acc_now_delinq_test_", header=True, inferSchema=True)

# **Create vector columns for train and test features.**

In [None]:
feature_columns_train = train.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns_train, outputCol="features")
train = assembler.transform(train)

In [None]:
feature_columns_test = test.columns[:-1]
assembler_test = VectorAssembler(inputCols=feature_columns_test, outputCol="features")
test = assembler_test.transform(test)

# **Build XGBoost Model**

In [None]:
xgb = SparkXGBClassifier(
    features_col="features",
    label_col="acc_now_delinq",
    prediction_col="prediction",
    seed=42,
    subsample=0.9,
    reg_alpha=0.5,
    gamma=0.2,
    eval_metric='aucpr',
    num_workers=2,
    verbosity=1,
    max_depth = 3
)

### Hyperparameters' tuning.

In [None]:
paramGrid = ParamGridBuilder() \
    .addGrid(xgb.n_estimators, [100]) \
    .addGrid(xgb.learning_rate, [0.01, 0.1]) \
    .build()

### Create evaluator.

In [None]:
evaluator = BinaryClassificationEvaluator(
    labelCol="acc_now_delinq",
    rawPredictionCol="prediction",
    metricName="areaUnderPR"
)

### Crossvalidation with 3 folds.

In [None]:
cv = CrossValidator(estimator=xgb, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)

In [None]:
cv_model = cv.fit(test)

INFO:XGBoost-PySpark:Running xgboost-2.1.1 on 2 workers with
	booster params: {'device': 'cpu', 'eval_metric': 'aucpr', 'gamma': 0.2, 'learning_rate': 0.01, 'max_depth': 3, 'objective': 'binary:logistic', 'reg_alpha': 0.5, 'subsample': 0.9, 'verbosity': 1, 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.1 on 2 workers with
	booster params: {'device': 'cpu', 'eval_metric': 'aucpr', 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'reg_alpha': 0.5, 'subsample': 0.9, 'verbosity': 1, 'seed': 42, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!
INFO:XGBoost-PySpark:Running xgboost-2.1.1 on 2 workers with
	booster params: {'device': 'cpu',

### Model evaluation.

In [None]:
predictions = cv_model.transform(test)

In [None]:
def classification_report(predictions):
    # Calculate metrics for class 0
    tp_0 = predictions.filter((col("acc_now_delinq") == 0) & (col("prediction") == 0)).count()
    fp_0 = predictions.filter((col("acc_now_delinq") == 1) & (col("prediction") == 0)).count()
    fn_0 = predictions.filter((col("acc_now_delinq") == 0) & (col("prediction") == 1)).count()
    tn_0 = predictions.filter((col("acc_now_delinq") == 1) & (col("prediction") == 1)).count()

    precision_0 = tp_0 / (tp_0 + fp_0) if (tp_0 + fp_0) > 0 else 0
    recall_0 = tp_0 / (tp_0 + fn_0) if (tp_0 + fn_0) > 0 else 0
    f1_score_0 = 2 * (precision_0 * recall_0) / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0
    support_0 = tp_0 + fn_0

    # Calculate metrics for class 1
    tp_1 = predictions.filter((col("acc_now_delinq") == 1) & (col("prediction") == 1)).count()
    fp_1 = predictions.filter((col("acc_now_delinq") == 0) & (col("prediction") == 1)).count()
    fn_1 = predictions.filter((col("acc_now_delinq") == 1) & (col("prediction") == 0)).count()
    tn_1 = predictions.filter((col("acc_now_delinq") == 0) & (col("prediction") == 0)).count()

    precision_1 = tp_1 / (tp_1 + fp_1) if (tp_1 + fp_1) > 0 else 0
    recall_1 = tp_1 / (tp_1 + fn_1) if (tp_1 + fn_1) > 0 else 0
    f1_score_1 = 2 * (precision_1 * recall_1) / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0
    support_1 = tp_1 + fn_1

    # Print classification report
    print("Classification Report:")
    print(f"Class 0:")
    print(f"  Precision: {precision_0:.4f}")
    print(f"  Recall: {recall_0:.4f}")
    print(f"  F1 Score: {f1_score_0:.4f}")
    print(f"  Support: {support_0}")

    print(f"Class 1:")
    print(f"  Precision: {precision_1:.4f}")
    print(f"  Recall: {recall_1:.4f}")
    print(f"  F1 Score: {f1_score_1:.4f}")
    print(f"  Support: {support_1}")


    roc_auc = evaluator.evaluate(predictions)
    print(f"ROC AUC: {roc_auc:.4f}")

In [None]:
classification_report(predictions)

Classification Report:
Class 0:
  Precision: 0.8300
  Recall: 0.9598
  F1 Score: 0.8902
  Support: 444341
Class 1:
  Precision: 0.6203
  Recall: 0.2505
  F1 Score: 0.3569
  Support: 116509
ROC AUC: 0.4657


# **Build Neural Network Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from pyspark.ml.functions import vector_to_array

In [None]:
from re import X
train = train.sample(fraction=0.3)
X = train.select("features")
y = train.select("acc_now_delinq")

In [None]:
X_pd = X.toPandas()
y_pd = y.toPandas()

In [None]:
X_np = np.array(X_pd["features"].tolist())
y_np = np.array(y_pd).flatten()

In [None]:
model = Sequential()
model.add(Dense(128, input_dim=X_np.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
adam = Adam(
    learning_rate=0.001,
    epsilon=1e-07
)

model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
             tf.keras.metrics.AUC(name='auc'),
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
)

# Set up early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
history = model.fit(
    X_np, y_np,
    epochs=5,
    batch_size=32,
    validation_split=0,
    callbacks=[early_stopping]
)

Epoch 1/5
[1m21835/21835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 3ms/step - accuracy: 0.7719 - auc: 0.8519 - loss: 0.4697 - precision: 0.7555 - recall: 0.8027
Epoch 2/5


  current = self.get_monitor_value(logs)


[1m21835/21835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 3ms/step - accuracy: 0.8242 - auc: 0.9027 - loss: 0.3869 - precision: 0.8045 - recall: 0.8561
Epoch 3/5
[1m21835/21835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 3ms/step - accuracy: 0.8376 - auc: 0.9137 - loss: 0.3627 - precision: 0.8128 - recall: 0.8757
Epoch 4/5
[1m21835/21835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 3ms/step - accuracy: 0.8463 - auc: 0.9205 - loss: 0.3471 - precision: 0.8195 - recall: 0.8874
Epoch 5/5
[1m21835/21835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 3ms/step - accuracy: 0.8509 - auc: 0.9245 - loss: 0.3371 - precision: 0.8227 - recall: 0.8945


In [None]:
X_test = test.select("features")
y_test = test.select("acc_now_delinq")

In [None]:
X_test_pd = X_test.toPandas()
y_test_pd = y_test.toPandas()

In [None]:
X_test_np = np.array(X_test_pd["features"].tolist())
y_test_np = np.array(y_test_pd).flatten()

# **Evaluate Model**

In [None]:
model.evaluate(X_test_np, y_test_np)

[1m17527/17527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 2ms/step - accuracy: 0.8002 - auc: 0.7870 - loss: 0.8767 - precision: 0.5140 - recall: 0.6110


[1.1666195392608643,
 0.7640260457992554,
 0.7195919156074524,
 0.4427328407764435,
 0.5254358053207397]