In [4]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=4703a0406578afaebb401a1a884f4b90a43c9ed788857c958c333db92f005b4c
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from xgboost.spark import SparkXGBClassifier
from xgboost.spark import SparkXGBClassifierModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
import time
import tensorflow as tf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [7]:
spark = SparkSession.builder \
    .appName("XGBoost Bank Loan") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.driver.maxResultSize", "8g") \
    .getOrCreate()

  self.pid = _posixsubprocess.fork_exec(


# **Read Train/Test Data**

In [8]:
train = spark.read.parquet("/content/drive/MyDrive/Bank Project/final_final_delinq_2yrs_data", header=True, inferSchema=True)

In [9]:
test = spark.read.parquet("/content/drive/MyDrive/Bank Project/final_delinq_2yrs_test", header=True, inferSchema=True)

In [10]:
train.groupBy("delinq_2yrs").count().show()

+-----------+-------+
|delinq_2yrs|  count|
+-----------+-------+
|        0.0|1924467|
|        1.0|1641275|
|        4.0|1909161|
|        3.0|1882620|
|        2.0|1846547|
+-----------+-------+



# **Get Feature vector of each data**

In [11]:
feature_columns_train = train.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns_train, outputCol="features")
train = assembler.transform(train)

feature_columns_test = test.columns[:-1]
assembler_test = VectorAssembler(inputCols=feature_columns_test, outputCol="features")
test = assembler_test.transform(test)

# **Convert train data into proper format of keras**

In [12]:
train_0 = train.filter(col("delinq_2yrs") == 0).limit(50000)
train_1 = train.filter(col("delinq_2yrs") == 1).limit(50000)
train_2 = train.filter(col("delinq_2yrs") == 2).limit(50000)
train_3 = train.filter(col("delinq_2yrs") == 3).limit(50000)
train_4 = train.filter(col("delinq_2yrs") == 4).limit(50000)

In [13]:
train = train_0.union(train_1).union(train_2).union(train_3).union(train_4)

In [14]:
del train_0, train_1, train_2, train_3, train_4

In [15]:
X = train.select("features")
y = train.select("delinq_2yrs")

In [16]:
del train

In [17]:
X_pd = X.toPandas()
y_pd = y.toPandas()

In [18]:
del X, y

In [19]:
import numpy as np
X_np = np.array(X_pd["features"].tolist())
y_np = np.array(y_pd).flatten()

# **Convert test data into proper format of keras**

In [20]:
X_test = test.select("features")
y_test = test.select("delinq_2yrs")

In [21]:
X_test_pd = X_test.toPandas()
y_test_pd = y_test.toPandas()

In [22]:
X_test_np = np.array(X_test_pd["features"].tolist())
y_test_np = np.array(y_test_pd).flatten()

# **Build model**

In [25]:
from tensorflow.keras import layers, models
import keras

teacher_model = models.Sequential([
    layers.Input(shape=(X_np.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(5, activation='softmax')
])

spars_categorical_accuracy = keras.metrics.SparseCategoricalAccuracy(
    name="sparse_categorical_accuracy", dtype=None
)
teacher_model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=[spars_categorical_accuracy])

teacher_model.fit(X_np, y_np, epochs=20, batch_size=32, validation_data=(X_test_np, y_test_np))

Epoch 1/20
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 12ms/step - loss: 1.2446 - sparse_categorical_accuracy: 0.4478 - val_loss: 1.2156 - val_sparse_categorical_accuracy: 0.4739
Epoch 2/20
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 12ms/step - loss: 0.5468 - sparse_categorical_accuracy: 0.7652 - val_loss: 1.2324 - val_sparse_categorical_accuracy: 0.5022
Epoch 3/20
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 12ms/step - loss: 0.3862 - sparse_categorical_accuracy: 0.8297 - val_loss: 1.3920 - val_sparse_categorical_accuracy: 0.4341
Epoch 4/20
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 12ms/step - loss: 0.3227 - sparse_categorical_accuracy: 0.8574 - val_loss: 1.1464 - val_sparse_categorical_accuracy: 0.6192
Epoch 5/20
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 11ms/step - loss: 0.2798 - sparse_categorical_accuracy: 0.8784 - val_loss: 1.2450 - val_sparse_catego

<keras.src.callbacks.history.History at 0x7d5763ea4cd0>

In [26]:
soft_labels = teacher_model.predict(X_np)

[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step


In [27]:
student_model = models.Sequential([
    layers.Input(shape=(X_np.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(5, activation='softmax')
])


In [28]:
import tensorflow as tf
from tensorflow.keras import layers, models

def knowledge_distillation_loss(y_true, y_pred, soft_labels, temperature=2.0):
    y_true = tf.keras.utils.to_categorical(y_true, num_classes=5)  # Assuming 5 classes
    y_pred_soft = tf.keras.backend.softmax(y_pred / temperature)

    batch_size = tf.shape(y_true)[0]
    soft_labels_batch = tf.gather(soft_labels, tf.range(batch_size))
    soft_labels_soft = tf.keras.backend.softmax(soft_labels_batch / temperature)

    return tf.keras.backend.categorical_crossentropy(y_true, y_pred) + \
           tf.keras.backend.categorical_crossentropy(soft_labels_soft, y_pred_soft)

student_model.compile(optimizer='adam',
                      loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, soft_labels),
                      metrics=[spars_categorical_accuracy])

student_model.fit(X_np, y_np, epochs=30, batch_size=32, validation_data=(X_test_np, y_test_np))

Epoch 1/30
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 6ms/step - loss: 3.0395 - sparse_categorical_accuracy: 0.6358 - val_loss: 2.8696 - val_sparse_categorical_accuracy: 0.5102
Epoch 2/30
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 7ms/step - loss: 2.7291 - sparse_categorical_accuracy: 0.5271 - val_loss: 2.8926 - val_sparse_categorical_accuracy: 0.5072
Epoch 3/30
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 5ms/step - loss: 2.6178 - sparse_categorical_accuracy: 0.5720 - val_loss: 2.9878 - val_sparse_categorical_accuracy: 0.4364
Epoch 4/30
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 7ms/step - loss: 2.5590 - sparse_categorical_accuracy: 0.5980 - val_loss: 2.8375 - val_sparse_categorical_accuracy: 0.5244
Epoch 5/30
[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 7ms/step - loss: 2.5141 - sparse_categorical_accuracy: 0.6176 - val_loss: 2.8915 - val_sparse_categorical_ac

<keras.src.callbacks.history.History at 0x7d576317a710>

In [30]:
student_model.save("/content/drive/MyDrive/Bank Project/student_model.keras")


  return {key: serialize_keras_object(value) for key, value in obj.items()}


In [31]:
teacher_model.save("/content/drive/MyDrive/Bank Project/teacher_model.keras")

In [32]:
student_model.evaluate(X_test_np, y_test_np)

[1m18317/18317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2ms/step - loss: 3.0836 - sparse_categorical_accuracy: 0.4832


[3.0863757133483887, 0.48137304186820984]