In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the notebook's Spark session, or reuse one that already exists.
anj_spark = (
    SparkSession.builder
    .appName('parbu_data_risk_rating')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/05 14:58:28 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/05 14:58:29 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/05 14:58:29 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/05 14:58:29 WARN util.Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/12/05 14:58:29 WARN util.Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [2]:
 # Load the transaction-history training table from HDFS (Parquet).
 df = anj_spark.read.parquet(
     'hdfs://localhost:9000/machine_learning/trans_train_source/trans_hist_training'
 )

                                                                                

# Table of Contents
- [Problem Statement](#problem-statement)
- [Analytical Solutions](#analytical-solutions)
- [Data Used](#data-used)
- [EDA](#eda)
- [ML Model](#ml-model)
- [Prediction](#prediction)
- [Model Performance](#model-performance)
- [Final Output](#final-output)
- [Conclusion](#conclusion)

# <a id="problem-statement"></a> Problem Statement
The bank needs to know whether a customer is risky or not.

# <a id="analytical-solutions"></a> Analytical Solutions
Two classifiers, Random Forest and Logistic Regression, are trained to address the problem above, and their results are compared.

# <a id="data-used"></a> Data Used

In [6]:
# Preview the first five rows of the training table.
df.show(n=5)

                                                                                

+---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+-----+---------+
|     acid|month_0|month_1|month_2|month_3|month_4|month_5|month_6|month_7|month_8|month_9|month_10|month_11|month_12|label|eltm_txid|
+---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+--------+-----+---------+
|PB3196939|     93|      0|      0|     10|      0|      0|      0|      0|      0|      0|       0|       0|       0|    1|     5068|
|PB5290461|     36|      0|      9|      3|      6|      2|      3|      3|      2|      0|       0|       0|       0|    1|     5068|
|PB5139315|    158|      0|      0|      0|      0|      0|      0|      0|      0|      0|       0|       0|       0|    1|     5068|
|PB5485912|      2|      1|      1|      0|      0|      0|      0|      0|      0|      0|       0|       0|       0|    1|     5068|
|PB5295373|      1|      1|      0|      0|      0|    

In [4]:
# List the column names: the account id, month_0..month_12, the label and eltm_txid.
df.columns

['acid',
 'month_0',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'label',
 'eltm_txid']

# <a id="ml-model"></a> ML Model

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, StringType
# Assemble the 13 monthly columns into the single vector column the classifier reads.
feature_columns = ['month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
                   'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df)

# Keep only the account id, the feature vector and the target for modelling.
data = df_assembled.select("acid", "features", "label")

# Split data into training and testing sets (seeded, so the split is repeatable).
(training_data, test_data) = data.randomSplit([0.7, 0.3], seed=42)

# Train a Random Forest Classifier.
# The forest is seeded like the split: its row and feature sampling is random,
# so an explicit seed is needed for a re-run to fit the same model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, seed=42)
model = rf.fit(training_data)

# Predict on the test set
predictions = model.transform(test_data)

# Evaluate the model (the evaluator is used with its default metric, printed as ROC AUC).
evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)

print("Test Area Under ROC: " + str(auc))

# Assign risk ratings.
# Element 1 of the probability vector is the model's probability for label 1.
get_probability = udf(lambda probability: float(probability[1]), DoubleType())
predictions = predictions.withColumn("Risk_Probability", get_probability(col("probability")))

# Bucket the probability: < 0.3 -> "High", < 0.7 -> "Medium", otherwise "Low".
# NOTE(review): this maps a LOW probability of label 1 to HIGH risk, which is only
# right if label 1 marks non-risky customers -- confirm the label encoding.
risk_udf = udf(lambda prob: "High" if prob < 0.3 else "Medium" if prob < 0.7 else "Low", StringType())
predictions = predictions.withColumn("Risk_Rating", risk_udf(col("Risk_Probability")))

# Display the rating per account.
# The result gets its own name so that `data` keeps holding the modelling frame
# (acid, features, label) instead of being silently replaced by the output table.
risk_ratings = predictions.select("acid", "Risk_Rating")

risk_ratings.show()


24/12/05 15:05:29 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Test Area Under ROC: 0.5


[Stage 28:>                                                         (0 + 1) / 1]

+---------+-----------+
|     acid|Risk_Rating|
+---------+-----------+
|PB2384524|        Low|
|PB2384534|        Low|
|PB2384540|        Low|
|PB2384542|        Low|
|PB2384559|        Low|
|PB2384560|        Low|
|PB2384562|        Low|
|PB2384577|        Low|
|PB2384581|        Low|
|PB2384584|        Low|
|PB2384587|        Low|
|PB2384605|        Low|
|PB2384610|        Low|
|PB2384611|        Low|
|PB2384614|        Low|
|PB2384620|        Low|
|PB2384625|        Low|
|PB2384637|        Low|
|PB2384653|        Low|
|PB2384658|        Low|
+---------+-----------+
only showing top 20 rows



                                                                                

In [8]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, udf


# Assemble the 13 monthly columns into the single vector column the classifier reads.
feature_columns = ['month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
                   'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df)

# Select only required columns for modeling
data = df_assembled.select("acid", "features", "label")

# Split data into training and testing sets (seeded, so the split is repeatable).
(training_data, test_data) = data.randomSplit([0.7, 0.3], seed=42)

# Train a Random Forest Classifier.
# The forest is seeded like the split: its row and feature sampling is random,
# so an explicit seed is needed for a re-run to fit the same model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, seed=42)
model = rf.fit(training_data)

# Predict on the test set
predictions = model.transform(test_data)

# Evaluate the model (the evaluator is used with its default metric, printed as ROC AUC).
evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)

print("Test Area Under ROC: " + str(auc))

# Calculate accuracy.
# Accuracy alone can look high when one label dominates; read it together with the AUC.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_acc.evaluate(predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Assign risk ratings.
# Element 1 of the probability vector is the model's probability for label 1.
# DoubleType / StringType are not imported in this cell; they come from an earlier cell's imports.
get_probability = udf(lambda probability: float(probability[1]), DoubleType())
predictions = predictions.withColumn("Risk_Probability", get_probability(col("probability")))

# Bucket the probability: < 0.3 -> "High", < 0.7 -> "Medium", otherwise "Low".
# NOTE(review): this maps a LOW probability of label 1 to HIGH risk, which is only
# right if label 1 marks non-risky customers -- confirm the label encoding.
risk_udf = udf(lambda prob: "High" if prob < 0.3 else "Medium" if prob < 0.7 else "Low", StringType())
predictions = predictions.withColumn("Risk_Rating", risk_udf(col("Risk_Probability")))

# Display the rating per account.
predictions.select("acid", "Risk_Rating").show()




                                                                                

Test Area Under ROC: 0.5


                                                                                

Accuracy: 99.98%


[Stage 57:>                                                         (0 + 1) / 1]

+---------+-----------+
|     acid|Risk_Rating|
+---------+-----------+
|PB2384524|        Low|
|PB2384534|        Low|
|PB2384540|        Low|
|PB2384542|        Low|
|PB2384559|        Low|
|PB2384560|        Low|
|PB2384562|        Low|
|PB2384577|        Low|
|PB2384581|        Low|
|PB2384584|        Low|
|PB2384587|        Low|
|PB2384605|        Low|
|PB2384610|        Low|
|PB2384611|        Low|
|PB2384614|        Low|
|PB2384620|        Low|
|PB2384625|        Low|
|PB2384637|        Low|
|PB2384653|        Low|
|PB2384658|        Low|
+---------+-----------+
only showing top 20 rows



                                                                                

In [11]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col, udf

# Assemble the 13 monthly columns into the single vector column both classifiers read.
feature_columns = ['month_0', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6',
                   'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df)

# Select only required columns for modeling
data = df_assembled.select("acid", "features", "label")

# Split data into training and testing sets (seeded, so the split is repeatable).
(training_data, test_data) = data.randomSplit([0.7, 0.3], seed=42)

# Train a Random Forest Classifier.
# The forest is seeded like the split: its row and feature sampling is random,
# so an explicit seed is needed for a re-run to fit the same model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, seed=42)
rf_model = rf.fit(training_data)

# Train a Logistic Regression Classifier on the same training split.
lr = LogisticRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(training_data)




                                                                                

# <a id="prediction"></a> Prediction

In [None]:
# Remaining sections of the notebook. These lines are kept as comments because
# this is a code cell: the raw markdown links would raise a SyntaxError when run.
# - [Prediction](#prediction)
# - [Model Performance](#model-performance)
# - [Final Output](#final-output)
# - [Conclusion](#conclusion)

In [13]:
# Score the held-out test split with both fitted models:
# Random Forest first, Logistic Regression second.
rf_predictions, lr_predictions = [
    fitted.transform(test_data) for fitted in (rf_model, lr_model)
]





# <a id="model-performance"></a> Model Performance

In [None]:
# --- Area under the ROC curve (the evaluator's default metric, spelled out here) ---

# Random Forest
rf_evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
rf_auc = rf_evaluator.evaluate(rf_predictions)
print("Random Forest Test Area Under ROC: ", rf_auc, sep="")

# Logistic Regression
lr_evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
lr_auc = lr_evaluator.evaluate(lr_predictions)
print("Logistic Regression Test Area Under ROC: ", lr_auc, sep="")

# --- Accuracy: share of test rows whose prediction equals the label ---

# Random Forest
rf_evaluator_acc = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
rf_accuracy = rf_evaluator_acc.evaluate(rf_predictions)
print("Random Forest Accuracy: {:.2f}%".format(100 * rf_accuracy))

# Logistic Regression
lr_evaluator_acc = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
lr_accuracy = lr_evaluator_acc.evaluate(lr_predictions)
print("Logistic Regression Accuracy: {:.2f}%".format(100 * lr_accuracy))

In [10]:
from pyspark.sql.functions import col

def confusion_matrix(predictions):
    """Print how many rows fall into each (label, prediction) pair.

    Args:
        predictions: a DataFrame-like object with "label" and "prediction"
            columns, e.g. the output of a fitted model's ``transform``.

    Returns:
        Whatever ``show()`` returns. For a Spark DataFrame that is ``None``:
        the table is printed, not handed back to the caller.
    """
    by_label_and_prediction = predictions.groupBy("label", "prediction")
    pair_counts = by_label_and_prediction.count()
    return pair_counts.show()

# Print one labelled confusion table per model, Random Forest first.
for heading, scored in (
    ("Random Forest Confusion Matrix:", rf_predictions),
    ("Logistic Regression Confusion Matrix:", lr_predictions),
):
    print(heading)
    confusion_matrix(scored)


Random Forest Confusion Matrix:


                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       1.0|    5|
|    1|       1.0|25185|
+-----+----------+-----+

Logistic Regression Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       1.0|    5|
|    1|       1.0|25185|
+-----+----------+-----+



# <a id="conclusion"></a> Conclusion
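Both the Random Forest and the Logistic Regression model perform identically on the test data, each reaching an accuracy of 99.98%. Taken on its own, this figure suggests that the models make very few errors in their predictions.
However, the confusion matrices reveal that both models predict the majority class (label 1) for every test row: all 25,185 label-1 rows are classified correctly, while all 5 label-0 rows are misclassified. The 99.98% accuracy is therefore simply the share of label 1 in the test set (25,185 of 25,190), and the Random Forest runs above report a test AUC of 0.5, i.e. no better than chance at ranking the two classes. This behaviour is characteristic of a heavily imbalanced dataset, where the majority class (label 1) dominates the predictions, rather than of overfitting; accuracy alone is not a meaningful measure of model quality here.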