In [1]:
# Bootstrap PySpark via findspark; this cell runs before any `pyspark` import in the notebook.
import findspark
findspark.init()
# Last expression of the cell: displays the Spark home path that findspark resolved.
findspark.find()

'H:\\SPARK'

In [2]:
import sys

from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) a local SparkSession that uses all local
# cores and a 15g driver heap.
# NOTE(review): keys prefixed with `spark.hadoop.` are handed to the Hadoop
# configuration; confirm that `home.dir` / `conf.dir` are actually honoured on
# this machine (HADOOP_HOME is normally supplied through the environment).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark')
    .config("spark.driver.memory", "15g")
    .config("spark.hadoop.home.dir", "H:/HADOOP/")
    .config("spark.hadoop.conf.dir", "H:/HADOOP/etc/hadoop/")
    .getOrCreate()
)

# Put the project root on the import path (presumably so the `src.*` package
# imported later in the notebook resolves; confirm).
# Raw string: the previous "G:\Dissertation_Project" relied on the invalid
# escape sequence "\D", which newer Python versions warn about and will
# eventually reject. The resulting path string is unchanged.
sys.path.append(r"G:\Dissertation_Project")

# Get SparkContext from the SparkSession
sc = spark.sparkContext


In [3]:
# Bare expression: renders the active SparkSession as this cell's output.
spark

### BASE DATASET

In [4]:
# Load the base conversation CSV; the first row is treated as the header and
# column types are inferred from the data.
base_csv_path = "../../Data/Custom_Datasets/conversation_datasets_GPT.csv"
base_df = spark.read.csv(base_csv_path, inferSchema=True, header=True)

# Preview the first 10 rows without truncating the long text columns.
base_df.show(n=10, truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|Conversation_ID|Attacker_Helper                                                                                                                                                 |Victim                                                                                                                                                                                         |Conversation_Type|
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------

### PREPROCESSED DATASET

In [5]:
# Load the preprocessed conversation CSV; the first row is treated as the
# header and column types are inferred from the data.
preprocessed_csv_path = "../../Data/Preprocessed_Datasets/GPT_dataset_preprocessed.csv"
preprocessed_df = spark.read.csv(preprocessed_csv_path, inferSchema=True, header=True)

# Preview the first 10 rows without truncating the long text columns.
preprocessed_df.show(n=10, truncate=False)

+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [6]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF body: parse the string representation of a (nested) list into a real list
def str_to_array_of_arrays(s):
    """Parse a Python-literal string such as "[['a', 'b'], ['c']]" into a list.

    Args:
        s: String holding a Python list literal, or None for a null cell.

    Returns:
        The value produced by ``ast.literal_eval(s)``, or None when ``s`` is
        None or contains only whitespace.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when the
            text is not a valid Python literal.
    """
    # Null cells arrive as None; literal_eval(None) raises ValueError, which
    # would fail the whole job, so map null/blank input to a null result.
    if s is None or not s.strip():
        return None
    # literal_eval only evaluates literals (no arbitrary code execution).
    return ast.literal_eval(s)

# Register the parser as a Spark UDF whose result type is array<array<string>>.
nested_string_array_type = ArrayType(ArrayType(StringType()))
str_to_array_of_arrays_udf = udf(str_to_array_of_arrays, nested_string_array_type)

# Replace both stringified conversation columns with their parsed nested arrays.
df = preprocessed_df
for conversation_column in ("Attacker_Helper", "Victim"):
    df = df.withColumn(
        conversation_column,
        str_to_array_of_arrays_udf(preprocessed_df[conversation_column]),
    )

# Confirm the two columns are now nested arrays of strings.
df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



### Loading the pipeline and transforming the data

In [7]:
from pyspark.ml import PipelineModel
# FlattenTransformer is not referenced by name below; presumably the saved
# pipeline contains this custom stage and the class must be importable when
# the pipeline is loaded. TODO confirm before removing this import.
from src.CustonTransformers import FlattenTransformer 

# Location of the previously fitted pipeline, relative to the notebook.
pipeline_model_path = "./Pipelines/TF-IDF_Pipeline"

# Load the fitted pipeline from disk.
pipeline = PipelineModel.load(path=pipeline_model_path)

# Apply every pipeline stage to the parsed conversations.
df_assembled = pipeline.transform(df)

### Splitting the Data

In [8]:
# Random split into roughly 80% train / 20% test; the fixed seed keeps the
# partition repeatable across runs.
split_weights = [0.8, 0.2]
train_data, test_data = df_assembled.randomSplit(split_weights, seed=42)

# Preview the training partition at full column width.
train_data.show(truncate=False)

+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## GRADIENT BOOSTED TREES

### Training Gradient Boosted Trees Model

In [15]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Gradient-boosted tree classifier over the assembled feature vector.
gbt = GBTClassifier(
    featuresCol="combined_features",
    labelCol="Conversation_Type",
    predictionCol="Prediction",
)

# Accuracy evaluator; used only for the held-out test report at the end of
# this cell, not for model selection.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="Prediction",
    labelCol='Conversation_Type',
    metricName="accuracy",
)

# Hyperparameter grid: 4 depths x 4 step sizes x 4 iteration counts = 64
# candidates. The addGrid order fixes the order of the resulting param maps.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(gbt.maxDepth, [2, 4, 6, 10])
grid_builder = grid_builder.addGrid(gbt.stepSize, [0.001, 0.01, 0.1, 0.3])
grid_builder = grid_builder.addGrid(gbt.maxIter, [20, 50, 70, 100])
paramGrid = grid_builder.build()

# Model selection uses a single train/validation split (not k-fold cross
# validation). Candidates are ranked by a BinaryClassificationEvaluator left at
# its default metric (areaUnderROC per the PySpark docs), which differs from
# the accuracy metric reported for the test set below.
selection_evaluator = BinaryClassificationEvaluator(labelCol="Conversation_Type")
tvs = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=selection_evaluator,
    parallelism=2,
    seed=42,
)

# Fit every candidate on the training data and keep the best one.
tvsModel = tvs.fit(train_data)

print(tvsModel, '\n')
print(tvsModel.explainParams(), '\n')

# Report accuracy on the held-out test set.
metric_name = evaluator.getMetricName()
print('\nEvaluating with metric -> {}'.format(metric_name))
test_score = evaluator.evaluate(tvsModel.transform(test_data))
print('Evaluators rating for the test set -> {}'.format(test_score))

TrainValidationSplitModel_a032b24d6c75 

estimator: estimator to be cross-validated (current: GBTClassifier_961b4751de9c)
estimatorParamMaps: estimator param maps (current: [{Param(parent='GBTClassifier_961b4751de9c', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2, Param(parent='GBTClassifier_961b4751de9c', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.001, Param(parent='GBTClassifier_961b4751de9c', name='maxIter', doc='max number of iterations (>= 0).'): 20}, {Param(parent='GBTClassifier_961b4751de9c', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2, Param(parent='GBTClassifier_961b4751de9c', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for s

In [16]:
# One validation score per candidate in the hyperparameter grid.
validation_metrics = tvsModel.validationMetrics
print('Train Validation Split Model validationMetrics -> {}'.format(validation_metrics))

# The candidate model selected by the train/validation split.
best_model = tvsModel.bestModel
print('Best Model -> {}'.format(best_model))

Train Validation Split Model validationMetrics -> [0.8584656084656085, 0.8597883597883598, 0.8624338624338624, 0.8571428571428572, 0.8478835978835979, 0.8597883597883598, 0.8624338624338624, 0.8703703703703703, 0.8835978835978836, 0.9126984126984128, 0.9259259259259259, 0.933862433862434, 0.9179894179894179, 0.9285714285714286, 0.9391534391534391, 0.9417989417989419, 0.9021164021164021, 0.8862433862433863, 0.8835978835978836, 0.8835978835978836, 0.8862433862433863, 0.8902116402116402, 0.8902116402116402, 0.9007936507936507, 0.9074074074074074, 0.9074074074074074, 0.9047619047619048, 0.9047619047619048, 0.9179894179894179, 0.9153439153439153, 0.9153439153439153, 0.9074074074074074, 0.8134920634920635, 0.8134920634920635, 0.8134920634920635, 0.8134920634920635, 0.8134920634920635, 0.8187830687830688, 0.8161375661375663, 0.8253968253968255, 0.802910052910053, 0.7962962962962963, 0.843915343915344, 0.794973544973545, 0.8227513227513228, 0.8492063492063492, 0.8492063492063492, 0.84391534391

### Testing

In [17]:
# Score the held-out test set with the selected model.
predictions = tvsModel.bestModel.transform(test_data)

# Keep only the rows where the predicted class disagrees with the label.
is_misclassified = predictions.Prediction != predictions.Conversation_Type
inspected_columns = ["Conversation_ID", "Conversation_Type", "Prediction", "Probability"]

# Show up to 10 misclassified conversations at full column width.
misclassified = predictions.select(*inspected_columns).where(is_misclassified)
misclassified.show(10, truncate=False)

+---------------+-----------------+----------+----------------------------------------+
|Conversation_ID|Conversation_Type|Prediction|Probability                             |
+---------------+-----------------+----------+----------------------------------------+
|GtvAWBM9Prc6wo |0                |1.0       |[0.06304171814689524,0.9369582818531048]|
|VOwbnpYBzqoBgJ |1                |0.0       |[0.711637202104431,0.28836279789556896] |
|Yaf8GGeBAPDBer |0                |1.0       |[0.0893577108370067,0.9106422891629933] |
+---------------+-----------------+----------+----------------------------------------+



In [18]:
# Destination directory for the trained model, relative to the notebook.
model_path = "../Models/Trained_Models/GradientBoostedTrees"

In [19]:
# Persist the whole TrainValidationSplitModel (not just bestModel) to model_path;
# overwrite() is requested so a re-run does not fail on an existing path.
tvsModel.write().overwrite().save(model_path)