In [59]:
import findspark
findspark.init()
findspark.find()

import pyspark
findspark.find()

'c:\\Users\\johns\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pyspark'

In [60]:
#Create SparkSession
from pyspark import SparkContext
from pyspark.sql import SparkSession
  
spark = SparkSession.builder.master("local[*]").appName('Spark') \
.config("spark.driver.memory", "15g")\
.getOrCreate()


sc = spark.sparkContext 

In [61]:
spark

### BASE DATASET

In [62]:
base_df = spark.read.csv("../../Data/Custom_Datasets/conversation_datasets_GPT.csv", header=True, inferSchema=True)
base_df.show(10, truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|Conversation_ID|Attacker_Helper                                                                                                                                                 |Victim                                                                                                                                                                                         |Conversation_Type|
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------

### PREPROCESSED DATASET

In [63]:
preprocessed_df = spark.read.csv("../../Data/Preprocessed_Datasets/GPT_dataset_preprocessed.csv", header=True, inferSchema=True)
preprocessed_df.show(10, truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [64]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF to convert string representation of list to actual list
def str_to_array_of_arrays(s):
    # Convert the string to a list and then wrap it inside another list
    return [ast.literal_eval(s)][0]

str_to_array_of_arrays_udf = udf(str_to_array_of_arrays, ArrayType(ArrayType(StringType())))

df = preprocessed_df.withColumn("Attacker_Helper", str_to_array_of_arrays_udf(preprocessed_df["Attacker_Helper"])).withColumn("Victim", str_to_array_of_arrays_udf(preprocessed_df["Victim"]))

df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



### FLATTEN

In [65]:
from pyspark.sql.functions import flatten

df = df.withColumn("Attacker_Helper", flatten(df["Attacker_Helper"]))
df = df.withColumn("Victim", flatten(df["Victim"]))

df.show(10, truncate=False)


+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### CONVERTING INTO TF VECTORS
#### Each flattened row in the dataframe is converted to a vector. HashingTF uses the hashing trick. A potential drawback is that multiple words might hash to the same feature index, causing collisions. 

In [66]:
from pyspark.ml.feature import HashingTF

# For Attacker_Helper column
hashingTF_ah = HashingTF(inputCol="Attacker_Helper", outputCol="AH_features", numFeatures= (2^16)) # numFeatures specifies how many features (hash buckets) 2^16 num features is provided here, increase if deemed necessary
df = hashingTF_ah.transform(df)

# For Victim column
hashingTF_v = HashingTF(inputCol="Victim", outputCol="V_features", numFeatures= (2^16))
df = hashingTF_v.transform(df)

In [67]:
df.show(10, truncate=True)

+---------------+--------------------+--------------------+-----------------+--------------------+--------------------+
|Conversation_ID|     Attacker_Helper|              Victim|Conversation_Type|         AH_features|          V_features|
+---------------+--------------------+--------------------+-----------------+--------------------+--------------------+
|     GT1sURbxgG|[hi, john, ir, ow...|[im, sorri, dont,...|                1|(19,[2,3,4,5,6,7,...|(19,[1,2,3,4,5,6,...|
|     TwaGOeC96w|[hello, xyz, bank...|[victim, name, he...|                0|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|
|     V73ZDCviQL|[hello, sir, call...|[ive, never, issu...|                1|(19,[0,1,2,3,4,5,...|(19,[0,1,3,4,5,6,...|
|     wNsWdbpeld|[hello, john, ban...|[sure, pleas, con...|                0|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|
| x0pSxAQx1K0abm|[hello, ir, call,...|[oh, idea, give, ...|                1|(19,[0,1,2,3,4,5,...|(19,[1,2,3,4,6,7,...|
| I8QHkmsq5AdwAq|[hello, xyz, bank...|[j

### TF-IDF
#### IDF (Inverse Document Frequency): IDF is a measure of how important a term is. While CountVectorizer counts how many times a term appears in a document (Term Frequency or TF), the IDF looks at how often a term appears across all documents.

In [68]:
from pyspark.ml.feature import IDF

# For Attacker_Helper features
idf_ah = IDF(inputCol="AH_features", outputCol="AH_tfidf_features")
df = idf_ah.fit(df).transform(df)

# For Victim features
idf_v = IDF(inputCol="V_features", outputCol="V_tfidf_features")
df = idf_v.fit(df).transform(df)


df.show(10, truncate=True)

+---------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|Conversation_ID|     Attacker_Helper|              Victim|Conversation_Type|         AH_features|          V_features|   AH_tfidf_features|    V_tfidf_features|
+---------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|     GT1sURbxgG|[hi, john, ir, ow...|[im, sorri, dont,...|                1|(19,[2,3,4,5,6,7,...|(19,[1,2,3,4,5,6,...|(19,[2,3,4,5,6,7,...|(19,[1,2,3,4,5,6,...|
|     TwaGOeC96w|[hello, xyz, bank...|[victim, name, he...|                0|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|
|     V73ZDCviQL|[hello, sir, call...|[ive, never, issu...|                1|(19,[0,1,2,3,4,5,...|(19,[0,1,3,4,5,6,...|(19,[0,1,2,3,4,5,...|(19,[0,1,3,4,5,6,...|
|     wNsWdbpeld|[hello, joh

#### New dataframe containing only the conversation type and the tfidf features for Attacker_Helper and Victim columns.

In [69]:
df = df.select("Conversation_ID", "Conversation_Type","AH_tfidf_features", "V_tfidf_features")

df.show(10, truncate=False)

+---------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Conversation_ID|Conversation_Type|AH_tfidf_features                                                                                                 

### ASSEMBLING THE TWO INPUT VECTORS INTO ONE

In [70]:
from pyspark.ml.feature import VectorAssembler


# 1. Feature Assembly
assembler = VectorAssembler(
    inputCols=["AH_tfidf_features", "V_tfidf_features"],
    outputCol="combined_features")

df_assembled = assembler.transform(df)

df_assembled.show(10, truncate=True)

+---------------+-----------------+--------------------+--------------------+--------------------+
|Conversation_ID|Conversation_Type|   AH_tfidf_features|    V_tfidf_features|   combined_features|
+---------------+-----------------+--------------------+--------------------+--------------------+
|     GT1sURbxgG|                1|(19,[2,3,4,5,6,7,...|(19,[1,2,3,4,5,6,...|[0.0,0.0,0.233261...|
|     TwaGOeC96w|                0|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|[0.73465896136168...|
|     V73ZDCviQL|                1|(19,[0,1,2,3,4,5,...|(19,[0,1,3,4,5,6,...|[0.55099422102126...|
|     wNsWdbpeld|                0|(19,[0,1,2,3,5,6,...|(19,[0,1,2,3,5,6,...|[0.91832370170211...|
| x0pSxAQx1K0abm|                1|(19,[0,1,2,3,4,5,...|(19,[1,2,3,4,6,7,...|[0.18366474034042...|
| I8QHkmsq5AdwAq|                0|(19,[0,1,2,3,4,5,...|(19,[0,1,3,4,6,7,...|[1.28565318238295...|
| v9vN0BYTWa0e16|                1|(19,[1,3,4,6,7,10...|(19,[0,1,2,6,7,8,...|(38,[1,3,4,6,7,10...|
| pVeYjXj2

### Splitting the Data

In [87]:
(train_data, test_data) = df_assembled.randomSplit([0.8, 0.2], seed=42)
train_data.show(truncate=False)

+---------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# LOGISTIC REGRESSION

### Train the Logistic Regression Model

In [72]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol="combined_features", labelCol="Conversation_Type", predictionCol="Prediction", maxIter=50)

evaluator = MulticlassClassificationEvaluator(labelCol='Conversation_Type', metricName="accuracy", predictionCol="Prediction")

paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [10, 100]) \
    .addGrid(lr.regParam, [0.01, 0.1, 1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()


# This is similar to cross validation but only split the set once
tvs = TrainValidationSplit(
    estimator=lr, 
    estimatorParamMaps=paramGrid, 
    evaluator=evaluator, 
    trainRatio=0.8,
    parallelism=1, 
    seed=42)


tvsModel = tvs.fit(train_data)

print(tvsModel, '\n')
print(tvsModel.explainParams(), '\n')
print('\nEvaluating with metric -> {}'.format(evaluator.getMetricName()))
print('Evaluators rating for the test set -> {}'.format(evaluator.evaluate(tvsModel.transform(test_data))))


TrainValidationSplitModel_3347ceac472f 

estimator: estimator to be cross-validated (current: LogisticRegression_66d09876f33b)
estimatorParamMaps: estimator param maps (current: [{Param(parent='LogisticRegression_66d09876f33b', name='maxIter', doc='max number of iterations (>= 0).'): 10, Param(parent='LogisticRegression_66d09876f33b', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_66d09876f33b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}, {Param(parent='LogisticRegression_66d09876f33b', name='maxIter', doc='max number of iterations (>= 0).'): 10, Param(parent='LogisticRegression_66d09876f33b', name='regParam', doc='regularization parameter (>= 0).'): 0.01, Param(parent='LogisticRegression_66d09876f33b', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an 

In [73]:
print('Train Validation Split Model validationMetrics -> {}'.format(tvsModel.validationMetrics))
print('Best Model -> {}'.format(tvsModel.bestModel))

Train Validation Split Model validationMetrics -> [0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.7272727272727273, 0.7272727272727273, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9090909090909091, 0.9545454545454546, 0.9090909090909091, 0.7272727272727273, 0.7272727272727273]
Best Model -> LogisticRegressionModel: uid=LogisticRegression_66d09876f33b, numClasses=2, numFeatures=38


### Testing

In [74]:
predictions = tvsModel.bestModel.transform(test_data)
predictions.select("Conversation_ID","Conversation_Type", "Prediction", "Probability").where(predictions.Prediction != predictions.Conversation_Type).show(10, truncate=False)

+---------------+-----------------+----------+----------------------------------------+
|Conversation_ID|Conversation_Type|Prediction|Probability                             |
+---------------+-----------------+----------+----------------------------------------+
|ALtIzDsxTHTxIe |0                |1.0       |[0.31900415970589047,0.6809958402941095]|
|GIjmhRS9Dy8xg1 |0                |1.0       |[0.31659662948792916,0.6834033705120708]|
|OHJFQTvA2eXtsV |1                |0.0       |[0.5618821110539218,0.43811788894607817]|
|qEKhQ6c7Jibf99 |0                |1.0       |[0.47984795348828563,0.5201520465117144]|
+---------------+-----------------+----------+----------------------------------------+



In [75]:
model_path = "../Models/Trained_Models/LogisticRegression"

In [76]:
# tvsModel.write().overwrite().save(model_path)

## RANDOM FOREST

### Training Random Forest Model

In [77]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder



# Define Random Forest Classifier
rf = RandomForestClassifier(labelCol="Conversation_Type", featuresCol="combined_features", predictionCol="Prediction")

evaluator = MulticlassClassificationEvaluator(labelCol='Conversation_Type', metricName="accuracy", predictionCol="Prediction")

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.maxBins, [32, 64, 128]) \
    .build()
    
    
# This is similar to cross validation but only split the set once
tvs = TrainValidationSplit(
    estimator=rf, 
    estimatorParamMaps=paramGrid, 
    evaluator=evaluator, 
    trainRatio=0.8,
    parallelism=1, 
    seed=42)

tvsModel = tvs.fit(train_data)

print(tvsModel, '\n')
print(tvsModel.explainParams(), '\n')
print('\nEvaluating with metric -> {}'.format(evaluator.getMetricName()))
print('Evaluators rating for the test set -> {}'.format(evaluator.evaluate(tvsModel.transform(test_data))))


TrainValidationSplitModel_df0c3736370c 

estimator: estimator to be cross-validated (current: RandomForestClassifier_b3499531baf4)
estimatorParamMaps: estimator param maps (current: [{Param(parent='RandomForestClassifier_b3499531baf4', name='numTrees', doc='Number of trees to train (>= 1).'): 10, Param(parent='RandomForestClassifier_b3499531baf4', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5, Param(parent='RandomForestClassifier_b3499531baf4', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 32}, {Param(parent='RandomForestClassifier_b3499531baf4', name='numTrees', doc='Number of trees to train (>= 1).'): 10, Param(parent='RandomForestClassifier_b3499531baf4', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 int

In [78]:
print('Train Validation Split Model validationMetrics -> {}'.format(tvsModel.validationMetrics))
print('Best Model -> {}'.format(tvsModel.bestModel))

Train Validation Split Model validationMetrics -> [0.8181818181818182, 0.8181818181818182, 0.8181818181818182, 0.7727272727272727, 0.7727272727272727, 0.7727272727272727, 0.7727272727272727, 0.7727272727272727, 0.7727272727272727, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8181818181818182, 0.8181818181818182, 0.8181818181818182, 0.7272727272727273, 0.7272727272727273, 0.7272727272727273, 0.7272727272727273, 0.7272727272727273, 0.7272727272727273, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636, 0.8636363636363636]
Best Model -> RandomForestClassificationModel: uid=RandomForestClassifier_b3499531baf4, numTrees=20, numClasses=2, numFeatures=38


### Testing

In [79]:
predictions = tvsModel.bestModel.transform(test_data)
predictions.select("Conversation_ID","Conversation_Type", "Prediction", "Probability").where(predictions.Prediction != predictions.Conversation_Type).show(10, truncate=False)

+---------------+-----------------+----------+----------------------------------------+
|Conversation_ID|Conversation_Type|Prediction|Probability                             |
+---------------+-----------------+----------+----------------------------------------+
|ALtIzDsxTHTxIe |0                |1.0       |[0.4429319648773936,0.5570680351226064] |
|DjQIAADidtJ3sU |1                |0.0       |[0.546780462184874,0.45321953781512603] |
|GIjmhRS9Dy8xg1 |0                |1.0       |[0.4028177731583426,0.5971822268416573] |
|OS6dP0SUxrOJVh |0                |1.0       |[0.49052114552114556,0.5094788544788544]|
|RydOJX7W9kvYQc |0                |1.0       |[0.37070335125199866,0.6292966487480014]|
|VBwnvERVuU3jv9 |0                |1.0       |[0.3358907121407121,0.6641092878592879] |
|j2Q9T9FmncM5Re |0                |1.0       |[0.46969108105900564,0.5303089189409944]|
|qEKhQ6c7Jibf99 |0                |1.0       |[0.37109431315003605,0.6289056868499641]|
+---------------+---------------

In [80]:
model_path = "../Models/Trained_Models/RandomForest"

In [81]:
# tvsModel.write().overwrite().save(model_path)

## GRADIENT BOOSTED TREES

### Training Gradient Boosted Trees Model

In [82]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create GBT classifier
gbt = GBTClassifier(labelCol="Conversation_Type", featuresCol="combined_features", predictionCol="Prediction")

# Hyperparameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [2, 4, 6]) \
    .addGrid(gbt.stepSize, [0.1, 0.3, 0.5]) \
    .addGrid(gbt.maxIter, [50, 100]) \
    .build()

# Cross validation
tvs = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(labelCol="Conversation_Type"),
    trainRatio=0.8,
    parallelism=1,
    seed=42 
)

# Train model using TrainValidationSplit
tvsModel = tvs.fit(train_data)

print(tvsModel, '\n')
print(tvsModel.explainParams(), '\n')
print('\nEvaluating with metric -> {}'.format(evaluator.getMetricName()))
print('Evaluators rating for the test set -> {}'.format(evaluator.evaluate(tvsModel.transform(test_data))))

TrainValidationSplitModel_fb91dc694da4 

estimator: estimator to be cross-validated (current: GBTClassifier_d6e29f84366b)
estimatorParamMaps: estimator param maps (current: [{Param(parent='GBTClassifier_d6e29f84366b', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2, Param(parent='GBTClassifier_d6e29f84366b', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.1, Param(parent='GBTClassifier_d6e29f84366b', name='maxIter', doc='max number of iterations (>= 0).'): 50}, {Param(parent='GBTClassifier_d6e29f84366b', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2, Param(parent='GBTClassifier_d6e29f84366b', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shr

In [83]:
print('Train Validation Split Model validationMetrics -> {}'.format(tvsModel.validationMetrics))
print('Best Model -> {}'.format(tvsModel.bestModel))

Train Validation Split Model validationMetrics -> [0.9791666666666666, 0.9791666666666666, 0.9583333333333334, 0.9791666666666666, 0.96875, 0.9583333333333334, 0.96875, 0.9479166666666666, 0.9479166666666666, 0.9270833333333334, 0.9270833333333334, 0.90625, 0.9270833333333334, 0.9270833333333334, 0.9270833333333334, 0.9479166666666666, 0.9479166666666666, 0.9479166666666666, 0.8489583333333334, 0.859375, 0.84375, 0.8489583333333333, 0.8489583333333333, 0.8489583333333333, 0.8489583333333334, 0.8489583333333334, 0.8489583333333334]
Best Model -> GBTClassificationModel: uid = GBTClassifier_d6e29f84366b, numTrees=50, numClasses=2, numFeatures=38


### Testing

In [84]:
predictions = tvsModel.bestModel.transform(test_data)
predictions.select("Conversation_ID","Conversation_Type", "Prediction", "Probability").where(predictions.Prediction != predictions.Conversation_Type).show(10, truncate=False)

+---------------+-----------------+----------+----------------------------------------+
|Conversation_ID|Conversation_Type|Prediction|Probability                             |
+---------------+-----------------+----------+----------------------------------------+
|ALtIzDsxTHTxIe |0                |1.0       |[0.19550410471619561,0.8044958952838044]|
|GIjmhRS9Dy8xg1 |0                |1.0       |[0.19306684868503732,0.8069331513149627]|
|OHJFQTvA2eXtsV |1                |0.0       |[0.8807252323571514,0.11927476764284861]|
|PfjocwPlJ3gqF5 |1                |0.0       |[0.8353781963074045,0.1646218036925955] |
+---------------+-----------------+----------+----------------------------------------+



In [85]:
model_path = "../Models/Trained_Models/GradientBoostedTrees"

In [86]:
# tvsModel.write().overwrite().save(model_path)