In [1]:
import findspark
# Let findspark locate the local Spark installation so that `pyspark` can be
# imported by the cells below.
findspark.init()
# Bare last expression: the notebook displays the Spark home that was found.
findspark.find()

'H:\\SPARK'

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a local SparkSession that runs on every available core.
# NOTE(review): `spark.hadoop.*` keys are forwarded to the Hadoop configuration
# with the prefix stripped; confirm that `home.dir` / `conf.dir` are actually
# honoured here -- Hadoop's home is normally supplied via the HADOOP_HOME
# environment variable instead.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark')
    .config("spark.driver.memory", "15g")
    .config("spark.hadoop.home.dir", "H:/HADOOP/")
    .config("spark.hadoop.conf.dir", "H:/HADOOP/etc/hadoop/")
    .getOrCreate()
)

import sys

# Make the project root importable (needed for the `src.*` import further down).
# Raw string: "\D" is not a valid escape sequence, so the plain literal emits a
# warning on recent Python versions; the resulting path value is unchanged.
sys.path.append(r"G:\Dissertation_Project")

# Get SparkContext from the SparkSession
sc = spark.sparkContext


In [3]:
# Bare expression: the notebook displays the SparkSession object created above.
spark

### BASE DATASET

In [4]:
# Read the base conversation CSV: the first row is the header and column
# types are inferred from the data.
base_df = spark.read.csv(
    "../../Data/Custom_Datasets/conversation_datasets_GPT.csv",
    header=True,
    inferSchema=True,
)
# Preview the first 10 rows without truncating long cell values.
base_df.show(n=10, truncate=False)

+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|Conversation_ID|Attacker_Helper                                                                                                                                                 |Victim                                                                                                                                                                                         |Conversation_Type|
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------

### PREPROCESSED DATASET

In [6]:
# Read the preprocessed dataset CSV: the first row is the header and column
# types are inferred from the data.
preprocessed_df = spark.read.csv(
    "../../Data/Preprocessed_Datasets/DATASET_FINAL_PREPROCESSED.csv",
    header=True,
    inferSchema=True,
)
# Preview the first 10 rows without truncating long cell values.
preprocessed_df.show(n=10, truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [7]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF helper: convert the string representation of a (nested) list to an actual list
def str_to_array_of_arrays(s):
    """Parse a Python-literal string such as "[['a', 'b'], ['c']]" into a list.

    Args:
        s: String containing a Python literal, or None for a null cell.

    Returns:
        The object produced by ``ast.literal_eval(s)``, or None when ``s`` is
        None, so that null cells stay null and do not fail the Spark task.

    Raises:
        ValueError, SyntaxError: if ``s`` is not a valid Python literal.
    """
    if s is None:
        return None
    # The parsed literal is returned directly; wrapping it in a one-element
    # list and indexing [0] again (as before) was a no-op.
    return ast.literal_eval(s)

# Register the parser as a Spark UDF whose result type is array<array<string>>.
str_to_array_of_arrays_udf = udf(
    str_to_array_of_arrays,
    returnType=ArrayType(ArrayType(StringType())),
)

# Parse both conversation columns, replacing the string columns in place.
attacker_helper_parsed = str_to_array_of_arrays_udf(preprocessed_df["Attacker_Helper"])
victim_parsed = str_to_array_of_arrays_udf(preprocessed_df["Victim"])
df = (
    preprocessed_df
    .withColumn("Attacker_Helper", attacker_helper_parsed)
    .withColumn("Victim", victim_parsed)
)

# Confirm the two columns are now nested arrays of strings.
df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



### Loading the pipeline and transforming the data

In [8]:
from pyspark.ml import PipelineModel
from src.CustonTransformers import FlattenTransformer

# Directory holding the previously saved feature pipeline.
pipeline_model_path = "./Pipelines/TF-IDF_Pipeline"

# Load the persisted PipelineModel and run the parsed conversations through it.
# NOTE(review): FlattenTransformer is imported but never referenced directly --
# presumably it must be importable for PipelineModel.load to rebuild a custom
# stage; confirm before removing that import.
pipeline = PipelineModel.load(pipeline_model_path)
df_assembled = pipeline.transform(df)

### Splitting the Data

In [9]:
# 80/20 train/test split with a fixed seed.
train_data, test_data = df_assembled.randomSplit(weights=[0.8, 0.2], seed=42)
# Preview the training rows without truncating cell values.
train_data.show(truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# LOGISTIC REGRESSION

### Train the Logistic Regression Model

In [17]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Estimator: logistic regression over the assembled feature vector.
lr = LogisticRegression(featuresCol="combined_features", labelCol="Conversation_Type", predictionCol="Prediction")

# Model selection is driven by weighted recall.
evaluator = MulticlassClassificationEvaluator(labelCol='Conversation_Type', metricName="weightedRecall", predictionCol="Prediction")


# 3 x 3 x 3 = 27 hyper-parameter combinations.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [40, 50, 65]) \
    .addGrid(lr.regParam, [0.001, 0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.4, 0.6]) \
    .build()


# Cross Validate with weightedRecall as a metric
# `seed` pins the fold assignment so that the selected hyper-parameters are
# reproducible across kernel restarts (the train/test split is seeded as well).
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=7, # high number of num folds because the dataset is small
                          seed=42)


# Fits every grid point on every fold, then refits the best setting on all of train_data.
cvModel = crossval.fit(train_data)

print(cvModel, '\n')
print(cvModel.explainParams(), '\n')


CrossValidatorModel_c4f771861b6b 

estimator: estimator to be cross-validated (current: LogisticRegression_adc2e65c15a8)
estimatorParamMaps: estimator param maps (current: [{Param(parent='LogisticRegression_adc2e65c15a8', name='maxIter', doc='max number of iterations (>= 0).'): 40, Param(parent='LogisticRegression_adc2e65c15a8', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_adc2e65c15a8', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}, {Param(parent='LogisticRegression_adc2e65c15a8', name='maxIter', doc='max number of iterations (>= 0).'): 40, Param(parent='LogisticRegression_adc2e65c15a8', name='regParam', doc='regularization parameter (>= 0).'): 0.001, Param(parent='LogisticRegression_adc2e65c15a8', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 p

In [18]:
# List every parameter of the winning model, one "name: value" pair per line.
bestModelParams = cvModel.bestModel.extractParamMap()
for param, value in bestModelParams.items():
    print(param.name, value, sep=": ")

aggregationDepth: 2
elasticNetParam: 0.0
family: auto
featuresCol: combined_features
fitIntercept: True
labelCol: Conversation_Type
maxBlockSizeInMB: 0.0
maxIter: 40
predictionCol: Prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
regParam: 0.01
standardization: True
threshold: 0.5
tol: 1e-06


### Evaluating the Model's Performance on the Test Set Across Various Metrics

In [19]:
# The model is trained with weightedRecall in mind
metrics = ['accuracy', 'f1', 'weightedPrecision', 'weightedRecall']

# Score the test set once and cache it: Spark DataFrames are lazy, so without
# the cache every evaluate() call below would re-run the whole transformation.
test_predictions = cvModel.transform(test_data).cache()

for metric in metrics:
    # Separate name so the `evaluator` used by the CrossValidator is not rebound.
    metric_evaluator = MulticlassClassificationEvaluator(labelCol='Conversation_Type', metricName=metric, predictionCol="Prediction")
    score = metric_evaluator.evaluate(test_predictions)
    print(f"Metric: {metric}---\t\t----Score: {score}")

Metric: accuracy---		----Score: 0.881203007518797
Metric: f1---		----Score: 0.8812854558819472
Metric: weightedPrecision---		----Score: 0.8813723740609921
Metric: weightedRecall---		----Score: 0.8812030075187971


In [20]:
# One-line summary of the model selected by cross-validation.
print(f'Best Model -> {cvModel.bestModel}')

Best Model -> LogisticRegressionModel: uid=LogisticRegression_adc2e65c15a8, numClasses=2, numFeatures=400


### Testing

In [21]:
# Score the held-out rows with the best model found by cross-validation.
predictions = cvModel.bestModel.transform(test_data)

# Show up to 10 rows where the predicted class differs from the true label.
is_misclassified = predictions.Prediction != predictions.Conversation_Type
(
    predictions
    .select("Conversation_ID", "Conversation_Type", "Prediction", "Probability")
    .where(is_misclassified)
    .show(10, truncate=False)
)

+----------------+-----------------+----------+----------------------------------------+
|Conversation_ID |Conversation_Type|Prediction|Probability                             |
+----------------+-----------------+----------+----------------------------------------+
|15QrCAymRGylbW_4|1                |0.0       |[0.7489297230147235,0.2510702769852765] |
|1SRiSLtLPn7sZO_7|1                |0.0       |[0.7633620929614428,0.23663790703855725]|
|3FlC6MYMCNTthW_2|0                |1.0       |[0.36586503321231245,0.6341349667876875]|
|4AkbCOAdB7rqET_4|1                |0.0       |[0.7755064925014655,0.22449350749853447]|
|4JkuHn88WcQ1hG_3|1                |0.0       |[0.5766074759927813,0.4233925240072187] |
|54Dn6rG5s2WKFO_0|1                |0.0       |[0.5650978936349929,0.43490210636500715]|
|595hyYVqXn5muC_0|1                |0.0       |[0.7181657379551741,0.28183426204482587]|
|6FTLDxwJTGfIGQ_3|1                |0.0       |[0.5385303073139643,0.46146969268603566]|
|7oGx9hSdOxM4eR_4|1  

In [22]:
# Destination directory for the trained model saved in the next cell.
model_path = "../Models/Trained_Models/LogisticRegression"
# NOTE(review): looks like a leftover debugging check -- consider removing.
print(type(model_path))

<class 'str'>


In [23]:
# Persist the whole CrossValidatorModel (not only bestModel) to model_path,
# replacing anything already saved there.
cvModel.write().overwrite().save(model_path)