In [43]:
# NOTE: the relative order of the wildcard imports is kept as-is on purpose;
# reordering them could change which module wins when names collide.
from PreprocessDataset import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.classification import *
from pyspark.ml import PipelineModel
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from scipy.sparse import hstack
# `np` is used by later cells; import it explicitly instead of relying on it
# leaking in through one of the wildcard imports above.
import numpy as np
import pandas
import socket
import logging
import json
import sys

# Make the project root importable (needed for the `src.*` imports used later).
# A raw string is used because "\D" is not a valid escape sequence, and the
# membership guard keeps the cell idempotent when it is re-run.
if r"G:\Dissertation_Project" not in sys.path:
    sys.path.append(r"G:\Dissertation_Project")

In [44]:
def load_prediction_model(model_id):
    models = {
        "LogisticRegression_TFIDF": "G:\\Dissertation_Project\\src\\Models\\Trained_Models\\LogisticRegression\\bestModel",
        "RandomForest_TFIDF": "G:\\Dissertation_Project\\src\\Models\\Trained_Models\\RandomForest\\bestModel",
        "GradientBoosted_TFIDF": "G:\\Dissertation_Project\\src\\Models\\Trained_Models\\GradientBoostedTrees\\bestModel",
        "SupportVectorMachine_TFIDF": "G:\\Dissertation_Project\\src\\Models\\Trained_Models\\SupportVectorMachine\\bestModel",
        "NeuralNetwork_TFIDF": "G:\\Dissertation_Project\\src\Models\\Trained_Models\\NeuralNetwork_TFIDF\\NeuralNetwork_TFIDF.keras",
        "LSTM_NeuralNetwork_TFIDF": "G:\\Dissertation_Project\\src\\Models\\Trained_Models\\LSTM_NeuralNetwork_TFIDF\\LSTM_NeuralNetwork_TFIDF.keras",
        "NeuralNetwork_EMBEDDING": "G:\\Dissertation_Project\\src\Models\\Trained_Models\\NeuralNetwork_EMBEDDING\\NN_TEST_TFIDF.keras"
    }

    print("<--LOADING PREDICTION MODEL : {} , From location : {}-->\n".format(
        model_id, models[model_id]))

    if not isinstance(model_id, str):
        raise TypeError(model_id + " must be of type str.")

    if not model_id in models.keys():
        raise ValueError("model_id " + model_id + " does not exist.")

    try:
        match (model_id):
            case "LogisticRegression_TFIDF":
                model = LogisticRegressionModel.load(models[model_id])
                return model

            case "RandomForest_TFIDF":
                model = RandomForestClassificationModel.load(models[model_id])
                return model

            case "GradientBoosted_TFIDF":
                model = GBTClassificationModel.load(models[model_id])
                return model

            case "SupportVectorMachine_TFIDF":
                model = LinearSVCModel.load(models[model_id])
                return model

            case "NeuralNetwork_TFIDF":
                from src.CustomNNMetrics import F1Score
                model = load_model(models[model_id], custom_objects={
                                   'F1Score': F1Score})
                return model

            case "LSTM_NeuralNetwork_TFIDF":
                from src.CustomNNMetrics import F1Score
                model = load_model(models[model_id], custom_objects={
                                   'F1Score': F1Score})
                return model

            case "NeuralNetwork_EMBEDDING":
                from src.CustomNNMetrics import F1Score
                model = load_model(models[model_id], custom_objects={
                                   'F1Score': F1Score})
                return model

            case _:
                model = LogisticRegressionModel.load(
                    models["LogisticRegression_TFIDF"])
                return model

    except FileNotFoundError as e:
        print(e)
        raise

In [45]:


# Load the Keras TF-IDF network used for inference in the cells below.
prediction_model = load_prediction_model("NeuralNetwork_TFIDF")

# --- Preprocessing objects for the neural network ---------------------------
# Width of each hashed vector.
num_features = 200

# Hashing trick: one vectorizer for the attacker/helper (loopback) text and
# one for the victim (microphone) text.
hashing_vectorizer_ah = HashingVectorizer(alternate_sign=False,
                                          n_features=num_features)
hashing_vectorizer_v = HashingVectorizer(alternate_sign=False,
                                         n_features=num_features)

# TF-IDF transformer: IDF weighting enabled, smoothed, no row normalisation.
tfidf_transformer = TfidfTransformer(smooth_idf=True, norm=None, use_idf=True)



<--LOADING PREDICTION MODEL : NeuralNetwork_TFIDF , From location : G:\Dissertation_Project\src\Models\Trained_Models\NeuralNetwork_TFIDF\NeuralNetwork_TFIDF.keras-->





In [46]:
import pandas as pd

# Read the preprocessed conversations with pandas.
df = pd.read_csv('../Data/Preprocessed_Datasets/DATASET_FINAL_PREPROCESSED.csv')

# Per-column value arrays: victim text, attacker/helper text and the label.
microphone_values = df['Victim'].values
loopback_values = df['Attacker_Helper'].values
labels = df['Conversation_Type'].values

# Work with the first conversation only.
microphone_data, loopback_data, label = (
    microphone_values[0],
    loopback_values[0],
    labels[0],
)

# One value per line: victim text, attacker/helper text, label.
print(microphone_data, loopback_data, label, sep="\n")

[['im', 'sorri', 'i', 'dont', 'believ', 'you', 'can', 'you', 'provid', 'me', 'with', 'your', 'badg', 'number']]
[['hi', 'thi', 'is', 'john', 'from', 'the', 'ir', 'you', 'owe', 'us', 'xxxxx', 'in', 'back', 'tax']]
1


In [47]:

# Raw inputs of this conversation, grouped by speaker.
data_for_logs = {"attacker_helper": loopback_data, "victim": microphone_data}

# Concatenate texts.
# NOTE(review): the printed values below still show the list brackets, so the
# inputs appear to be single strings already and ''.join(...) returns them
# unchanged -- confirm whether parsing into tokens was intended.
concatenated_microphone_data = "".join(microphone_data)
concatenated_loopback_data = "".join(loopback_data)

print("Concatenated_loop_data: {}\n".format(concatenated_loopback_data))
print("Concatenated_mic_data: {}\n".format(concatenated_microphone_data))

# Hashing trick: each text becomes a one-row sparse matrix.
V_hashed_features = hashing_vectorizer_v.fit_transform([concatenated_microphone_data])
AH_hashed_features = hashing_vectorizer_ah.fit_transform([concatenated_loopback_data])

print("Hashed_loop_data: {}\n".format(AH_hashed_features))
print("Hashed_mic_data: {}\n".format(V_hashed_features))

# Apply IDF scaling.
# NOTE(review): the same transformer is re-fitted on each one-row matrix, so
# no IDF statistics from training are used here -- verify against how the
# model's training features were produced.
AH_tfidf_scaled = tfidf_transformer.fit_transform(AH_hashed_features)
V_tfidf_scaled = tfidf_transformer.fit_transform(V_hashed_features)

print("TFIDF_loop_data: {}\n".format(AH_tfidf_scaled))
print("TFIDF_mic_data: {}\n".format(V_tfidf_scaled))

# Combine features: attacker/helper block first, victim block second.
combined_features = hstack((AH_tfidf_scaled, V_tfidf_scaled))

# Dense 1-D copy of the combined row.
combined_features_dense = combined_features.toarray().flatten()

print("Combined Features: {}\n".format(combined_features))
print("Combined Features Dense Flattened: {}\n".format(combined_features_dense))

# Back to a single-row 2-D array, i.e. (1, 2 * num_features), for predict().
combined_features_reshaped = combined_features_dense.reshape(1, -1)

print("Combined Features Reshaped FINAL: {}\n".format(combined_features_reshaped))
print("Combined Feature Reshaped FINAL Shape: {}\n".format(combined_features_reshaped.shape))

predictions = prediction_model.predict(combined_features_reshaped)
print("Predictions", predictions)

# NOTE(review): this treats the model output as P(normal). The sample row is a
# scam with label 1, so the output may actually be P(scam) -- confirm the
# label encoding used in training before trusting this message.
if (1 - predictions[0][0]) > 0.5:
    print("Scam detected with probability: {}".format(
        (1 - predictions[0][0])))
else:
    print("Normal conversation detected with probability: {}".format(
        predictions[0][0]))

Concatenated_loop_data: [['hi', 'thi', 'is', 'john', 'from', 'the', 'ir', 'you', 'owe', 'us', 'xxxxx', 'in', 'back', 'tax']]

Concatenated_mic_data: [['im', 'sorri', 'i', 'dont', 'believ', 'you', 'can', 'you', 'provid', 'me', 'with', 'your', 'badg', 'number']]

Hashed_loop_data:   (0, 6)	0.2672612419124244
  (0, 42)	0.2672612419124244
  (0, 52)	0.2672612419124244
  (0, 72)	0.2672612419124244
  (0, 77)	0.2672612419124244
  (0, 89)	0.2672612419124244
  (0, 90)	0.2672612419124244
  (0, 96)	0.2672612419124244
  (0, 115)	0.2672612419124244
  (0, 122)	0.2672612419124244
  (0, 145)	0.2672612419124244
  (0, 158)	0.2672612419124244
  (0, 179)	0.2672612419124244
  (0, 188)	0.2672612419124244

Hashed_mic_data:   (0, 0)	0.2581988897471611
  (0, 19)	0.2581988897471611
  (0, 40)	0.2581988897471611
  (0, 55)	0.2581988897471611
  (0, 84)	0.2581988897471611
  (0, 136)	0.2581988897471611
  (0, 140)	0.2581988897471611
  (0, 156)	0.2581988897471611
  (0, 160)	0.2581988897471611
  (0, 180)	0.25819888974716

Predictions [[0.18852358]]
Scam detected with probability: 0.8114764243364334


## HERE: cross-check the prediction using the saved Spark TF-IDF pipeline

In [48]:
import findspark
# Locate the local Spark installation and make it importable.
# NOTE(review): pyspark is already imported in the first cell, so this call
# may be redundant or may need to run earlier -- confirm.
findspark.init()
# Echo the Spark home that findspark resolved (shown as the cell output).
findspark.find()

'H:\\SPARK'

In [49]:
from pyspark.sql import SparkSession

# Initialize SparkSession with necessary configurations:
# local mode on all cores, 15 GB of driver memory, and the Hadoop home/conf
# directories of this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Spark')
    .config("spark.driver.memory", "15g")
    .config("spark.hadoop.home.dir", "H:/HADOOP/")
    .config("spark.hadoop.conf.dir", "H:/HADOOP/etc/hadoop/")
    .getOrCreate()
)

import sys

# Make the project root importable for the `src.*` imports used below.
# Raw string: "\D" is not a valid escape sequence. The membership guard avoids
# stacking duplicate entries when the path was already added or the cell is
# re-run.
if r"G:\Dissertation_Project" not in sys.path:
    sys.path.append(r"G:\Dissertation_Project")

# Get SparkContext from the SparkSession
sc = spark.sparkContext


In [50]:
# Echo the SparkSession so the notebook renders it and confirms it is live.
spark

### Load the preprocessed dataset

In [51]:
# Read the preprocessed CSV into a Spark DataFrame; the first row supplies the
# column names and column types are inferred from the data.
preprocessed_df = spark.read.csv("../Data/Preprocessed_Datasets/DATASET_FINAL_PREPROCESSED.csv", header=True, inferSchema=True)
# Preview the first 10 rows without truncating cell contents.
preprocessed_df.show(10, truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Convert Conversation Columns into actual Arrays

In [52]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf
import ast

# UDF helper: convert the string representation of a nested list into a real list
def str_to_array_of_arrays(s):
    """Parse a stringified list of token lists into an actual Python list.

    Args:
        s: String holding a Python list literal such as "[['hi', 'thi']]",
            or None / a blank string for a missing value.

    Returns:
        The value produced by ``ast.literal_eval(s)``, or None when ``s`` is
        None or blank.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``s`` is not a valid Python literal.
    """
    # The source columns are nullable; a missing cell must stay null instead
    # of crashing literal_eval.
    if s is None or (isinstance(s, str) and not s.strip()):
        return None
    # The parsed literal is returned as-is (no extra wrapping): the data is
    # already a list of lists, matching the UDF's declared return type.
    return ast.literal_eval(s)

# Wrap the parser as a Spark UDF returning array<array<string>>.
str_to_array_of_arrays_udf = udf(
    str_to_array_of_arrays,
    ArrayType(ArrayType(StringType())),
)

# Replace both conversation columns with their parsed, nested-array form.
df = (
    preprocessed_df
    .withColumn(
        "Attacker_Helper",
        str_to_array_of_arrays_udf(preprocessed_df["Attacker_Helper"]),
    )
    .withColumn(
        "Victim",
        str_to_array_of_arrays_udf(preprocessed_df["Victim"]),
    )
)

# Confirm the new column types.
df.printSchema()

root
 |-- Conversation_ID: string (nullable = true)
 |-- Attacker_Helper: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Victim: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- Conversation_Type: integer (nullable = true)



### Loading the pipeline and transforming the data

In [53]:
from pyspark.ml import PipelineModel
from src.CustonTransformers import FlattenTransformer

# Location of the saved TF-IDF pipeline, relative to the notebook's working directory.
pipeline_model_path = "./Models/Pipelines/TF-IDF_Pipeline"

# Load the fitted pipeline from disk.
# NOTE(review): FlattenTransformer is imported above but not referenced here;
# presumably the import is needed so the saved pipeline's custom stage can be
# resolved on load -- confirm before removing it.
pipeline = PipelineModel.load(path=pipeline_model_path)

# Run every pipeline stage on the parsed conversations; the result carries the
# 'Combined_Features' column selected in the next cell.
df_assembled = pipeline.transform(df)

In [62]:
# Pick a single conversation by id and keep only its id and feature vector.
test_df = (
    df_assembled
    .filter(df_assembled['Conversation_ID'] == 'GT1sURbxgG_2')
    .select('Conversation_ID', 'Combined_Features')
    .limit(1)
)
test_df.show(truncate=False)

+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [63]:
# Bring the selected feature vectors to the driver, one entry per row.
features = [
    row.Combined_Features
    for row in test_df.select('Combined_Features').collect()
]

# Stack them into a NumPy array so the Keras model can consume them.
features_numpy_test = np.array(features)

print("Shape of -features_numpy_test- array --> {}".format(features_numpy_test.shape))
print(features_numpy_test)

Shape of -features_numpy_test- array --> (1, 400)
[[0.         0.         0.         0.         0.         0.760271
  0.         1.05954162 0.         0.03978928 0.         0.
  0.         0.         0.         0.         0.         0.20213295
  0.         0.         0.89601837 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.20091257 2.53313971 0.         0.         0.         0.75078102
  0.         0.63391668 0.         0.         0.         0.
  0.         0.         0.         0.04327461 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.48859464 0.         0.         0.08306086 0.         0.
  0.         0.         0.         0.86377918 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         1.24928616 0.
  0.         0.         0.61570124 0.         0.16404436 0.
  0.         0.         0.  

In [64]:
# Score the Spark-pipeline features with the same model loaded earlier.
# Note: this rebinds `predictions` from the earlier sklearn-based cell.
predictions = prediction_model.predict(features_numpy_test)
print(predictions)

[[0.9926143]]
