In [22]:
# Install required packages
!pip install pyspark nltk datasets



In [1]:
# Download NLTK data
# Each call fetches the resource if missing; the value of the last call is
# what the cell displays.
import nltk
nltk.download('punkt')      # NOTE(review): no visible cell uses punkt — confirm it is still needed
nltk.download('wordnet')    # presumably the corpus WordNetLemmatizer relies on — confirm
nltk.download('stopwords')  # NOTE(review): stop words are removed with Spark's StopWordsRemover below — confirm this is needed

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Start the Spark session for this notebook, or reuse one that already exists
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ScamDetection")
    .getOrCreate()
)

In [3]:
# Import required libraries
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lower, regexp_replace, udf
from pyspark.sql.types import ArrayType, StringType
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from nltk.stem import PorterStemmer

In [4]:
# Create lemmatizer UDF
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()  # not used by the visible cells; kept in case another cell references it

@udf(returnType=ArrayType(StringType()))
def lemmatize_text(tokens):
    """Lemmatize every non-blank token of a token array.

    Args:
        tokens: list of string tokens for one row (may be None for a null row).

    Returns:
        A list of lemmatized tokens. The result is a list, not a joined
        string, so it matches the declared ``ArrayType(StringType())``
        return type and can be consumed by ``CountVectorizer``. A null
        input yields an empty list.
    """
    if tokens is None:
        return []
    # Skip null/blank tokens; lemmatize the token exactly as received.
    return [lemmatizer.lemmatize(token) for token in tokens if token and token.strip()]

In [5]:
# Split name -> CSV file name (not referenced by the cells shown here)
splits = {
    'train': 'scam-dialogue_train.csv',
    'test': 'scam-dialogue_test.csv',
}

# Fetch the training split through the Hugging Face datasets library
dataset = load_dataset("BothBosu/scam-dialogue", split="train")

# Go through pandas to obtain a Spark DataFrame
train_pdf = dataset.to_pandas()
train_df = spark.createDataFrame(train_pdf)

# Preview the first five rows without truncating long values
train_df.show(n=5, truncate=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Normalise the training dialogues: lowercase first, then remove every
# character that is neither an ASCII letter nor whitespace
dialogue_lower = lower(train_df["dialogue"])
train_df = train_df.withColumn(
    "dialogue",
    regexp_replace(dialogue_lower, "[^a-zA-Z\\s]", ""),
)


In [7]:
# Tokenize the cleaned dialogue and drop stop words.
# Adds the "words" and "filtered_words" columns to train_df.
tokenizer = Tokenizer(inputCol="dialogue", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
preprocessing_pipeline = Pipeline(stages=[tokenizer, remover])

preprocessing_model = preprocessing_pipeline.fit(train_df)
train_df = preprocessing_model.transform(train_df)

In [8]:
# Run the lemmatizer UDF over the stop-word-filtered tokens and store the
# result in a new "lemmatized" column
train_df = train_df.withColumn(
    "lemmatized",
    lemmatize_text(train_df["filtered_words"]),
)

In [9]:
# Preview the lemmatized column for the first five rows, untruncated
lemmatized_preview = train_df.select("lemmatized")
lemmatized_preview.show(n=5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# TF-IDF feature stages: term counts over the lemmatized tokens, then
# inverse-document-frequency weighting into the "features" column
countVectorizer = CountVectorizer(
    inputCol="lemmatized",
    outputCol="raw_features",
)
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)
feature_pipeline = Pipeline(stages=[countVectorizer, idf])

In [11]:
# Fit the feature pipeline (CountVectorizer + IDF) on the training data only,
# then apply it to the same data to get the training feature vectors.
# feature_model is reused later for the test set and for single predictions.
feature_model = feature_pipeline.fit(train_df)
train_features = feature_model.transform(train_df)

In [12]:
# Random forest classifier over the TF-IDF features; the seed is fixed so
# repeated runs are comparable
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=50,
    seed=42,
)

# Train on the featurized training set
model = rf.fit(train_features)

In [13]:
# Fetch the held-out test split from the same Hugging Face dataset
test_dataset = load_dataset("BothBosu/scam-dialogue", split="test")

# Go through pandas to obtain a Spark DataFrame
test_pdf = test_dataset.to_pandas()
test_df = spark.createDataFrame(test_pdf)

In [14]:
# Mirror the training preprocessing on the test split:
# lowercase + strip non-letters, tokenize / remove stop words, lemmatize
test_df = test_df.withColumn(
    "dialogue",
    regexp_replace(lower(test_df["dialogue"]), "[^a-zA-Z\\s]", ""),
)
test_df = preprocessing_pipeline.fit(test_df).transform(test_df)
test_df = test_df.withColumn(
    "lemmatized",
    lemmatize_text(test_df["filtered_words"]),
)


In [15]:
# Create features for test data.
# Uses the feature model fitted on the training data (no refit on test data).
test_features = feature_model.transform(test_df)

In [16]:
# Make predictions on the featurized test set with the trained random forest
predictions = model.transform(test_features)

In [17]:
# Accuracy evaluator comparing the "prediction" column against "label"
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")

In [18]:
# Classification Metrics
def compute_classification_metrics(predictions):
    """Compute summary metrics for a DataFrame of model predictions.

    Args:
        predictions: DataFrame with "label" and "prediction" columns.

    Returns:
        Tuple ``(precision, recall, f1, confusion_matrix)`` where the first
        three come from the "weightedPrecision", "weightedRecall" and "f1"
        evaluator metrics, and ``confusion_matrix`` is a DataFrame of counts
        per (label, prediction) pair, ordered by label then prediction.
    """
    def _score(metric_name):
        # One evaluator per metric, all reading the same two columns.
        scorer = MulticlassClassificationEvaluator(
            labelCol="label",
            predictionCol="prediction",
            metricName=metric_name,
        )
        return scorer.evaluate(predictions)

    precision = _score("weightedPrecision")
    recall = _score("weightedRecall")
    f1 = _score("f1")

    pair_counts = predictions.groupBy("label", "prediction").count()
    confusion_matrix = pair_counts.orderBy("label", "prediction")

    return precision, recall, f1, confusion_matrix

In [19]:
# Compute every metric first, then print the report in one place
accuracy = evaluator.evaluate(predictions)
precision, recall, f1, confusion_matrix = compute_classification_metrics(predictions)

print(f"Model Accuracy: {accuracy}")

print("\nClassification Report:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

print("\nConfusion Matrix:")
confusion_matrix.show()

Model Accuracy: 1.0

Classification Report:
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Confusion Matrix:
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|  160|
|    1|       1.0|  160|
+-----+----------+-----+



In [None]:
# Combine the feature extraction and model stages into a single pipeline
# that can be fitted on train_df and persisted.
#
# The text-level steps (lowercase/regex cleaning, tokenizer, remover,
# lemmatize_text) are deliberately NOT stages here:
#   * train_df already contains the "words" and "filtered_words" columns, so
#     running tokenizer/remover on it again fails because their output
#     columns already exist;
#   * lemmatization is a Python UDF rather than a Pipeline stage, so a
#     pipeline starting at the tokenizer could never produce the "lemmatized"
#     column that countVectorizer reads.
# Input to this pipeline (for fit and for later transform) must therefore
# already carry the "lemmatized" token array, as train_df does.
final_pipeline = Pipeline(stages=[
    countVectorizer, # Count Vectorizer over the "lemmatized" column
    idf,             # TF-IDF
    rf               # Random Forest Classifier
])

# Train the full pipeline model on the preprocessed train data
final_model = final_pipeline.fit(train_df)


In [20]:
# Function to predict on new text
def predict_scam_or_not(text):
    """Classify a single dialogue transcript.

    The text goes through the same steps as the training data: lowercase,
    strip non-letter characters, tokenize / remove stop words, lemmatize,
    featurize with ``feature_model`` and score with ``model``.

    Args:
        text: the raw transcript as a string.

    Returns:
        "Scam" when the predicted label equals 1, otherwise "Non-Scam".
    """
    # One-row DataFrame holding the transcript in the "dialogue" column
    single_row = spark.createDataFrame([(text,)], ["dialogue"])

    # Text cleaning
    cleaned = single_row.withColumn(
        "dialogue",
        regexp_replace(lower(single_row["dialogue"]), "[^a-zA-Z\\s]", ""),
    )

    # Tokenization, stop-word removal and lemmatization
    tokenized = preprocessing_pipeline.fit(cleaned).transform(cleaned)
    lemmatized = tokenized.withColumn("lemmatized", lemmatize_text("filtered_words"))

    # Featurize and score
    scored = model.transform(feature_model.transform(lemmatized))
    predicted_label = scored.select("prediction").collect()[0][0]

    if predicted_label == 1:
        return "Scam"
    return "Non-Scam"

In [38]:
# Sample transcripts used to smoke-test predict_scam_or_not.
# The list holds four hand-written dialogues; the comment above each entry
# is the author's intended class, not a label read by the code.
transcripts = [
    # 1) Intended as a scam: caller pressures the receiver for credentials
    """
    caller: Hello, this is Professor Armin from the University of Calgary. We've detected suspicious activity related to your course 550 project.
    receiver: Suspicious activity? What do you mean?
    caller: It appears that unauthorized access has been detected. We need your student ID and password to secure your account immediately.
    receiver: That sounds alarming. Shouldn't the IT department handle this?
    caller: I'm authorized to assist. Time is critical to prevent data loss. Please provide your credentials now.
    """,
    # 2) Intended as a scam: caller poses as IT security and demands login details
    """
    caller: Hello, this is Dr. Thompson from the University of Calgary IT Security Division. We've detected suspicious activity.
    receiver: Suspicious activity?
    caller: Yes, we've detected multiple unauthorized login attempts. We need your student ID and password to secure your account immediately.
    receiver: This is concerning. Should I contact my supervisor about this?
    caller: There's no time for that. I need your university login credentials and student ID immediately to block these attacks and secure your research files.
    """,
    # 3) Intended as non-scam: scheduling a course demo
    """
    receiver: Hi Professor Armin, I wanted to discuss the upcoming demo for course 550.
    professor: Of course! I'm excited to see what you and your team have developed.
    receiver: Students have been working hard, and we're ready to present to you, the TAs.
    professor: Excellent. Let's schedule the presentation in Calgary next week.
    """,
    # 4) Intended as non-scam: confirming a project showcase
    """
    receiver: Hello Professor Armin, we're ready to showcase our course 550 project.
    professor: That's wonderful! I'm looking forward to your demo.
    receiver: We'll be presenting to you, the TAs  in Calgary.
    professor: Sounds great. Make sure to prepare thoroughly.
    """
]

In [39]:
# Classify every sample transcript and print the result, numbered from 1
for number, transcript in enumerate(transcripts, start=1):
    result = predict_scam_or_not(transcript)
    print(f"Transcript {number} Prediction: {result}")

Transcript 1 Prediction: Scam
Transcript 2 Prediction: Scam
Transcript 3 Prediction: Non-Scam
Transcript 4 Prediction: Non-Scam
Transcript 5 Prediction: Non-Scam


In [None]:
# Persist the fitted pipeline model, replacing any previous save at that path
pipeline_writer = final_model.write().overwrite()
pipeline_writer.save("scam_detection_pipeline_model")
print("Pipeline model saved as 'scam_detection_pipeline_model'")