<a href="https://colab.research.google.com/github/IsaacFigNewton/Smishing-Detector/blob/main/Smishing_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TODO

<ol>
  <li>Reimplement the TF-IDF scoring system; Score = (token TF-IDF scores from ham) - (token TF-IDF scores from spam).</li>
  <li>Use regexes to parse, score, then replace emails, phone numbers, and URLs with a dummy string for later NLP.</li>
  <li>Handle common symbol replacements and letter substitutions.</li>
  <li>Use stemming and lemmatization to reduce the token vector space.</li>
</ol>

# Import and config

In [2]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [3]:
import random
import numpy as np
import matplotlib as plt
import pandas as pd
import nltk as nlp
# Use BERT as the sentence encoder since it's the best open-source option
from sentence_transformers import SentenceTransformer

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import *
from sklearn.metrics.pairwise import cosine_similarity,\
                                     cosine_distances

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans,\
                            SpectralClustering

  from tqdm.autonotebook import tqdm, trange


In [4]:
class_map = {"ham": 0, "spam": 1}
color_map = {0: "red", 0: "blue"}

char_ngrams = (1, 6)
word_ngrams = (1, 5)
minSusLen = 500
lenWeight = 0.01

In [5]:
np.random.seed(10)
random_state = 0

In [None]:
# Load a pretrained Sentence Transformer model
sentence_embedder = SentenceTransformer("all-MiniLM-L6-v2")

# # create the vectorizer
# vec = TfidfVectorizer(ngram_range=word_ngrams, strip_accents="ascii") #, min_df=0.01, max_df=0.99)

#Import and clean data

## Important Functions

In [30]:
def prune_tokens(dict, minFreq, maxLen):
    tokensToRemove = []

    for key in dict.keys():
        if (dict[key] <= minFreq or maxLen <= (len(key) and " " not in key)):
            # add it to a list of tokens to prune
            tokensToRemove.append(key)

    for token in tokensToRemove:
        del dict[token]

    return dict

## Import, Clean data

In [31]:
# nltk.download()
corpus = pd.read_csv("https://raw.githubusercontent.com/IsaacFigNewton/Smishing-Detector/sklearn-approach/dataset/SMSSpamCollection.txt", sep="\t", on_bad_lines='warn')

# clean the corpus
corpus = corpus.dropna(axis=0)

# clean and prepare the dataset
corpus.columns = ["class", "text"]
classifications = corpus["class"]
corpus["class"] = classifications.map(class_map)
corpus = corpus.loc[:1000]

In [32]:
# # fit the vectorizer
# vec.fit(corpus["text"])

# # vectorize the corpus
# corpus_vectorization = vec.transform(corpus["text"])

# calculate embeddings by calling model.encode()
# corpus_vectorization = sentence_embedder.encode(corpus["text"])

# Data Exploration

# Classification

## Models

### Unsupervised

In [33]:
# takes in a pd.Series, pd.DataFrame and returns an np.array
knn_pipeline = make_pipeline(
    KNeighborsClassifier(n_neighbors=10)
)

In [34]:
# takes in a pd.Series, pd.DataFrame and returns an np.array
km_pipeline = make_pipeline(
    KMeans(n_clusters=2, random_state=random_state)
)

In [35]:
# takes in a pd.Series, pd.DataFrame and returns an np.array
spectral_pipeline = make_pipeline(
    SpectralClustering(n_clusters=2, random_state=random_state)
)

### Supervised

In [36]:
# takes in a pd.Series, pd.DataFrame and returns an np.array
nb_pipeline = make_pipeline(
    GaussianNB()
)

## Important Functions

In [37]:
def unsupervised_predictions(corpus_vects, corpus_classes):
    # print(corpus_vects.shape[0])

    # get predictions
    km_predictions = pd.Series(data = km_pipeline.fit_predict(corpus_vects))\
                          .astype(int)
    spectral_predictions = pd.Series(data = spectral_pipeline.fit_predict(corpus_vects))\
                          .astype(int)

    # combine into 1 dataframe and cast to correct type
    predictions = pd.DataFrame({
        "K-Means Clustering": km_predictions,
        "Spectral Clustering": spectral_predictions
    })

    # print(predictions.head())

    return predictions

In [49]:
def supervised_predictions(training_vects, training_classes, testing_vects, testing_classes):
    # print(training_vects.shape[0])

    # get predictions
    knn_pipeline.fit(training_vects, training_classes)
    knn_predictions = pd.Series(data = knn_pipeline.predict(testing_vects))\
                          .astype(int)
    nb_pipeline.fit(training_vects, training_classes)
    nb_predictions = pd.Series(data = knn_pipeline.predict(testing_vects))\
                          .astype(int)

    # combine into 1 dataframe and cast to correct type
    predictions = pd.DataFrame({
        "K Nearest Neighbor": knn_predictions,
        "Naive Bayes": nb_predictions
    })

    # print(predictions.head())

    return predictions

In [39]:
def get_spam_scores(training_vects, training_classes, testing_vects, testing_classes):
    bias = 0.1

    # #get a mask of the sms's longer than minSusLen
    # text_len_mask = test["text"].apply(len).ge(minSusLen)
    # #add the excess length * the weight to the total score
    # scores[text_len_mask] = scores[text_len_mask].add((test["text"]).multiply(lenWeight))

    # # vectorize the training and test data using the same vectorizer as for creating corpus_vectorization
    # training_vects = vec.transform(train["text"])
    # training_classes = train["class"]
    # testing_vects = vec.transform(test["text"])
    # testing_classes = test["class"]

    # baseline = pd.DataFrame(data=[np.zeros((len(testing_classes.shape[2]), 2))], columns=["baseline"])

    # get mean of unsupervised models' predictions
    unsupervised = unsupervised_predictions(testing_vects, testing_classes)  #.mean(axis=1)

    # get mean of unsupervised models' predictions
    supervised = supervised_predictions(training_vects, training_classes, testing_vects, testing_classes)  #.mean(axis=1)

    # get scores for all models
    scores = pd.concat([unsupervised, supervised], axis=1, ignore_index=True).dropna()
    # scores = pd.concat([scores, baseline], axis=1, ignore_index=True).dropna()
    scores = scores.add(bias)

    # rename the columns
    scores.columns = list(unsupervised.columns) + list(supervised.columns)  # + ["baseline"]

    scores[scores > 0.5] = 1
    scores[scores < 0.5] = 0
    return scores.astype(int)

## Do stuff

In [43]:
# get training, test data
train, test = train_test_split(corpus, test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train_vects = sentence_embedder.encode(train["text"])
test_vects = sentence_embedder.encode(test["text"])

In [50]:
# get unsupervised classifier predictions
predictions = get_spam_scores(train_vects, train["class"], test_vects, test["class"])
predictions.head()



Unnamed: 0,K-Means Clustering,Spectral Clustering,K Nearest Neighbor,Naive Bayes
0,1,0,0,0
1,1,0,0,0
2,0,1,1,1
3,1,0,0,0
4,0,1,1,1


In [51]:
actual = pd.DataFrame(data=corpus["class"])
actual.columns = ["actual"]
actual.head()

Unnamed: 0,actual
0,0
1,1
2,0
3,0
4,1


In [52]:
expected_and_predictions = pd.concat([actual, predictions], axis=1, ignore_index=True).dropna()
expected_and_predictions.columns = ["actual"] + list(predictions.columns)
expected_and_predictions

Unnamed: 0,actual,K-Means Clustering,Spectral Clustering,K Nearest Neighbor,Naive Bayes
0,0,1.0,0.0,0.0,0.0
1,1,1.0,0.0,0.0,0.0
2,0,0.0,1.0,1.0,1.0
3,0,1.0,0.0,0.0,0.0
4,1,0.0,1.0,1.0,1.0
...,...,...,...,...,...
196,0,0.0,1.0,1.0,1.0
197,0,0.0,1.0,0.0,0.0
198,0,1.0,0.0,0.0,0.0
199,0,1.0,0.0,0.0,0.0


# Model Comparison

In [53]:
def format(stat):
    return "\t" + str('%.3f'%stat)

In [54]:
# get scoring metrics
for model_name in sorted(list(set(expected_and_predictions.columns) - {"actual"})):
    scores = {
        "mse\t": mean_squared_error(expected_and_predictions["actual"], expected_and_predictions[model_name]),
        "mae\t":  mean_absolute_error(expected_and_predictions["actual"], expected_and_predictions[model_name]),
        "accuracy": accuracy_score(expected_and_predictions["actual"], expected_and_predictions[model_name]),
        "precision": precision_score(expected_and_predictions["actual"], expected_and_predictions[model_name]),
        "recall\t":  recall_score(expected_and_predictions["actual"], expected_and_predictions[model_name]),
        "f1\t":  f1_score(expected_and_predictions["actual"], expected_and_predictions[model_name])
    }

    print(model_name + " scores:")

    for metric, value in scores.items():
        print(metric + format(value))

    print()

K Nearest Neighbor scores:
mse		0.289
mae		0.289
accuracy	0.711
precision	0.143
recall		0.152
f1		0.147

K-Means Clustering scores:
mse		0.567
mae		0.567
accuracy	0.433
precision	0.165
recall		0.606
f1		0.260

Naive Bayes scores:
mse		0.289
mae		0.289
accuracy	0.711
precision	0.143
recall		0.152
f1		0.147

Spectral Clustering scores:
mse		0.413
mae		0.413
accuracy	0.587
precision	0.153
recall		0.333
f1		0.210



In [55]:
# # print(corpus.head())
# sms = "free money click here"
# print(sms)
#
# prediction = get_spam_score(pd.Series(data=[sms]), corpus)[0]
# if 0 < prediction:
#     print("The sms is spam")
# else:
#     print("The sms is ham")