In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText
import gensim.downloader as api

In [None]:

# Load Dataset
data = pd.read_csv("dataset.csv")
data['Message'] = data['Message'].astype(str)

# Split the dataset into training and testing sets
X = data['Message']
y = data['EncodedClass']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Tokenize and pad sequences for deep learning models
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')
vocab_size = len(tokenizer.word_index) + 1

# 1. Feature Extraction
# Bag of Words
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Word2Vec Embeddings
def tokenize(text):
    return text.split()

sentences = [tokenize(msg) for msg in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def get_word2vec_embeddings(texts, model):
    embeddings = []
    for text in texts:
        tokens = tokenize(text)
        vector = np.mean([model.wv[word] for word in tokens if word in model.wv] or [np.zeros(100)], axis=0)
        embeddings.append(vector)
    return np.array(embeddings)

X_train_w2v = get_word2vec_embeddings(X_train, word2vec_model)
X_test_w2v = get_word2vec_embeddings(X_test, word2vec_model)

# fastText Embeddings
fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
X_train_fasttext = get_word2vec_embeddings(X_train, fasttext_model)
X_test_fasttext = get_word2vec_embeddings(X_test, fasttext_model)

# GloVe Embeddings
glove_vectors = api.load("glove-wiki-gigaword-100")
def get_glove_embeddings(texts, glove_model):
    embeddings = []
    for text in texts:
        tokens = tokenize(text)
        vector = np.mean([glove_model[word] for word in tokens if word in glove_model] or [np.zeros(100)], axis=0)
        embeddings.append(vector)
    return np.array(embeddings)

X_train_glove = get_glove_embeddings(X_train, glove_vectors)
X_test_glove = get_glove_embeddings(X_test, glove_vectors)

# BERT and RoBERTa (using Hugging Face Transformers)
from transformers import AutoTokenizer, AutoModel
import torch

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

def get_bert_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze().numpy())
    return np.array(embeddings)

X_train_bert = get_bert_embeddings(X_train, bert_tokenizer, bert_model)
X_test_bert = get_bert_embeddings(X_test, bert_tokenizer, bert_model)

# 2. Model Training and Evaluation
# Helper function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

# SVM with TF-IDF
print("SVM with TF-IDF")
svm_model = SVC()
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

# Random Forest with Bag of Words
print("Random Forest with Bag of Words")
rf_model = RandomForestClassifier()
train_and_evaluate(rf_model, X_train_bow, X_test_bow, y_train, y_test)

# LSTM Model
print("LSTM Model")
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=100),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)
loss, accuracy = lstm_model.evaluate(X_test_pad, y_test)
print(f"LSTM Test Accuracy: {accuracy}")

# CNN Model
print("CNN Model")
cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)
loss, accuracy = cnn_model.evaluate(X_test_pad, y_test)
print(f"CNN Test Accuracy: {accuracy}")


  from .autonotebook import tqdm as notebook_tqdm


SVM with TF-IDF
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Random Forest with Bag of Words
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

LSTM Model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Test Accuracy: 0.8663676977157593
CNN Model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN Test Accuracy: 0.9865471124649048
