In [12]:
# Load the dataset (https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp)

import pandas as pd

df = pd.read_csv('Phishing_Email.csv')

df.head()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,1,the other side of * galicismos * * galicismo *...,0
2,2,re : equistar deal tickets are you still avail...,0
3,3,\r\nHello I am your hot lil horny toy.\r\n ...,1
4,4,software at incredibly low prices ( 86 % lower...,1


In [13]:
df.tail()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
18629,18646,date a lonely housewife always wanted to date ...,1
18630,18647,request submitted : access request for anita ....,0
18631,18648,"re : important - prc mtg hi dorn & john , as y...",0
18632,18649,press clippings - letter on californian utilit...,0
18633,18650,empty,1


In [14]:
df['CATEGORY'].value_counts()

CATEGORY
0    11322
1     7312
Name: count, dtype: int64

In [15]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [17]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [18]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [19]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [20]:
def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [21]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,         # 20% of data will go to the test set
    stratify=df['CATEGORY'], # Ensures proportional class distribution
    random_state=1         # Ensures reproducibility
)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

train_df, test_df

Train shape: (14907, 3)
Test shape: (3727, 3)


(       SERIAL                                            MESSAGE  CATEGORY
 0         128  blue horseshoe meet me dear reader : we someti...         1
 1        4205  the national association for honesty in medici...         1
 2        7943  web pages load 300 % faster without cable or d...         1
 3       15029  reimbursement of individually billed items the...         0
 4         609  lp deal bill - please come see me this morning...         0
 ...       ...                                                ...       ...
 14902    1175  200 summary on reduplication a month before ch...         0
 14903   11581  \r\nYannick Gingras wrote:>    I am wondering ...         0
 14904   13132  URL: http://www.newsisfree.com/click/215,9,215...         0
 14905    3430  global operations controller forum i believe n...         0
 14906   11247  electronic pay stubs get ready . beginning in ...         0
 
 [14907 rows x 3 columns],
       SERIAL                                            ME

In [22]:
import numpy as np

token_counter = {}

# Iterate over each message in the 'MESSAGE' column
for message in train_df['MESSAGE']:
    if isinstance(message, str):  # Check if message is a string
        message_as_token_lst = message_to_token_list(message)  # Convert message to a list of tokens

        # Count occurrences of each token
        for token in message_as_token_lst:
            if token in token_counter:
                token_counter[token] += 1
            else:
                token_counter[token] = 1

len(token_counter)


144709

In [23]:
token_counter

{'blue': 228,
 'horseshoe': 3,
 'meet': 897,
 'dear': 1137,
 'reader': 878,
 'sometimes': 434,
 'approach': 2218,
 'analyst': 1128,
 'thought': 1316,
 'emerging': 216,
 'market': 3782,
 'sector': 275,
 'interested': 2324,
 'certain': 1138,
 'occasion': 159,
 'come': 3012,
 'u': 13191,
 'intriguing': 49,
 'insight': 264,
 'aspect': 1778,
 'caught': 105,
 'attention': 853,
 'know': 6021,
 'track': 776,
 'record': 1079,
 'speaks': 80,
 'happy': 751,
 'bring': 907,
 'another': 2189,
 'situation': 893,
 'huge': 608,
 'upside': 52,
 'potential': 1175,
 'think': 3385,
 'could': 5083,
 'one': 14148,
 'look': 2802,
 'back': 3019,
 'shortly': 175,
 'everyone': 1038,
 'saying': 530,
 'info': 1714,
 'click': 3821,
 'remember': 1191,
 'nothing': 1158,
 'ventured': 7,
 'gained': 144,
 'national': 1589,
 'association': 1114,
 'honesty': 56,
 'medicine': 191,
 'stemcaoiwz': 1,
 'body': 833,
 'many': 4857,
 'cry': 65,
 'water': 445,
 'dr': 1865,
 'f': 2039,
 'batmanghelidj': 2,
 'title': 2118,
 'multi'

In [24]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    return token_counter[proccessed_token] > threshold

keep_token('random', 100)

True

In [25]:
features = set()

for token in token_counter:
  if keep_token(token, 1000):
    features.add(token)

features

{'0',
 '00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '07',
 '08',
 '09',
 '1',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '35',
 '3d',
 '4',
 '40',
 '44',
 '45',
 '49',
 '5',
 '50',
 '500',
 '6',
 '60',
 '7',
 '8',
 '9',
 '90',
 '95',
 '97',
 '98',
 '99',
 '_',
 'able',
 'abstract',
 'ac',
 'accepted',
 'access',
 'account',
 'acquisition',
 'act',
 'action',
 'actually',
 'ad',
 'add',
 'additional',
 'address',
 'ago',
 'agreement',
 'already',
 'also',
 'although',
 'always',
 'america',
 'american',
 'among',
 'amount',
 'analysis',
 'analyst',
 'announcement',
 'another',
 'answer',
 'anyone',
 'anything',
 'application',
 'applied',
 'approach',
 'april',
 'area',
 'argument',
 'around',
 'article',
 'ask',
 'asked',
 'aspect',
 'asset',
 'association',
 'attached',
 

In [26]:
features = list(features)
features

['corpus',
 'publication',
 'germany',
 'pp',
 'must',
 'project',
 'en',
 'point',
 'feel',
 'main',
 'receive',
 'think',
 'asset',
 'wa',
 '40',
 '1998',
 'even',
 'workshop',
 '28',
 'dollar',
 'type',
 'exchange',
 'minute',
 'found',
 'put',
 '16',
 'information',
 'line',
 'case',
 '10',
 'search',
 '23',
 'already',
 'believe',
 'others',
 'letter',
 'friday',
 'else',
 'group',
 'request',
 'december',
 'marketing',
 'less',
 'tutorial',
 'easy',
 'lot',
 'value',
 'large',
 'office',
 'part',
 'argument',
 '26',
 'japan',
 'proposal',
 'fund',
 '01',
 'income',
 'sound',
 'news',
 'yet',
 'index',
 'general',
 'return',
 'thursday',
 'l',
 '05',
 'usa',
 'remember',
 'net',
 'review',
 'based',
 '19',
 'grant',
 'work',
 'today',
 'term',
 'woman',
 'single',
 '100',
 '27',
 'free',
 '08',
 'certain',
 'effect',
 'user',
 'david',
 'basis',
 'idea',
 'need',
 'clear',
 'operation',
 'february',
 'several',
 'c',
 'company',
 'material',
 'level',
 'german',
 'f',
 'made',
 'p

In [27]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'corpus': 0,
 'publication': 1,
 'germany': 2,
 'pp': 3,
 'must': 4,
 'project': 5,
 'en': 6,
 'point': 7,
 'feel': 8,
 'main': 9,
 'receive': 10,
 'think': 11,
 'asset': 12,
 'wa': 13,
 '40': 14,
 '1998': 15,
 'even': 16,
 'workshop': 17,
 '28': 18,
 'dollar': 19,
 'type': 20,
 'exchange': 21,
 'minute': 22,
 'found': 23,
 'put': 24,
 '16': 25,
 'information': 26,
 'line': 27,
 'case': 28,
 '10': 29,
 'search': 30,
 '23': 31,
 'already': 32,
 'believe': 33,
 'others': 34,
 'letter': 35,
 'friday': 36,
 'else': 37,
 'group': 38,
 'request': 39,
 'december': 40,
 'marketing': 41,
 'less': 42,
 'tutorial': 43,
 'easy': 44,
 'lot': 45,
 'value': 46,
 'large': 47,
 'office': 48,
 'part': 49,
 'argument': 50,
 '26': 51,
 'japan': 52,
 'proposal': 53,
 'fund': 54,
 '01': 55,
 'income': 56,
 'sound': 57,
 'news': 58,
 'yet': 59,
 'index': 60,
 'general': 61,
 'return': 62,
 'thursday': 63,
 'l': 64,
 '05': 65,
 'usa': 66,
 'remember': 67,
 'net': 68,
 'review': 69,
 'based': 70,
 '19': 71,
 

In [28]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [29]:
# "Bag of Words" (counts vector)

# ->  http  tr  size  3d  font  br  com  td   p   b
# ->    0    1    2    3   4    5    6    7   8   9
# ->   [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]

[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]

[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0]

In [30]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1

  return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [31]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0.

In [32]:
train_df.iloc[3]

SERIAL                                                  15029
MESSAGE     reimbursement of individually billed items the...
CATEGORY                                                    0
Name: 3, dtype: object

In [33]:
def df_to_X_y(dff):
  y = dff['CATEGORY'].to_numpy().astype(int)

  message_col = dff['MESSAGE']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [34]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14907, 802), (14907,), (3727, 802), (3727,))

In [35]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
# **TF-IDF Feature Extraction**

from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer(message):
    return message_to_token_list(message)

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['MESSAGE'])

X_test_tfidf = tfidf_vectorizer.transform(test_df['MESSAGE'])

y_train_tfidf = train_df['CATEGORY'].to_numpy().astype(int)
y_test_tfidf = test_df['CATEGORY'].to_numpy().astype(int)


feature_names = tfidf_vectorizer.get_feature_names_out()

X_train_tfidf_dense = X_train_tfidf.toarray()

# View the dense matrix (first 5 rows, for example)
print(X_train_tfidf_dense[:1])





[[0. 0. 0. ... 0. 0. 0.]]


In [37]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import classification_report

# Prepare tokenized sentences from the training data
train_sentences = [message_to_token_list(msg) for msg in train_df['MESSAGE'] if isinstance(msg, str)]
test_sentences = [message_to_token_list(msg) for msg in test_df['MESSAGE'] if isinstance(msg, str)]

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, seed=1)

# Function to convert a message to a Word2Vec vector (averaged over all tokens)
def message_to_wv_vector(message, model, vector_size):
    tokens = message_to_token_list(message)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if len(vectors) == 0:  # If no tokens are found in the model, return a zero vector
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Extract Word2Vec features for training and test sets
vector_size = word2vec_model.vector_size
wv_X_train = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in train_df['MESSAGE']])
wv_X_test = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in test_df['MESSAGE']])

# Labels remain the same
wv_y_train = train_df['CATEGORY'].to_numpy().astype(int)
wv_y_test = test_df['CATEGORY'].to_numpy().astype(int)

# Verify the shape of the feature matrices
print("Word Vector Feature Matrix Shapes:")
print("Training Set:", wv_X_train.shape)
print("Test Set:", wv_X_test.shape)

Word Vector Feature Matrix Shapes:
Training Set: (14907, 100)
Test Set: (3727, 100)


In [38]:
import numpy as np
from scipy.sparse import hstack

# Combine TF-IDF and Word Vectors into a hybrid feature
hybrid_X_train = hstack([X_train_tfidf, wv_X_train])
hybrid_X_test = hstack([X_test_tfidf, wv_X_test])

# Ensure the data is in dense format if needed for certain classifiers
hybrid_X_train = hybrid_X_train.toarray()
hybrid_X_test = hybrid_X_test.toarray()

print(f"Hybrid Train Feature Shape: {hybrid_X_train.shape}")
print(f"Hybrid Test Feature Shape: {hybrid_X_test.shape}")


Hybrid Train Feature Shape: (14907, 144809)
Hybrid Test Feature Shape: (3727, 144809)


#### Bag of Words

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))


              precision    recall  f1-score   support

           0       0.62      1.00      0.76      2265
           1       0.89      0.06      0.11      1462

    accuracy                           0.63      3727
   macro avg       0.76      0.53      0.44      3727
weighted avg       0.73      0.63      0.51      3727



In [40]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2265
           1       0.93      0.95      0.94      1462

    accuracy                           0.95      3727
   macro avg       0.95      0.95      0.95      3727
weighted avg       0.95      0.95      0.95      3727



In [41]:
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_bow = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_bow.fit(X_train, y_train)

xgb_predictions_bow = xgb_bow.predict(X_test)

print("XGBoost with Bag of Words Classification Report:\n")
print(classification_report(y_test, xgb_predictions_bow))


Parameters: { "use_label_encoder" } are not used.



XGBoost with Bag of Words Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2265
           1       0.92      0.97      0.94      1462

    accuracy                           0.95      3727
   macro avg       0.95      0.96      0.95      3727
weighted avg       0.96      0.95      0.95      3727



#### TF_IDF EXTRACTION

In [42]:
# **Logistic Regression Training and Evaluation**

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Training Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
lr_predictions = lr_model.predict(X_test_tfidf)
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test_tfidf, lr_predictions))


Training Logistic Regression...
Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2265
           1       0.95      0.97      0.96      1462

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727



In [43]:
# **Random Forest Training and Evaluation**

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Training Random Forest
print("Training Random Forest...")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
rf_predictions = rf_model.predict(X_test_tfidf)
print("Random Forest Classification Report:\n")
print(classification_report(y_test_tfidf, rf_predictions))


Training Random Forest...
Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2265
           1       0.95      0.96      0.96      1462

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.97      0.96      0.96      3727



In [44]:
# Import the XGBoost model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_model.fit(X_train_tfidf, y_train_tfidf)

xgb_predictions = xgb_model.predict(X_test_tfidf)

print("XGBoost Classification Report:\n")
print(classification_report(y_test_tfidf, xgb_predictions))


Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2265
           1       0.93      0.98      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



#### Word Vector

In [45]:
from sklearn.linear_model import LogisticRegression

wv_lr_model = LogisticRegression().fit(wv_X_train, wv_y_train)
wv_predictions = wv_lr_model.predict(wv_X_test)

# Evaluate the performance
print("Logistic Regression with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, wv_predictions))

Logistic Regression with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      2265
           1       0.91      0.94      0.92      1462

    accuracy                           0.94      3727
   macro avg       0.94      0.94      0.94      3727
weighted avg       0.94      0.94      0.94      3727



In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random Forest Classifier on Word Vectors
print("Training Random Forest on Word Vectors...")
rf_wv_model = RandomForestClassifier(random_state=1)
rf_wv_model.fit(wv_X_train, wv_y_train)

rf_wv_predictions = rf_wv_model.predict(wv_X_test)

print("Random Forest with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, rf_wv_predictions))

Training Random Forest on Word Vectors...
Random Forest with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2265
           1       0.94      0.95      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.95      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [47]:
# XGBoost Classifier on Word Vectors
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

print("Training XGBoost on Word Vectors...")
xgb_wv_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1)
xgb_wv_model.fit(wv_X_train, wv_y_train)

xgb_wv_predictions = xgb_wv_model.predict(wv_X_test)

print("XGBoost with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, xgb_wv_predictions))

Training XGBoost on Word Vectors...


Parameters: { "use_label_encoder" } are not used.



XGBoost with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2265
           1       0.94      0.96      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



#### Hybrid

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression on Hybrid Features
lr_hybrid_model = LogisticRegression()
lr_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
lr_hybrid_predictions = lr_hybrid_model.predict(hybrid_X_test)
print("Logistic Regression Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, lr_hybrid_predictions))


Logistic Regression Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2265
           1       0.93      0.96      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.95      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [49]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest on Hybrid Features
rf_hybrid_model = RandomForestClassifier()
rf_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
rf_hybrid_predictions = rf_hybrid_model.predict(hybrid_X_test)
print("Random Forest Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, rf_hybrid_predictions))


Random Forest Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2265
           1       0.95      0.96      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [50]:
from xgboost import XGBClassifier

# Train XGBoost on Hybrid Features
xgb_hybrid_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
xgb_hybrid_predictions = xgb_hybrid_model.predict(hybrid_X_test)
print("XGBoost Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, xgb_hybrid_predictions))

Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2265
           1       0.95      0.97      0.96      1462

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727

