In [1]:
# Load the dataset (https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp)

import pandas as pd

df = pd.read_csv('Phishing_Email.csv')

df.head()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,1,the other side of * galicismos * * galicismo *...,0
2,2,re : equistar deal tickets are you still avail...,0
3,3,\r\nHello I am your hot lil horny toy.\r\n ...,1
4,4,software at incredibly low prices ( 86 % lower...,1


In [2]:
df.tail()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
18629,18646,date a lonely housewife always wanted to date ...,1
18630,18647,request submitted : access request for anita ....,0
18631,18648,"re : important - prc mtg hi dorn & john , as y...",0
18632,18649,press clippings - letter on californian utilit...,0
18633,18650,empty,1


In [3]:
df['CATEGORY'].value_counts()

CATEGORY
0    11322
1     7312
Name: count, dtype: int64

In [4]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [6]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [7]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [8]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [9]:
def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [10]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,         # 20% of data will go to the test set
    stratify=df['CATEGORY'], # Ensures proportional class distribution
    random_state=1         # Ensures reproducibility
)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

train_df, test_df

Train shape: (14907, 3)
Test shape: (3727, 3)


(       SERIAL                                            MESSAGE  CATEGORY
 0         128  blue horseshoe meet me dear reader : we someti...         1
 1        4205  the national association for honesty in medici...         1
 2        7943  web pages load 300 % faster without cable or d...         1
 3       15029  reimbursement of individually billed items the...         0
 4         609  lp deal bill - please come see me this morning...         0
 ...       ...                                                ...       ...
 14902    1175  200 summary on reduplication a month before ch...         0
 14903   11581  \r\nYannick Gingras wrote:>    I am wondering ...         0
 14904   13132  URL: http://www.newsisfree.com/click/215,9,215...         0
 14905    3430  global operations controller forum i believe n...         0
 14906   11247  electronic pay stubs get ready . beginning in ...         0
 
 [14907 rows x 3 columns],
       SERIAL                                            ME

In [11]:
import numpy as np

token_counter = {}

# Iterate over each message in the 'MESSAGE' column
for message in train_df['MESSAGE']:
    if isinstance(message, str):  # Check if message is a string
        message_as_token_lst = message_to_token_list(message)  # Convert message to a list of tokens

        # Count occurrences of each token
        for token in message_as_token_lst:
            if token in token_counter:
                token_counter[token] += 1
            else:
                token_counter[token] = 1

len(token_counter)


144709

In [12]:
token_counter

{'blue': 228,
 'horseshoe': 3,
 'meet': 897,
 'dear': 1137,
 'reader': 878,
 'sometimes': 434,
 'approach': 2218,
 'analyst': 1128,
 'thought': 1316,
 'emerging': 216,
 'market': 3782,
 'sector': 275,
 'interested': 2324,
 'certain': 1138,
 'occasion': 159,
 'come': 3012,
 'u': 13191,
 'intriguing': 49,
 'insight': 264,
 'aspect': 1778,
 'caught': 105,
 'attention': 853,
 'know': 6021,
 'track': 776,
 'record': 1079,
 'speaks': 80,
 'happy': 751,
 'bring': 907,
 'another': 2189,
 'situation': 893,
 'huge': 608,
 'upside': 52,
 'potential': 1175,
 'think': 3385,
 'could': 5083,
 'one': 14148,
 'look': 2802,
 'back': 3019,
 'shortly': 175,
 'everyone': 1038,
 'saying': 530,
 'info': 1714,
 'click': 3821,
 'remember': 1191,
 'nothing': 1158,
 'ventured': 7,
 'gained': 144,
 'national': 1589,
 'association': 1114,
 'honesty': 56,
 'medicine': 191,
 'stemcaoiwz': 1,
 'body': 833,
 'many': 4857,
 'cry': 65,
 'water': 445,
 'dr': 1865,
 'f': 2039,
 'batmanghelidj': 2,
 'title': 2118,
 'multi'

In [13]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    return token_counter[proccessed_token] > threshold

keep_token('random', 100)

True

In [14]:
features = set()

for token in token_counter:
  if keep_token(token, 1000):
    features.add(token)

features

{'0',
 '00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '07',
 '08',
 '09',
 '1',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '35',
 '3d',
 '4',
 '40',
 '44',
 '45',
 '49',
 '5',
 '50',
 '500',
 '6',
 '60',
 '7',
 '8',
 '9',
 '90',
 '95',
 '97',
 '98',
 '99',
 '_',
 'able',
 'abstract',
 'ac',
 'accepted',
 'access',
 'account',
 'acquisition',
 'act',
 'action',
 'actually',
 'ad',
 'add',
 'additional',
 'address',
 'ago',
 'agreement',
 'already',
 'also',
 'although',
 'always',
 'america',
 'american',
 'among',
 'amount',
 'analysis',
 'analyst',
 'announcement',
 'another',
 'answer',
 'anyone',
 'anything',
 'application',
 'applied',
 'approach',
 'april',
 'area',
 'argument',
 'around',
 'article',
 'ask',
 'asked',
 'aspect',
 'asset',
 'association',
 'attached',
 

In [15]:
features = list(features)
features

['general',
 'bill',
 'vowel',
 'unsubscribe',
 'including',
 'approach',
 'package',
 'john',
 'phone',
 '13',
 'cognitive',
 'linguist',
 'school',
 'less',
 'person',
 'cover',
 'used',
 'link',
 'financial',
 'programme',
 'law',
 'type',
 'code',
 'uk',
 'back',
 'top',
 'cannot',
 'france',
 '99',
 'development',
 'application',
 'g',
 'long',
 'month',
 'movement',
 'difference',
 'clear',
 'tel',
 'paid',
 'cash',
 'included',
 'tuesday',
 'html',
 'discourse',
 'hundred',
 'mail',
 'ilug',
 'south',
 'speech',
 'investor',
 'customer',
 'particular',
 'sourceforge',
 'run',
 'ac',
 'personal',
 'search',
 'tutorial',
 'spamassassin',
 'market',
 'go',
 'short',
 '2002',
 'k',
 'submission',
 'full',
 'form',
 'set',
 'billion',
 'software',
 'family',
 'question',
 'sure',
 'latest',
 '40',
 'user',
 'sign',
 'get',
 'hard',
 'doe',
 'thank',
 'website',
 'topic',
 'style',
 'received',
 'reading',
 'ad',
 'loss',
 'week',
 'plus',
 '29',
 'price',
 'card',
 '27',
 '24',
 'pre

In [16]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'general': 0,
 'bill': 1,
 'vowel': 2,
 'unsubscribe': 3,
 'including': 4,
 'approach': 5,
 'package': 6,
 'john': 7,
 'phone': 8,
 '13': 9,
 'cognitive': 10,
 'linguist': 11,
 'school': 12,
 'less': 13,
 'person': 14,
 'cover': 15,
 'used': 16,
 'link': 17,
 'financial': 18,
 'programme': 19,
 'law': 20,
 'type': 21,
 'code': 22,
 'uk': 23,
 'back': 24,
 'top': 25,
 'cannot': 26,
 'france': 27,
 '99': 28,
 'development': 29,
 'application': 30,
 'g': 31,
 'long': 32,
 'month': 33,
 'movement': 34,
 'difference': 35,
 'clear': 36,
 'tel': 37,
 'paid': 38,
 'cash': 39,
 'included': 40,
 'tuesday': 41,
 'html': 42,
 'discourse': 43,
 'hundred': 44,
 'mail': 45,
 'ilug': 46,
 'south': 47,
 'speech': 48,
 'investor': 49,
 'customer': 50,
 'particular': 51,
 'sourceforge': 52,
 'run': 53,
 'ac': 54,
 'personal': 55,
 'search': 56,
 'tutorial': 57,
 'spamassassin': 58,
 'market': 59,
 'go': 60,
 'short': 61,
 '2002': 62,
 'k': 63,
 'submission': 64,
 'full': 65,
 'form': 66,
 'set': 67,
 'b

In [17]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [18]:
# "Bag of Words" (counts vector)

# ->  http  tr  size  3d  font  br  com  td   p   b
# ->    0    1    2    3   4    5    6    7   8   9
# ->   [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]

[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]

[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0]

In [19]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1

  return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [20]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [21]:
train_df.iloc[3]

SERIAL                                                  15029
MESSAGE     reimbursement of individually billed items the...
CATEGORY                                                    0
Name: 3, dtype: object

In [22]:
def df_to_X_y(dff):
  y = dff['CATEGORY'].to_numpy().astype(int)

  message_col = dff['MESSAGE']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [23]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14907, 802), (14907,), (3727, 802), (3727,))

In [24]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00025497,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00076492,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00241546, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [25]:
# **TF-IDF Feature Extraction**

from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer(message):
    return message_to_token_list(message)

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['MESSAGE'])

X_test_tfidf = tfidf_vectorizer.transform(test_df['MESSAGE'])

y_train_tfidf = train_df['CATEGORY'].to_numpy().astype(int)
y_test_tfidf = test_df['CATEGORY'].to_numpy().astype(int)


feature_names = tfidf_vectorizer.get_feature_names_out()

X_train_tfidf_dense = X_train_tfidf.toarray()

# View the dense matrix (first 5 rows, for example)
print(X_train_tfidf_dense[:1])





[[0. 0. 0. ... 0. 0. 0.]]


In [26]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import classification_report

# Prepare tokenized sentences from the training data
train_sentences = [message_to_token_list(msg) for msg in train_df['MESSAGE'] if isinstance(msg, str)]
test_sentences = [message_to_token_list(msg) for msg in test_df['MESSAGE'] if isinstance(msg, str)]

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, seed=1)

# Function to convert a message to a Word2Vec vector (averaged over all tokens)
def message_to_wv_vector(message, model, vector_size):
    tokens = message_to_token_list(message)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if len(vectors) == 0:  # If no tokens are found in the model, return a zero vector
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Extract Word2Vec features for training and test sets
vector_size = word2vec_model.vector_size
wv_X_train = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in train_df['MESSAGE']])
wv_X_test = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in test_df['MESSAGE']])

# Labels remain the same
wv_y_train = train_df['CATEGORY'].to_numpy().astype(int)
wv_y_test = test_df['CATEGORY'].to_numpy().astype(int)

# Verify the shape of the feature matrices
print("Word Vector Feature Matrix Shapes:")
print("Training Set:", wv_X_train.shape)
print("Test Set:", wv_X_test.shape)

Word Vector Feature Matrix Shapes:
Training Set: (14907, 100)
Test Set: (3727, 100)


In [27]:
import numpy as np
from scipy.sparse import hstack

# Combine TF-IDF and Word Vectors into a hybrid feature
hybrid_X_train = hstack([X_train_tfidf, wv_X_train])
hybrid_X_test = hstack([X_test_tfidf, wv_X_test])

# Ensure the data is in dense format if needed for certain classifiers
hybrid_X_train = hybrid_X_train.toarray()
hybrid_X_test = hybrid_X_test.toarray()

print(f"Hybrid Train Feature Shape: {hybrid_X_train.shape}")
print(f"Hybrid Test Feature Shape: {hybrid_X_test.shape}")


Hybrid Train Feature Shape: (14907, 144809)
Hybrid Test Feature Shape: (3727, 144809)


#### Bag of Words

In [28]:
# Logistic Regression using Bag of Words Feature Extraction
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test), digits=4))


              precision    recall  f1-score   support

           0     0.6209    0.9956    0.7648      2265
           1     0.8947    0.0581    0.1092      1462

    accuracy                         0.6279      3727
   macro avg     0.7578    0.5269    0.4370      3727
weighted avg     0.7283    0.6279    0.5076      3727



In [29]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test), digits=4))


              precision    recall  f1-score   support

           0     0.9690    0.9532    0.9611      2265
           1     0.9293    0.9528    0.9409      1462

    accuracy                         0.9530      3727
   macro avg     0.9492    0.9530    0.9510      3727
weighted avg     0.9534    0.9530    0.9531      3727



In [30]:
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_bow = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_bow.fit(X_train, y_train)

xgb_predictions_bow = xgb_bow.predict(X_test)

print(classification_report(y_test, xgb_predictions_bow, digits=4))



Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0     0.9834    0.9413    0.9619      2265
           1     0.9147    0.9754    0.9441      1462

    accuracy                         0.9547      3727
   macro avg     0.9490    0.9583    0.9530      3727
weighted avg     0.9564    0.9547    0.9549      3727



#### TF_IDF EXTRACTION

In [31]:
# **Logistic Regression Training and Evaluation**

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train_tfidf)

lr_predictions = lr_model.predict(X_test_tfidf)

print(classification_report(y_test_tfidf, lr_predictions, digits=4))



              precision    recall  f1-score   support

           0     0.9820    0.9651    0.9735      2265
           1     0.9474    0.9726    0.9598      1462

    accuracy                         0.9681      3727
   macro avg     0.9647    0.9689    0.9667      3727
weighted avg     0.9684    0.9681    0.9681      3727



In [32]:
# **Random Forest Training and Evaluation**

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train_tfidf)

rf_predictions = rf_model.predict(X_test_tfidf)

print(classification_report(y_test_tfidf, rf_predictions, digits=4))



              precision    recall  f1-score   support

           0     0.9711    0.9660    0.9686      2265
           1     0.9478    0.9555    0.9516      1462

    accuracy                         0.9619      3727
   macro avg     0.9595    0.9608    0.9601      3727
weighted avg     0.9620    0.9619    0.9619      3727



In [33]:
# Import the XGBoost model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_model.fit(X_train_tfidf, y_train_tfidf)

xgb_predictions = xgb_model.predict(X_test_tfidf)

print(classification_report(y_test_tfidf, xgb_predictions, digits=4))



Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0     0.9854    0.9519    0.9683      2265
           1     0.9292    0.9781    0.9530      1462

    accuracy                         0.9622      3727
   macro avg     0.9573    0.9650    0.9607      3727
weighted avg     0.9633    0.9622    0.9623      3727



#### Word Vector

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

wv_lr_model = LogisticRegression().fit(wv_X_train, wv_y_train)
wv_predictions = wv_lr_model.predict(wv_X_test)

print(classification_report(wv_y_test, wv_predictions, digits=4))


              precision    recall  f1-score   support

           0     0.9593    0.9475    0.9534      2265
           1     0.9201    0.9378    0.9289      1462

    accuracy                         0.9437      3727
   macro avg     0.9397    0.9426    0.9411      3727
weighted avg     0.9439    0.9437    0.9437      3727



In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_wv_model = RandomForestClassifier(random_state=1)
rf_wv_model.fit(wv_X_train, wv_y_train)

rf_wv_predictions = rf_wv_model.predict(wv_X_test)

print(classification_report(wv_y_test, rf_wv_predictions, digits=4))


              precision    recall  f1-score   support

           0     0.9718    0.9572    0.9644      2265
           1     0.9352    0.9569    0.9459      1462

    accuracy                         0.9571      3727
   macro avg     0.9535    0.9570    0.9552      3727
weighted avg     0.9574    0.9571    0.9572      3727



In [36]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_wv_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1)
xgb_wv_model.fit(wv_X_train, wv_y_train)

xgb_wv_predictions = xgb_wv_model.predict(wv_X_test)

print(classification_report(wv_y_test, xgb_wv_predictions, digits=4))


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0     0.9745    0.9611    0.9678      2265
           1     0.9411    0.9610    0.9509      1462

    accuracy                         0.9611      3727
   macro avg     0.9578    0.9611    0.9594      3727
weighted avg     0.9614    0.9611    0.9612      3727



#### Hybrid

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr_hybrid_model = LogisticRegression()
lr_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

lr_hybrid_predictions = lr_hybrid_model.predict(hybrid_X_test)
print(classification_report(y_test_tfidf, lr_hybrid_predictions, digits=4))


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_hybrid_model = RandomForestClassifier()
rf_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

rf_hybrid_predictions = rf_hybrid_model.predict(hybrid_X_test)
print(classification_report(y_test_tfidf, rf_hybrid_predictions, digits=4))


              precision    recall  f1-score   support

           0     0.9724    0.9647    0.9685      2265
           1     0.9459    0.9576    0.9517      1462

    accuracy                         0.9619      3727
   macro avg     0.9592    0.9611    0.9601      3727
weighted avg     0.9620    0.9619    0.9619      3727



In [57]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_hybrid_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

xgb_hybrid_predictions = xgb_hybrid_model.predict(hybrid_X_test)
print("XGBoost Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, xgb_hybrid_predictions, digits=4))


Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0     0.9829    0.9642    0.9735      2265
           1     0.9462    0.9740    0.9599      1462

    accuracy                         0.9681      3727
   macro avg     0.9645    0.9691    0.9667      3727
weighted avg     0.9685    0.9681    0.9681      3727



In [31]:
import numpy as np
from tensorflow.keras.utils import Sequence

# Define a custom data generator
class ChunkedDataGenerator(Sequence):
    def __init__(self, X_data, y_data, batch_size, chunk_size):
        self.X_data = X_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.num_chunks = len(X_data) // chunk_size
        
    def __len__(self):
        return self.num_chunks
    
    def __getitem__(self, index):
        # Define chunk indices
        start_idx = index * self.chunk_size
        end_idx = (index + 1) * self.chunk_size
        
        # Load a chunk of the data
        X_chunk = self.X_data[start_idx:end_idx]
        y_chunk = self.y_data[start_idx:end_idx]
        
        # Reshape the chunk for CNN (if needed)
        X_chunk_reshaped = X_chunk.reshape(X_chunk.shape[0], X_chunk.shape[1], 1)
        
        return X_chunk_reshaped, y_chunk
    
    def on_epoch_end(self):
        # Optionally shuffle the dataset at the end of each epoch
        pass

# Parameters
chunk_size = 1000  # Adjust this based on your memory constraints
batch_size = 32

# Initialize the data generator for training and testing
train_generator = ChunkedDataGenerator(hybrid_X_train, y_train_cnn, batch_size, chunk_size)
test_generator = ChunkedDataGenerator(hybrid_X_test, y_test_cnn, batch_size, chunk_size)

# Define and train the model
cnn_hybrid_model = build_cnn_model(hybrid_X_train.shape[1:])
cnn_hybrid_model.fit(train_generator, epochs=10, validation_data=test_generator)

# Evaluate the model
loss, accuracy = cnn_hybrid_model.evaluate(test_generator)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


MemoryError: Unable to allocate 8.04 GiB for an array with shape (14907, 144809, 1) and data type float32