In [4]:
# Load the dataset (https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp)

import pandas as pd

df = pd.read_csv('Phishing_Email.csv')

df.head()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,1,the other side of * galicismos * * galicismo *...,0
2,2,re : equistar deal tickets are you still avail...,0
3,3,\r\nHello I am your hot lil horny toy.\r\n ...,1
4,4,software at incredibly low prices ( 86 % lower...,1


In [5]:
df.tail()

Unnamed: 0,SERIAL,MESSAGE,CATEGORY
18629,18646,date a lonely housewife always wanted to date ...,1
18630,18647,request submitted : access request for anita ....,0
18631,18648,"re : important - prc mtg hi dorn & john , as y...",0
18632,18649,press clippings - letter on californian utilit...,0
18633,18650,empty,1


In [6]:
df['CATEGORY'].value_counts()

CATEGORY
0    11322
1     7312
Name: count, dtype: int64

In [7]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [9]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [10]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [11]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [12]:
def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [13]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8)
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(       SERIAL                                            MESSAGE  CATEGORY
 0       13454  fw : priority customer list this is not going ...         0
 1       14141  On Tue, 30 Jul 2002 22:22:24 +0200, "Manfred G...         0
 2       17106  re : [ penetrative ] 86 % - off vicodin . dono...         1
 3       13497  btu ' s daily power report - eastern edition a...         0
 4       17348  On Tue, 27 Aug 2002, David Neary wrote:> > Act...         0
 ...       ...                                                ...       ...
 14902    8884  -----Original Message----- > From: Aherne Pete...         0
 14903    7018  re : credit trading brought to you by bryan se...         0
 14904    9022  review of perez - leroux & glass ( ed ) contem...         0
 14905    6925  enron actuals for july 7 thru 9 , 2000 july 7 ...         0
 14906   13046  re : thursday visit frank , we shall have abou...         0
 
 [14907 rows x 3 columns],
       SERIAL                                            ME

In [14]:
import numpy as np

token_counter = {}

# Iterate over each message in the 'MESSAGE' column
for message in train_df['MESSAGE']:
    if isinstance(message, str):  # Check if message is a string
        message_as_token_lst = message_to_token_list(message)  # Convert message to a list of tokens

        # Count occurrences of each token
        for token in message_as_token_lst:
            if token in token_counter:
                token_counter[token] += 1
            else:
                token_counter[token] = 1

len(token_counter)


140862

In [15]:
token_counter

{'fw': 582,
 'priority': 314,
 'customer': 2086,
 'list': 11436,
 'going': 2023,
 'work': 7854,
 'want': 4565,
 'control': 1358,
 'like': 7945,
 'used': 3240,
 'enron': 16368,
 'please': 11825,
 'let': 3215,
 'know': 6078,
 'bill': 1491,
 'original': 2557,
 'message': 6095,
 'forster': 66,
 'david': 1896,
 'sent': 4127,
 'friday': 1589,
 'february': 1084,
 '01': 3184,
 '2002': 3521,
 '2': 15186,
 '53': 299,
 'pm': 2841,
 'bradford': 88,
 'william': 556,
 'brackett': 21,
 'debbie': 70,
 'r': 4269,
 'pat': 206,
 'odonnell': 8,
 'ubsw': 35,
 'com': 15812,
 'louis': 175,
 'eber': 11,
 'glass': 91,
 'colette': 31,
 'dow': 1108,
 'cc': 2819,
 'kitchen': 555,
 'louise': 934,
 'subject': 11443,
 'attached': 1198,
 'intended': 853,
 'pre': 1038,
 'approved': 581,
 'company': 10046,
 'ready': 1219,
 'launch': 211,
 'day': 7004,
 'includes': 1150,
 'u': 13125,
 'canadian': 376,
 'entity': 411,
 'gas': 2959,
 'power': 3024,
 'recently': 892,
 'told': 849,
 'individual': 1690,
 'legal': 1389,
 'nam

In [16]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    return token_counter[proccessed_token] > threshold

keep_token('random', 100)

True

In [17]:
features = set()

for token in token_counter:
  if keep_token(token, 1000):
    features.add(token)

features

{'0',
 '00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '08',
 '09',
 '1',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '35',
 '3d',
 '4',
 '40',
 '44',
 '45',
 '49',
 '5',
 '50',
 '500',
 '6',
 '60',
 '7',
 '8',
 '9',
 '90',
 '95',
 '97',
 '98',
 '99',
 '_',
 'able',
 'abstract',
 'ac',
 'accepted',
 'access',
 'account',
 'acquisition',
 'act',
 'action',
 'actually',
 'ad',
 'add',
 'additional',
 'address',
 'ago',
 'agreement',
 'already',
 'also',
 'although',
 'always',
 'america',
 'american',
 'among',
 'amount',
 'analysis',
 'analyst',
 'announcement',
 'another',
 'answer',
 'anyone',
 'anything',
 'application',
 'applied',
 'approach',
 'april',
 'area',
 'argument',
 'around',
 'article',
 'ask',
 'asked',
 'aspect',
 'asset',
 'association',
 'attached',
 'august

In [18]:
features = list(features)
features

['away',
 'bulk',
 'learn',
 'volume',
 'week',
 'august',
 'sentence',
 '10',
 'internet',
 'direct',
 'package',
 'certain',
 '13',
 'native',
 'enough',
 'subscription',
 'paul',
 'sourceforge',
 'applied',
 '4',
 'never',
 'news',
 'total',
 'day',
 'multi',
 '9',
 'dollar',
 'michael',
 'society',
 'proceeding',
 '00',
 'remove',
 'month',
 'save',
 'rpm',
 'matter',
 'number',
 'many',
 'pm',
 'mailing',
 'discourse',
 'discussion',
 'session',
 'london',
 'capital',
 'people',
 'status',
 'section',
 'transaction',
 'speech',
 'life',
 'tell',
 'said',
 'go',
 'fee',
 'february',
 '2002',
 'among',
 'st',
 'proposal',
 'including',
 'object',
 'social',
 'course',
 'due',
 'regarding',
 'context',
 'org',
 'inc',
 'book',
 'based',
 'want',
 'web',
 'market',
 'david',
 'account',
 'keep',
 'database',
 'believe',
 'email',
 'ï',
 'immediately',
 'reference',
 'microsoft',
 'specific',
 'click',
 'grant',
 'whole',
 'loss',
 'department',
 'claim',
 '22',
 'three',
 'john',
 'â'

In [19]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'away': 0,
 'bulk': 1,
 'learn': 2,
 'volume': 3,
 'week': 4,
 'august': 5,
 'sentence': 6,
 '10': 7,
 'internet': 8,
 'direct': 9,
 'package': 10,
 'certain': 11,
 '13': 12,
 'native': 13,
 'enough': 14,
 'subscription': 15,
 'paul': 16,
 'sourceforge': 17,
 'applied': 18,
 '4': 19,
 'never': 20,
 'news': 21,
 'total': 22,
 'day': 23,
 'multi': 24,
 '9': 25,
 'dollar': 26,
 'michael': 27,
 'society': 28,
 'proceeding': 29,
 '00': 30,
 'remove': 31,
 'month': 32,
 'save': 33,
 'rpm': 34,
 'matter': 35,
 'number': 36,
 'many': 37,
 'pm': 38,
 'mailing': 39,
 'discourse': 40,
 'discussion': 41,
 'session': 42,
 'london': 43,
 'capital': 44,
 'people': 45,
 'status': 46,
 'section': 47,
 'transaction': 48,
 'speech': 49,
 'life': 50,
 'tell': 51,
 'said': 52,
 'go': 53,
 'fee': 54,
 'february': 55,
 '2002': 56,
 'among': 57,
 'st': 58,
 'proposal': 59,
 'including': 60,
 'object': 61,
 'social': 62,
 'course': 63,
 'due': 64,
 'regarding': 65,
 'context': 66,
 'org': 67,
 'inc': 68,
 'bo

In [20]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [21]:
# "Bag of Words" (counts vector)

# ->  http  tr  size  3d  font  br  com  td   p   b
# ->    0    1    2    3   4    5    6    7   8   9
# ->   [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]

[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]

[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0]

In [22]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1

  return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [23]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1.

In [24]:
train_df.iloc[3]

SERIAL                                                  13497
MESSAGE     btu ' s daily power report - eastern edition a...
CATEGORY                                                    0
Name: 3, dtype: object

In [25]:
def df_to_X_y(dff):
  y = dff['CATEGORY'].to_numpy().astype(int)

  message_col = dff['MESSAGE']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [26]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14907, 813), (14907,), (3727, 813), (3727,))

In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.00110132, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [28]:
# **TF-IDF Feature Extraction**

from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer(message):
    return message_to_token_list(message)

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['MESSAGE'])

X_test_tfidf = tfidf_vectorizer.transform(test_df['MESSAGE'])

y_train_tfidf = train_df['CATEGORY'].to_numpy().astype(int)
y_test_tfidf = test_df['CATEGORY'].to_numpy().astype(int)


feature_names = tfidf_vectorizer.get_feature_names_out()

X_train_tfidf_dense = X_train_tfidf.toarray()

# View the dense matrix (first 5 rows, for example)
print(X_train_tfidf_dense[:1])





[[0. 0. 0. ... 0. 0. 0.]]


In [29]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import classification_report

# Prepare tokenized sentences from the training data
train_sentences = [message_to_token_list(msg) for msg in train_df['MESSAGE'] if isinstance(msg, str)]
test_sentences = [message_to_token_list(msg) for msg in test_df['MESSAGE'] if isinstance(msg, str)]

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, seed=1)

# Function to convert a message to a Word2Vec vector (averaged over all tokens)
def message_to_wv_vector(message, model, vector_size):
    tokens = message_to_token_list(message)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if len(vectors) == 0:  # If no tokens are found in the model, return a zero vector
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Extract Word2Vec features for training and test sets
vector_size = word2vec_model.vector_size
wv_X_train = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in train_df['MESSAGE']])
wv_X_test = np.array([message_to_wv_vector(msg, word2vec_model, vector_size) for msg in test_df['MESSAGE']])

# Labels remain the same
wv_y_train = train_df['CATEGORY'].to_numpy().astype(int)
wv_y_test = test_df['CATEGORY'].to_numpy().astype(int)

# Verify the shape of the feature matrices
print("Word Vector Feature Matrix Shapes:")
print("Training Set:", wv_X_train.shape)
print("Test Set:", wv_X_test.shape)

Word Vector Feature Matrix Shapes:
Training Set: (14907, 100)
Test Set: (3727, 100)


In [30]:
import numpy as np
from scipy.sparse import hstack

# Combine TF-IDF and Word Vectors into a hybrid feature
hybrid_X_train = hstack([X_train_tfidf, wv_X_train])
hybrid_X_test = hstack([X_test_tfidf, wv_X_test])

# Ensure the data is in dense format if needed for certain classifiers
hybrid_X_train = hybrid_X_train.toarray()
hybrid_X_test = hybrid_X_test.toarray()

print(f"Hybrid Train Feature Shape: {hybrid_X_train.shape}")
print(f"Hybrid Test Feature Shape: {hybrid_X_test.shape}")


Hybrid Train Feature Shape: (14907, 140962)
Hybrid Test Feature Shape: (3727, 140962)


#### Bag of Words

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))


              precision    recall  f1-score   support

           0       0.63      0.99      0.77      2290
           1       0.86      0.05      0.10      1437

    accuracy                           0.63      3727
   macro avg       0.74      0.52      0.43      3727
weighted avg       0.72      0.63      0.51      3727



In [32]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2290
           1       0.93      0.95      0.94      1437

    accuracy                           0.95      3727
   macro avg       0.95      0.95      0.95      3727
weighted avg       0.95      0.95      0.95      3727



In [35]:
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_bow = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_bow.fit(X_train, y_train)

xgb_predictions_bow = xgb_bow.predict(X_test)

print("XGBoost with Bag of Words Classification Report:\n")
print(classification_report(y_test, xgb_predictions_bow))


Parameters: { "use_label_encoder" } are not used.



XGBoost with Bag of Words Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2290
           1       0.92      0.97      0.94      1437

    accuracy                           0.95      3727
   macro avg       0.95      0.96      0.95      3727
weighted avg       0.96      0.95      0.95      3727



#### TF_IDF EXTRACTION

In [36]:
# **Logistic Regression Training and Evaluation**

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Training Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
lr_predictions = lr_model.predict(X_test_tfidf)
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test_tfidf, lr_predictions))


Training Logistic Regression...
Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2290
           1       0.94      0.98      0.96      1437

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727



In [37]:
# **Random Forest Training and Evaluation**

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Training Random Forest
print("Training Random Forest...")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
rf_predictions = rf_model.predict(X_test_tfidf)
print("Random Forest Classification Report:\n")
print(classification_report(y_test_tfidf, rf_predictions))


Training Random Forest...
Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2290
           1       0.95      0.96      0.96      1437

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727



In [38]:
# Import the XGBoost model
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

xgb_model.fit(X_train_tfidf, y_train_tfidf)

xgb_predictions = xgb_model.predict(X_test_tfidf)

print("XGBoost Classification Report:\n")
print(classification_report(y_test_tfidf, xgb_predictions))


Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      2290
           1       0.93      0.98      0.96      1437

    accuracy                           0.96      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.96      0.96      3727



#### Word Vector

In [39]:
from sklearn.linear_model import LogisticRegression

wv_lr_model = LogisticRegression().fit(wv_X_train, wv_y_train)
wv_predictions = wv_lr_model.predict(wv_X_test)

# Evaluate the performance
print("Logistic Regression with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, wv_predictions))

Logistic Regression with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2290
           1       0.92      0.95      0.94      1437

    accuracy                           0.95      3727
   macro avg       0.94      0.95      0.95      3727
weighted avg       0.95      0.95      0.95      3727



In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random Forest Classifier on Word Vectors
print("Training Random Forest on Word Vectors...")
rf_wv_model = RandomForestClassifier(random_state=1)
rf_wv_model.fit(wv_X_train, wv_y_train)

rf_wv_predictions = rf_wv_model.predict(wv_X_test)

print("Random Forest with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, rf_wv_predictions))

Training Random Forest on Word Vectors...
Random Forest with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2290
           1       0.94      0.95      0.95      1437

    accuracy                           0.96      3727
   macro avg       0.95      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [41]:
# XGBoost Classifier on Word Vectors
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

print("Training XGBoost on Word Vectors...")
xgb_wv_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1)
xgb_wv_model.fit(wv_X_train, wv_y_train)

xgb_wv_predictions = xgb_wv_model.predict(wv_X_test)

print("XGBoost with Word Vectors Classification Report:\n")
print(classification_report(wv_y_test, xgb_wv_predictions))

Training XGBoost on Word Vectors...


Parameters: { "use_label_encoder" } are not used.



XGBoost with Word Vectors Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2290
           1       0.94      0.97      0.95      1437

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



#### Hybrid

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression on Hybrid Features
lr_hybrid_model = LogisticRegression()
lr_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
lr_hybrid_predictions = lr_hybrid_model.predict(hybrid_X_test)
print("Logistic Regression Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, lr_hybrid_predictions))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2290
           1       0.94      0.97      0.95      1437

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [43]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest on Hybrid Features
rf_hybrid_model = RandomForestClassifier()
rf_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
rf_hybrid_predictions = rf_hybrid_model.predict(hybrid_X_test)
print("Random Forest Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, rf_hybrid_predictions))


Random Forest Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2290
           1       0.95      0.96      0.96      1437

    accuracy                           0.97      3727
   macro avg       0.97      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727



In [44]:
from xgboost import XGBClassifier

# Train XGBoost on Hybrid Features
xgb_hybrid_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_hybrid_model.fit(hybrid_X_train, y_train_tfidf)

# Predict and evaluate
xgb_hybrid_predictions = xgb_hybrid_model.predict(hybrid_X_test)
print("XGBoost Classification Report (Hybrid Features):\n")
print(classification_report(y_test_tfidf, xgb_hybrid_predictions))

Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report (Hybrid Features):

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2290
           1       0.95      0.98      0.97      1437

    accuracy                           0.97      3727
   macro avg       0.97      0.98      0.97      3727
weighted avg       0.98      0.97      0.97      3727

