<a href="https://colab.research.google.com/github/Hemanthtak2000/22073706_Project_UoH/blob/main/Code_research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install packages that needed to execute the code
!pip install -q transformers datasets evaluate

In [None]:
# Import the libraries needed to execute the code
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Fetch the NLTK corpora used by the text-cleaning step
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop-word set consulted by Clean_the_textData
Email_StopW = set(stopwords.words('english'))

# Shared WordNet lemmatizer instance used by Clean_the_textData
Email_Lem = WordNetLemmatizer()

def Clean_the_textData(text):
    """
    Clean and preprocess a given text string.

    Steps performed:
    1. Convert text to lowercase.
    2. Remove every character that is not a lowercase ASCII letter or
       whitespace (punctuation, digits, symbols, non-ASCII letters).
    3. Tokenize the cleaned text by splitting on whitespace.
    4. Remove English stopwords (members of Email_StopW).
    5. Lemmatize each remaining word with Email_Lem.
    6. Rejoin the processed words into a single space-separated string.

    Parameters:
    text : Raw input text. Non-string values (e.g. None or NaN from a
           missing DataFrame cell) are treated as empty text.

    Returns: Cleaned and preprocessed text string ('' for non-string input).
    """
    # Missing cells arrive as None / float NaN, which have no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    tolowercase = text.lower()
    toremoveAl = re.sub(r'[^a-z\s]', '', tolowercase)
    Toremovetokens = toremoveAl.split()
    toLemmatize = [Email_Lem.lemmatize(word) for word in Toremovetokens if word not in Email_StopW]
    return ' '.join(toLemmatize)


In [None]:
# Location of the raw CSV file, kept in one named place
JOB_DATASET_URL = "https://raw.githubusercontent.com/Hemanthtak2000/ResearchAssignment/refs/heads/main/Job_Dataset.csv"

# Read the CSV into the DataFrame used throughout the notebook
Email_Spam_Data = pd.read_csv(JOB_DATASET_URL)

In [None]:
# Display first 5 rows of the dataset.
# A bare last expression lets the notebook render the DataFrame as a rich table
# instead of the plain-text dump produced by print().
Email_Spam_Data.head()

In [None]:
# Summarise the freshly loaded dataset: column names, data types and non-null counts
# (this runs before the columns are renamed in the next cell)
Email_Spam_Data.info()

In [None]:
# Rename the dataset's 'fraudulent' and 'Job_Desc' columns to the 'label' / 'text'
# names that the rest of the notebook relies on.
Email_Spam_Data = Email_Spam_Data.rename(columns={
    'fraudulent': 'label',
    'Job_Desc': 'text'
})
# rename() silently ignores source columns that do not exist, so verify the result here
# rather than failing later with a KeyError. The check stays valid when the cell is re-run.
Missing_Cols_EM = {'label', 'text'} - set(Email_Spam_Data.columns)
assert not Missing_Cols_EM, f"Expected columns missing after rename: {sorted(Missing_Cols_EM)}"
print(Email_Spam_Data.columns)

In [None]:
# Show column data types and non-null counts again, now that the columns
# have been renamed to 'label' and 'text'
Email_Spam_Data.info()

In [None]:
# Show descriptive statistics for the numerical columns of the dataset
Email_Spam_Data.describe()

In [None]:
# Print the dataset shape to show how many rows and columns it has.
Email_df_rows, Email_df_columns = Email_Spam_Data.shape
# The records are not "reviews"; report them neutrally as rows.
print(f"The shape of the original dataset is {Email_df_rows} rows with {Email_df_columns} columns.")

In [None]:
# Count the missing (NaN) values in each column
Email_Spam_Data.isna().sum()

In [None]:
# Add a new column holding the character length of each text entry.
# Missing values are counted as length 0 instead of raising TypeError (NaN has no len()),
# and the vectorised .str accessor replaces the per-row Python call.
Email_Spam_Data['Email_Len'] = Email_Spam_Data['text'].fillna('').astype(str).str.len()

In [None]:
# Run the cleaning function over every entry of the text column
Raw_Text_Col = Email_Spam_Data['text']
Email_Spam_Data['Email_Cleaned'] = Raw_Text_Col.map(Clean_the_textData)

# Put the raw and the cleaned text side by side for comparison
Email_Spam_Data.loc[:, ['text', 'Email_Cleaned']]


In [None]:
# Count the rows that are exact duplicates of an earlier row
Dup_Row_Count = Email_Spam_Data.duplicated().sum()
print(f"Number of duplicate rows in the Email Dataset : {Dup_Row_Count}")

In [None]:
# Show column names after preprocessing; the derived 'Email_Len' and
# 'Email_Cleaned' columns added above should now be listed
Email_Spam_Data.columns

In [None]:
# Display the first 5 rows of the updated dataset, including the derived columns
Email_Spam_Data.head()

In [None]:
# Count how many rows carry each class label and print the result;
# this shows how balanced or imbalanced the classes are
Cnt_Tgt = Email_Spam_Data['label'].value_counts()
print("Label Distribution:\n", Cnt_Tgt)

In [None]:
# Bar chart with the number of rows per class label
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=Email_Spam_Data, x='label', ax=ax)
ax.set_title("Email Classification Label Count")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Plot histogram (with KDE overlay) of text lengths
plt.figure(figsize=(8, 5))
sns.histplot(Email_Spam_Data['Email_Len'], bins=30, kde=True)
plt.title("Distribution of Text Lengths of Email Data")
# 'Email_Len' is the length of the raw string, i.e. a character count, not a word count
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Draw one word cloud per class label, built from the cleaned text of that class
Class_Labels_EM = sorted(Email_Spam_Data['label'].unique())
for class_label in Class_Labels_EM:
    # Concatenate all cleaned documents belonging to this class
    rows_for_class = Email_Spam_Data['label'] == class_label
    joined_text = ' '.join(Email_Spam_Data.loc[rows_for_class, 'Email_Cleaned'])
    cloud = WordCloud(width=800, height=400, background_color='white').generate(joined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(f"WordCloud for Label {class_label}")
    plt.show()

In [None]:
# Columns included in the correlation analysis
Num_Col_Email = ['label', 'Email_Len']
# Pairwise correlation between the label and the text length
Cormat_EM = Email_Spam_Data.loc[:, Num_Col_Email].corr()
# Annotated heatmap of the correlation matrix, titled on the axes it is drawn on
heatmap_ax = sns.heatmap(Cormat_EM, annot=True)
heatmap_ax.set_title('Correlation Heatmap of Label and Text Length')
plt.show()


In [None]:
from sklearn.utils import resample

# Boolean masks for the two classes (label 0 is treated as the majority class)
is_majority = Email_Spam_Data['label'] == 0
is_minority = Email_Spam_Data['label'] == 1
majority_class = Email_Spam_Data.loc[is_majority]
minority_class = Email_Spam_Data.loc[is_minority]

# Draw minority rows with replacement until there are as many as majority rows
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=majority_class.shape[0],
    random_state=42,
)

# Stack both classes, shuffle the rows reproducibly and renumber the index
Email_Spam_Data_balanced = (
    pd.concat([majority_class, minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Bar chart of the class counts in the oversampled dataset
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=Email_Spam_Data_balanced, x='label', ax=ax)
ax.set_title("Label Distribution After Oversampling")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Split dataset into training and testing sets with stratified labels.
# The split is done on the ORIGINAL data so the test set keeps the real class
# distribution and contains no duplicated (oversampled) rows.
Email_Train_Raw_df, Email_Test_df = train_test_split(
    Email_Spam_Data, test_size=0.2, random_state=2025, stratify=Email_Spam_Data['label']
)

# Previously the oversampled data was computed but never used, so the model trained on the
# imbalanced frame. Oversample the minority class inside the training split only:
# balancing before the split would put copies of the same row in both train and test.
Train_Majority_EM = Email_Train_Raw_df[Email_Train_Raw_df['label'] == 0]
Train_Minority_EM = Email_Train_Raw_df[Email_Train_Raw_df['label'] == 1]
Train_Minority_Up_EM = resample(
    Train_Minority_EM,
    replace=True,
    n_samples=len(Train_Majority_EM),
    random_state=42
)

# Combine, shuffle reproducibly and renumber the index
Email_Train_df = (
    pd.concat([Train_Majority_EM, Train_Minority_Up_EM])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Wrap both pandas splits as HuggingFace Dataset objects (train first, then test)
Em_Train_DS, Em_Test_DS = (
    Dataset.from_pandas(split_frame)
    for split_frame in (Email_Train_df, Email_Test_df)
)

In [None]:
# Name of the pretrained checkpoint; the same name is reused later to load the model
Chosen_model_EM = "bert-base-uncased"
# Load the tokenizer that belongs to that checkpoint
Em_Bert_Token = AutoTokenizer.from_pretrained(Chosen_model_EM)

In [None]:
# Batch tokenization callback for Dataset.map
def fun_Token_EM(data):
    """
    Tokenize the "text" field of a batch with the module-level tokenizer
    Em_Bert_Token.

    Every sequence is padded to the tokenizer's maximum length and longer
    sequences are truncated, so all examples end up with the same length.

    Parameters:
    data: A batch of examples exposing a "text" field.

    Returns: The tokenizer output for the batch.
    """
    batch_texts = data["text"]
    encoded_batch = Em_Bert_Token(batch_texts, padding="max_length", truncation=True)
    return encoded_batch


# Tokenize both splits in batches (train first, then test)
Em_Train_DS, Em_Test_DS = (
    split_ds.map(fun_Token_EM, batched=True)
    for split_ds in (Em_Train_DS, Em_Test_DS)
)

In [None]:
# Load the pretrained checkpoint named by Chosen_model_EM with a sequence-classification
# head sized for 2 output labels (the binary 'label' column)
Cust_Model_EM = AutoModelForSequenceClassification.from_pretrained(Chosen_model_EM, num_labels=2)

In [None]:
# Set training parameters for the model
Em_Btch_Size = 64
# Log once every "number of full batches in the training set" steps.
# Clamp to at least 1: a training set smaller than one batch would otherwise
# give logging_steps=0, which is not a usable logging interval.
EM_Log_Steps = max(1, len(Em_Train_DS) // Em_Btch_Size)
# Get the model name without path prefix (used to build the output directory name)
Chosen_model_EM = Chosen_model_EM.split("/")[-1]

In [None]:
# Fine-tuning configuration, one setting per line
EM_Trn_Args = TrainingArguments(
    output_dir=f"{Chosen_model_EM}-finetuned-custom",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=Em_Btch_Size,
    per_device_eval_batch_size=Em_Btch_Size,
    push_to_hub=False,
    # Mixed precision only when a CUDA device is available
    fp16=torch.cuda.is_available(),
    logging_steps=EM_Log_Steps,
    report_to="none",
)

In [None]:
# Load the accuracy metric from the HuggingFace evaluate library;
# Cal_Metrics_EM below uses it to score predictions
Acc_Met_EM = evaluate.load("accuracy")

# Metric callback handed to the Trainer
def Cal_Metrics_EM(eval_pred):
    """
    Compute the accuracy of a set of model predictions.

    Parameters:
    eval_pred : A pair (logits, labels) holding the raw model outputs and
    the true labels.

    Returns: The result of the accuracy metric's compute() call.
    """
    raw_logits, true_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    logits_tensor = torch.tensor(raw_logits)
    predicted_classes = torch.argmax(logits_tensor, dim=-1)
    return Acc_Met_EM.compute(predictions=predicted_classes, references=true_labels)

In [None]:
# Build a HuggingFace Trainer that ties together the model, the training
# arguments, the tokenized train/test datasets and the accuracy callback
EM_Trainer_det = Trainer(
    model=Cust_Model_EM,
    args=EM_Trn_Args,
    train_dataset=Em_Train_DS,
    eval_dataset=Em_Test_DS,
    compute_metrics=Cal_Metrics_EM
)
# Start training (fine-tunes Cust_Model_EM on Em_Train_DS)
EM_Trainer_det.train()

In [None]:
# Score the fine-tuned model on the Trainer's evaluation dataset (the test split)
EM_Model_Eval = EM_Trainer_det.evaluate()
# Report every returned metric with four decimal places
print("Evaluation Results of the Model on Email Spam Data :")
for metric_name in EM_Model_Eval:
    metric_value = EM_Model_Eval[metric_name]
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Run the model over the test split and keep the true labels
EM_Model_Pred = EM_Trainer_det.predict(Em_Test_DS)
Org_Labels = EM_Model_Pred.label_ids
# The predicted class is the position of the largest logit in each row
Pred_Logits_EM = torch.tensor(EM_Model_Pred.predictions)
EM_Pred_Labels = Pred_Logits_EM.argmax(dim=1).numpy()
# Build the confusion matrix (true vs predicted) and draw it
Eval_EM_ConfMat = confusion_matrix(Org_Labels, EM_Pred_Labels)
Disp_EM_Confmat = ConfusionMatrixDisplay(confusion_matrix=Eval_EM_ConfMat)
Disp_EM_Confmat.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix for the Email Spam Detection Evaluation")
plt.show()


In [None]:
# Get the raw text of the test split
EM_testing_Samples = Em_Test_DS["text"]
# Display up to 5 prediction samples with their true and predicted labels
print("\nPredicting the Labels with the Trained Models on Test Data Samples:\n")
# Never index past the end when the test split holds fewer than 5 rows
EM_Num_Samples = min(5, len(EM_Pred_Labels))
for i in range(EM_Num_Samples):
    print(f"Text: {EM_testing_Samples[i]}")
    print(f"Predicted Label : {EM_Pred_Labels[i]}, True Label: {Org_Labels[i]}")
    print("-" * 60)
