<a href="https://colab.research.google.com/github/Hemanthtak2000/ResearchAssignment/blob/main/Code_research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install essential libraries for model training, dataset handling, and evaluation
!pip install -q transformers datasets evaluate

In [None]:
# Suppress specific warning types for cleaner output
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Import standard data manipulation and numerical libraries
import pandas as pd
import numpy as np
# Import text preprocessing utilities
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Visualization tools
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
# Sklearn tools for feature extraction, evaluation, data splitting, and resampling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
# PyTorch and Hugging Face libraries for model development and evaluation
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Fetch the NLTK corpora that the preprocessing step relies on
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stopwords, stored as a set for fast membership tests
stopword_list = set(stopwords.words('english'))

# Lemmatizer instance shared by the preprocessing function
text_lemmatizer = WordNetLemmatizer()

def preprocess_text_input(text_input):
    """
    Clean a raw text string so it is ready for downstream NLP steps.

    The text is lowercased, every character that is not a-z or whitespace is
    removed, the result is split on whitespace, stopwords are dropped, and each
    remaining token is lemmatized. The surviving tokens are joined with single
    spaces.

    Parameters:
    text_input : str
        The raw input string to preprocess. Non-string values (for example the
        NaN/None that pandas uses for a missing description) are treated as
        empty text.

    Returns:
    str
        A cleaned and lemmatized version of the original text, or an empty
        string when the input is not a string.
    """
    # Missing values reach this function as float NaN or None when it is applied
    # to a DataFrame column; calling .lower() on them would raise AttributeError.
    if not isinstance(text_input, str):
        return ''
    lowercase_text = text_input.lower()
    # Keep only lowercase letters and whitespace (drops digits and punctuation)
    text_without_symbols = re.sub(r'[^a-z\s]', '', lowercase_text)
    word_tokens = text_without_symbols.split()
    filtered_tokens = [text_lemmatizer.lemmatize(token) for token in word_tokens if token not in stopword_list]
    return ' '.join(filtered_tokens)

In [None]:
# Location of the job-postings CSV, then load it straight into a DataFrame
JOB_DATASET_URL = "https://raw.githubusercontent.com/Hemanthtak2000/ResearchAssignment/refs/heads/main/Job_Dataset.csv"
job_dataset_df = pd.read_csv(JOB_DATASET_URL)

In [None]:
# Preview the first few records; a bare last expression uses the notebook's
# rich table rendering instead of the plain-text output of print()
job_dataset_df.head()

In [None]:
# Summarise the raw frame: column names, dtypes, and non-null counts per column
job_dataset_df.info()

In [None]:
# Map the source column names onto the names used in the rest of the notebook:
# 'fraudulent' becomes 'label' and 'Job_Desc' becomes 'text'
column_renames = {'fraudulent': 'label', 'Job_Desc': 'text'}
job_dataset_df = job_dataset_df.rename(columns=column_renames)
# Show the resulting column index to confirm the rename took effect
print(job_dataset_df.columns)

In [None]:
# Re-inspect the frame after the rename; the non-null counts show whether any column has missing values
job_dataset_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
job_dataset_df.describe()

In [None]:
# Report the dataset dimensions: each row is one job listing
numrow_jobdet, numcol_jobdet = job_dataset_df.shape
print(f"The shape of the original dataset is {numrow_jobdet} job listings with {numcol_jobdet} columns.")

In [None]:
# Per-column count of missing (NaN) entries
job_dataset_df.isna().sum()

In [None]:
# Character count of each job description. The vectorised .str accessor leaves a
# missing description as NaN, whereas applying the built-in len() would raise on it.
job_dataset_df['len_of_desc'] = job_dataset_df['text'].str.len()

In [None]:
# Run every job description through the cleaning function and keep the result
# alongside the raw text
raw_descriptions = job_dataset_df['text']
job_dataset_df['Processed_Desc'] = raw_descriptions.map(preprocess_text_input)
# Show the raw and cleaned descriptions side by side
job_dataset_df.loc[:, ['text', 'Processed_Desc']]

In [None]:
# Count fully duplicated rows in the job dataset
print(f"Number of duplicate rows in the Job Dataset : {job_dataset_df.duplicated().sum()}")

In [None]:
# List the columns now present, including the derived 'len_of_desc' and 'Processed_Desc'
job_dataset_df.columns

In [None]:
# Preview the first 5 rows of the frame after the derived columns were added
job_dataset_df.head()

In [None]:
# Tally how many records carry each class label
label_column = job_dataset_df['label']
Jobdet_cnt = label_column.value_counts()
print("Number of Records per Class:\n", Jobdet_cnt)

In [None]:
# Bar chart of the number of listings per class label
class_fig, class_ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=Jobdet_cnt.index, y=Jobdet_cnt.values, ax=class_ax)
class_ax.set_title("Distribution of Job Classification Labels")
class_ax.set_xlabel("Label (0 = Real, 1 = Fraudulent)")
class_ax.set_ylabel("Number of Job Listings")
class_ax.set_xticks([0, 1])
class_fig.tight_layout()
plt.show()

In [None]:
# Histogram (with KDE overlay) of description lengths measured in characters
length_fig, length_ax = plt.subplots(figsize=(8, 5))
sns.histplot(job_dataset_df['len_of_desc'], bins=30, kde=True, ax=length_ax)
length_ax.set_title("Histogram of Job Description Lengths")
length_ax.set_xlabel("Length of Description (in Characters)")
length_ax.set_ylabel("Number of Job Listings")
plt.show()

In [None]:
# Numeric columns whose pairwise correlation is of interest
Jobdesc_numcol = ['label', 'len_of_desc']
# Pairwise correlation matrix of those columns
Jobdesc_CM = job_dataset_df.loc[:, Jobdesc_numcol].corr()
# Render the matrix as an annotated heatmap
corr_fig, corr_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(Jobdesc_CM, annot=True, cmap='Blues', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Between Label and Description Length')
plt.show()

In [None]:
# Build a class-balanced copy of the FULL dataset by upsampling the minority class.
# NOTE: in the visible notebook this frame only feeds the distribution plot. It must
# not be passed to train_test_split: rows duplicated by upsampling would land on both
# sides of the split and leak training examples into the test set.

# Split the dataset into majority (label = 0) and minority (label = 1) classes
job_tgt_Major = job_dataset_df[job_dataset_df['label'] == 0]
job_tgt_Minor = job_dataset_df[job_dataset_df['label'] == 1]
# Sample the minority class with replacement until it matches the majority class size
job_tgt_Min_Up = resample(
    job_tgt_Minor,
    replace=True,
    n_samples=len(job_tgt_Major),
    random_state=1107
)
# Merge the upsampled minority class with the original majority class
Job_det_df_balanced = pd.concat([job_tgt_Major, job_tgt_Min_Up])
# Shuffle the rows so the two classes are interleaved, and renumber the index
Job_det_df_balanced = Job_det_df_balanced.sample(frac=1, random_state=1107).reset_index(drop=True)

In [None]:
# Count plot of the class labels in the upsampled frame
balanced_fig, balanced_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='label', data=Job_det_df_balanced, ax=balanced_ax)
balanced_ax.set_title("Balanced Class Distribution After Upsampling")
balanced_ax.set_xlabel("Class Label (0 = Real, 1 = Fraudulent)")
balanced_ax.set_ylabel("Number of Records")
plt.show()

In [None]:
# Hold out a stratified test set from the ORIGINAL data first, so the test set keeps
# the real class proportions and shares no duplicated rows with the training set.
Jondesc_Trn_df, Jondesc_Test_df = train_test_split(
    job_dataset_df, test_size=0.2, random_state=1107, stratify=job_dataset_df['label'])

# Then balance the classes in the TRAINING partition only. Without this step the
# model is fitted on the imbalanced data and the upsampling has no effect on training.
trn_majority_df = Jondesc_Trn_df[Jondesc_Trn_df['label'] == 0]
trn_minority_df = Jondesc_Trn_df[Jondesc_Trn_df['label'] == 1]
trn_minority_upsampled_df = resample(
    trn_minority_df,
    replace=True,
    n_samples=len(trn_majority_df),
    random_state=1107
)
# Recombine, shuffle so the classes are interleaved, and renumber the index
Jondesc_Trn_df = (
    pd.concat([trn_majority_df, trn_minority_upsampled_df])
    .sample(frac=1, random_state=1107)
    .reset_index(drop=True)
)

In [None]:
# Convert the training and testing DataFrames into Hugging Face Dataset objects.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra '__index_level_0__' column in the datasets.
Ds_Jobdesc_trn = Dataset.from_pandas(Jondesc_Trn_df, preserve_index=False)
Ds_Jobdesc_test = Dataset.from_pandas(Jondesc_Test_df, preserve_index=False)

In [None]:
# Checkpoint name used both for the tokenizer here and for the classification model later
LLM_Model_Select = "bert-base-uncased"
# Load the tokenizer that matches the selected checkpoint
Bert_JD_Tknzr = AutoTokenizer.from_pretrained(LLM_Model_Select)

In [None]:
# Batch tokenization helper built on the loaded BERT tokenizer
def tokenize_jobdesc_text(batch_data):
    """
    Run the loaded tokenizer over the "text" entries of a batch.

    Every sequence is padded to the tokenizer's maximum length and anything
    longer than that is truncated, so all examples end up the same length.

    Parameters:
    batch_data : dict
        A batch holding one or more text inputs under the "text" key.

    Returns:
    dict
        The tokenizer output for the batch (input IDs, attention masks, etc.).
    """
    job_texts = batch_data["text"]
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return Bert_JD_Tknzr(job_texts, **tokenizer_options)

# Tokenize both splits in batches, replacing each dataset with its tokenized version
Ds_Jobdesc_trn, Ds_Jobdesc_test = (
    split_ds.map(tokenize_jobdesc_text, batched=True)
    for split_ds in (Ds_Jobdesc_trn, Ds_Jobdesc_test)
)

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head sized for 2 labels
Finetuned_JD_Model = AutoModelForSequenceClassification.from_pretrained(LLM_Model_Select, num_labels=2)

In [None]:
# Per-device batch size used for both training and evaluation
Btch_jobdesc_Size = 8
# Log roughly once per pass over the training data. Clamp to at least 1 because the
# integer division yields 0 when the dataset is smaller than one batch, and a
# step-based logging interval of 0 is rejected by TrainingArguments.
Jobdesc_Log_Steps = max(1, len(Ds_Jobdesc_trn) // Btch_jobdesc_Size)
# Keep only the last path component of the model identifier (a no-op for names without "/")
LLM_Model_Select = LLM_Model_Select.split("/")[-1]

In [None]:
# Training configuration: output location, optimiser settings, batch sizes,
# mixed precision, logging cadence, and reporting
model_output_dir = f"{LLM_Model_Select}-Model"
# Half-precision training is switched on only when a CUDA device is available
use_fp16 = torch.cuda.is_available()
TrainingArga_JobDesc = TrainingArguments(
    output_dir=model_output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=Btch_jobdesc_Size,
    per_device_eval_batch_size=Btch_jobdesc_Size,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=use_fp16,
    logging_steps=Jobdesc_Log_Steps,
    push_to_hub=False,
    report_to="none"
)

In [None]:
# Accuracy metric object used by the metric function below
Jobdesc_AccuracyCal = evaluate.load("accuracy")

# Metric callback handed to the Trainer
def compute_text_classification_metrics(prediction_tuple):
    """
    Compute classification accuracy from raw model outputs.

    The logits are reduced to a predicted class per example by taking the
    argmax over the last dimension, and those predictions are scored against
    the true labels with the loaded accuracy metric.

    Parameters:
    prediction_tuple : tuple
        A pair of (raw output logits, true labels).

    Returns:
    dict
        The result of the accuracy metric's compute() call.
    """
    raw_logits, true_labels = prediction_tuple
    predicted_classes = torch.tensor(raw_logits).argmax(dim=-1)
    return Jobdesc_AccuracyCal.compute(
        predictions=predicted_classes,
        references=true_labels,
    )

In [None]:
# Gather everything the Trainer needs: the model, its training arguments,
# the train and evaluation datasets, and the metric callback
trainer_components = {
    "model": Finetuned_JD_Model,
    "args": TrainingArga_JobDesc,
    "train_dataset": Ds_Jobdesc_trn,
    "eval_dataset": Ds_Jobdesc_test,
    "compute_metrics": compute_text_classification_metrics,
}
Jobdesc_Trainer_det = Trainer(**trainer_components)
# Run fine-tuning; the training summary is displayed as the cell's output
Jobdesc_Trainer_det.train()

In [None]:
# Run evaluation on the dataset registered as eval_dataset (the test split)
Jobdesc_Model_Eval = Jobdesc_Trainer_det.evaluate()
# Print each reported metric to four decimal places
print("Model Evaluation Summary on Job Classification Task:")
for metric_name in Jobdesc_Model_Eval:
    print(f"{metric_name}: {Jobdesc_Model_Eval[metric_name]:.4f}")

In [None]:
# Generate predictions on the test split and reduce the logits to class labels
Jobdesc_Model_Pred = Jobdesc_Trainer_det.predict(Ds_Jobdesc_test)
Actual_Values = Jobdesc_Model_Pred.label_ids
Jobdesc_Pred_Values = torch.argmax(torch.tensor(Jobdesc_Model_Pred.predictions), dim=-1).numpy()

# Per-class precision / recall / F1. Accuracy alone is misleading when one class
# dominates, so report how well each class is recovered.
print(classification_report(
    Actual_Values,
    Jobdesc_Pred_Values,
    labels=[0, 1],
    target_names=["Real", "Fraudulent"],
    digits=4,
))

# Plot the confusion matrix of actual vs predicted labels. The label order is pinned
# so the matrix stays 2x2 even if one class is never predicted.
Eval_Jobdesc_ConfMat = confusion_matrix(Actual_Values, Jobdesc_Pred_Values, labels=[0, 1])
Disp_Jobdesc_Confmat = ConfusionMatrixDisplay(confusion_matrix=Eval_Jobdesc_ConfMat)
Disp_Jobdesc_Confmat.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix for Job Description Classification")
plt.show()

In [None]:
# Extract raw job descriptions from the test dataset
Jobdesc_testing_Samples = Ds_Jobdesc_test["text"]

# Display a mix of prediction results (some 0s and some 1s)
print("\nSample Predictions from the Test Dataset:\n")
max_per_class = 2  # Show up to 2 examples for each predicted class
shown_labels = {0: 0, 1: 0}
# Stop as soon as every class has reached its quota. The previous bound of 6 could
# never be reached (2 classes x 2 examples = 4), so the loop always scanned the
# entire prediction array.
total_to_show = max_per_class * len(shown_labels)

for i, pred_label in enumerate(Jobdesc_Pred_Values):
    if sum(shown_labels.values()) >= total_to_show:
        break

    if shown_labels[pred_label] < max_per_class:
        print(f"Text: {Jobdesc_testing_Samples[i]}")
        print(f"Predicted Label : {pred_label}, True Label: {Actual_Values[i]}")
        print("-" * 60)
        shown_labels[pred_label] += 1
