<a href="https://colab.research.google.com/github/Harishchandu12/LLM_ResearchMethods/blob/main/LLM_ResearchMethods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries for data processing, visualization, text analysis, and machine learning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Install required libraries for transformer models and latest scikit-learn
!pip install transformers -q
!pip install -U scikit-learn -q
# Import PyTorch and model evaluation utilities
import torch
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
# Import necessary components from Hugging Face Transformers and PyTorch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
# Seed NumPy and PyTorch so that a fresh "Restart & Run All" reproduces the same
# DataLoader shuffles, classifier-head initialisation and random samples.
# 7890 is the same seed the scikit-learn calls in this notebook use.
SEED = 7890
np.random.seed(SEED)
torch.manual_seed(SEED)
# Detect and assign the computing device: the GPU when CUDA is available, else the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Import resampling method to handle class imbalance
from sklearn.utils import resample

In [None]:
# URL of the women's clothing e-commerce review CSV (raw file hosted on GitHub)
Clothesdata = 'https://raw.githubusercontent.com/Harishchandu12/LLM_ResearchMethods/refs/heads/main/Womens%20Clothing%20E-Commerce%20Reviews.csv'
# Read the CSV directly from the URL into a DataFrame
Clothesdata_df = pd.read_csv(filepath_or_buffer=Clothesdata)
# Print the first five rows as plain text to preview columns and content
print(Clothesdata_df.head(n=5))

      Id  Clothing ID  Age                              Title  \
0  14746            0   26                                NaN   
1  22742            1   50    Love this under-all cami tank ?   
2  22743            1   36                       Staple tank!   
3  22749            1   24        Love but also very annoying   
4    876            2   28  Gorgeous top, straps way too long   

                                         Review Text  Rating  Recommended IND  \
0                                                NaN       5                1   
1  Originally i bought this in black and white. r...       5                1   
2  Love this tank. material and fit are great. lo...       5                1   
3  I love this tank, it is so comfortable but it ...       2                0   
4  I just adore this top! it is so comfy and styl...       4                1   

   Positive Feedback Count Division Name Department Name Class Name  
0                        0       General         Jac

In [None]:
# Display summary statistics for the numeric columns of the raw dataset
# (describe() is called with its defaults; the result is shown as the cell output)
Clothesdata_df.describe()

In [None]:
# Report how many reviews (rows) and fields (columns) the raw dataset contains
row_no = Clothesdata_df.shape[0]
col_no = Clothesdata_df.shape[1]
print(f"The shape of the dataset is {row_no} reviews with {col_no} columns.")

In [None]:
# Count the missing (NaN) values in each column of the raw dataset;
# the resulting per-column Series is shown as the cell output
Clothesdata_df.isna().sum()

In [None]:
# Print the data type and non-null count of each column of the raw dataset
# (info() writes its report directly to the output and returns None)
Clothesdata_df.info()

In [None]:
# Identifier, title and category columns that are not used in the analysis
cols_to_drop = ['Id', 'Clothing ID', 'Title', 'Department Name', 'Class Name']
# Drop them; errors='ignore' skips names that are already gone, so the cell can be re-run
Clothesdata_df = Clothesdata_df.drop(labels=cols_to_drop, axis=1, errors='ignore')

In [None]:
# Keep only rows whose 'Review Text' is present and not an empty string
has_review_text = Clothesdata_df['Review Text'].notna() & (Clothesdata_df['Review Text'] != '')
# Also drop rows with no 'Division Name', then take an explicit copy.
# New columns are assigned to this frame in later cells; without .copy() pandas
# (in versions without copy-on-write) may treat it as a slice of Clothesdata_df
# and emit SettingWithCopyWarning on those assignments.
Cloths_Df_PP = (
    Clothesdata_df[has_review_text]
    .dropna(subset=['Division Name'])
    .copy()
)

In [None]:
# Report the dataset size once rows with missing review text or division were removed
row_pp = Cloths_Df_PP.shape[0]
col_pp = Cloths_Df_PP.shape[1]
print(f"The shape of the dataset after pre-processing is {row_pp} reviews with {col_pp} columns.")

In [None]:
# Count the missing (NaN) values per column after the row filtering above,
# to confirm which columns still contain gaps
Cloths_Df_PP.isna().sum()

In [None]:
# Fetch the NLTK resources used below: the stopword lists and the WordNet data
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)
# Lemmatizer instance shared by every call to the cleaning function
lemmatizer_tool = WordNetLemmatizer()
# English stopwords, held in a set for fast membership tests
english_stopwords = stopwords.words('english')
stopword_list = set(english_stopwords)
# Function to clean and standardize review text
def clean_review_text(input_text):
    """
    Normalise one raw review into a lowercase, stopword-free, lemmatized string.

    Steps, in order:
    - Convert all characters to lowercase
    - Delete every character that is not a-z or whitespace (digits, punctuation,
      accented letters). Characters are removed, not replaced by a space, so
      "fit.love" becomes "fitlove"
    - Split the text on whitespace into tokens
    - Drop tokens found in the module-level ``stopword_list``
    - Lemmatize the remaining tokens with the module-level ``lemmatizer_tool``
    - Join the tokens back together with single spaces

    Args:
        input_text (str): Raw review text to be processed. Any non-string value
            (for example None, or the float NaN pandas uses for a missing
            review) is treated as an empty review.

    Returns:
        str: The cleaned review text; an empty string when nothing remains.
    """
    # Missing reviews arrive as None/NaN, which have no .lower(); return an
    # empty review instead of raising AttributeError inside .apply()
    if not isinstance(input_text, str):
        return ''
    # Convert to lowercase
    lower_text = input_text.lower()
    # Remove punctuation and digits (anything outside a-z and whitespace)
    stripped_text = re.sub(r'[^a-z\s]', '', lower_text)
    # Split into tokens
    token_list = stripped_text.split()
    # Remove stopwords and apply lemmatization
    # NOTE(review): NLTK's English stopword list contains negations such as
    # "not" and "no"; dropping them may discard sentiment signal - confirm intended.
    processed_tokens = [lemmatizer_tool.lemmatize(token) for token in token_list if token not in stopword_list]
    # Rejoin tokens into a string
    return ' '.join(processed_tokens)
# Run the cleaning function over every review and store the result in a new column
Cloths_Df_PP['Cloth_rev_PP'] = Cloths_Df_PP['Review Text'].map(clean_review_text)
# Show the raw and the cleaned text next to each other for a visual check
Cloths_Df_PP.loc[:, ['Review Text', 'Cloth_rev_PP']]

In [None]:
# Function to classify sentiment based on the numeric rating
def map_rating_to_sentiment(rating):
    """
    Translate a numeric review rating into a sentiment label.

    Args:
        rating (int or float): The review rating value

    Returns:
        str: 'Neutral' when the rating equals 3, 'Positive' when it is greater
             than 3, and 'Negative' for every other value (anything that is
             neither equal to nor greater than 3).
    """
    # The middle of the scale is handled first; everything else is decided by one comparison
    if rating == 3:
        return "Neutral"
    return "Positive" if rating > 3 else "Negative"
# Derive the sentiment label of every review from its rating and store it in a new column
Cloths_Df_PP['Sentiment'] = Cloths_Df_PP['Rating'].map(map_rating_to_sentiment)

In [None]:
# Store the character count of each raw review in a new column
Cloths_Df_PP['review_length'] = Cloths_Df_PP['Review Text'].map(len)

In [None]:
# Report how many fully identical rows exist, keep only the first occurrence of each,
# then report again to confirm that none are left
print(f"Number of duplicate rows before removal: {Cloths_Df_PP.duplicated().sum()}")
Cloths_Df_PP = Cloths_Df_PP.drop_duplicates(keep='first')
print(f"Number of duplicate rows after removal: {Cloths_Df_PP.duplicated().sum()}")

In [None]:
# Display the first five rows of the pre-processed dataset, including the
# columns added in the cells above
Cloths_Df_PP.head()

In [None]:
# Display column data types and non-null counts after preprocessing.
# info() prints its report itself and returns None, so it must not be wrapped
# in print() - that would append a stray "None" line to the output.
Cloths_Df_PP.info()

In [None]:
# List all column names of the pre-processed dataset (shown as the cell output)
Cloths_Df_PP.columns

In [None]:
# Count the reviews per sentiment label; value_counts() orders the labels by
# frequency, not by name
sentiment_counts = Cloths_Df_PP['Sentiment'].value_counts()
# Bind each colour to its label instead of to a bar position, so the colours stay
# correct whatever the frequency order of the classes is
sentiment_colors = {'Positive': 'green', 'Neutral': 'orange', 'Negative': 'red'}
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]
# Plot a bar chart to visualize the distribution of sentiment labels
sentiment_counts.plot(kind='bar', color=bar_colors)
# Set the chart title and axis labels
plt.title('Distribution of Sentiment Classes')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
# Display the plot
plt.show()

In [None]:
# Create a column holding the character count of each raw review
# (this adds a new column; no existing column is renamed)
Cloths_Df_PP['char_count_review'] = Cloths_Df_PP['Review Text'].map(len)
# Histogram of the review lengths; the number of bins follows the 'rice' rule
length_ax = Cloths_Df_PP['char_count_review'].hist(bins='rice', color='lightgreen')
# Title and axis labels
length_ax.set_title('Distribution of Review Length')
length_ax.set_xlabel('Review Length (Character Count)')
length_ax.set_ylabel('Frequency')
# Render the figure
plt.show()

In [None]:
# Numeric columns whose pairwise correlation is shown
num_fields_corr = ['Rating', 'char_count_review']
# Correlation matrix of those columns
corr_values_matrix = Cloths_Df_PP.loc[:, num_fields_corr].corr()
# Draw the matrix as an annotated heatmap
heatmap_ax = sns.heatmap(corr_values_matrix, annot=True, cmap='Blues')
# Title of the heatmap
heatmap_ax.set_title('Correlation Heatmap of Rating and Review Length')
# Render the figure
plt.show()

In [None]:
# Separate records by sentiment category
positive_reviews = Cloths_Df_PP[Cloths_Df_PP['Sentiment'] == 'Positive']
neutral_reviews = Cloths_Df_PP[Cloths_Df_PP['Sentiment'] == 'Neutral']
negative_reviews = Cloths_Df_PP[Cloths_Df_PP['Sentiment'] == 'Negative']
# Determine the count of the majority class
max_class_count = positive_reviews.shape[0]
# Upsample minority classes to match the majority class size
neutral_upsampled = resample(neutral_reviews, replace=True,
                             n_samples=max_class_count,random_state=7890)

negative_upsampled = resample(negative_reviews,replace=True,
                              n_samples=max_class_count,random_state=7890)
# Combine all classes into a balanced dataset
Cloths_Df_Balanced = pd.concat([positive_reviews, neutral_upsampled, negative_upsampled])
# Shuffle the rows to randomize class distribution
Cloths_Df_Balanced = Cloths_Df_Balanced.sample(frac=1, random_state=7890).reset_index(drop=True)
# Display the class distribution after balancing
print("Balanced Sentiment Class Distribution:")
print(Cloths_Df_Balanced['Sentiment'].value_counts())

In [None]:
# Split the balanced dataset into training and testing sets
Cloths_Df_Train, Cloths_Df_Test, cloths_df_Sen, cloths_df_Sen_Test = train_test_split(
    Cloths_Df_Balanced['Cloth_rev_PP'],Cloths_Df_Balanced['Sentiment'],
    test_size=0.2,random_state=7890,stratify=Cloths_Df_Balanced['Sentiment'],)
# Convert target labels to categorical type for model compatibility
cloths_df_Sen = cloths_df_Sen.astype('category')
cloths_df_Sen_Test = cloths_df_Sen_Test.astype('category')

In [None]:
# Load the pre-trained BERT tokenizer
Cloths_Tknzr = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenize and encode the training and testing text data; each call pads its
# texts to the longest sequence in that call and truncates over-long ones
cloths_train_encoded = Cloths_Tknzr(Cloths_Df_Train.tolist(), truncation=True, padding=True, return_tensors='pt')
cloths_test_encoded = Cloths_Tknzr(Cloths_Df_Test.tolist(), truncation=True, padding=True, return_tensors='pt')
# Encode sentiment labels into numeric codes and store in a new column
Cloths_Df_Balanced['Sentiment'] = Cloths_Df_Balanced['Sentiment'].astype('category')
Cloths_Df_Balanced['label'] = Cloths_Df_Balanced['Sentiment'].cat.codes
cloths_label_names = Cloths_Df_Balanced['Sentiment'].cat.categories.tolist()
# The integer codes below are only comparable if the train labels, the test labels
# and cloths_label_names list the same categories in the same order. Fail loudly
# instead of silently training or evaluating on shifted label ids.
for split_name, split_labels in (("train", cloths_df_Sen), ("test", cloths_df_Sen_Test)):
    if split_labels.cat.categories.tolist() != cloths_label_names:
        raise ValueError(
            f"Sentiment categories of the {split_name} labels "
            f"{split_labels.cat.categories.tolist()} do not match {cloths_label_names}")
# Convert sentiment labels to numeric format. The codes are created as int64
# ("long") tensors, the integer type the training loop casts labels to anyway.
cloths_train_labels = torch.tensor(cloths_df_Sen.cat.codes.to_numpy(), dtype=torch.long)
cloths_test_labels = torch.tensor(cloths_df_Sen_Test.cat.codes.to_numpy(), dtype=torch.long)
# Construct TensorDatasets for model training and evaluation
cloths_train_dataset = TensorDataset(cloths_train_encoded['input_ids'],
                                     cloths_train_encoded['attention_mask'],
                                     cloths_train_labels)
cloths_test_dataset = TensorDataset(cloths_test_encoded['input_ids'],
                                    cloths_test_encoded['attention_mask'],
                                    cloths_test_labels)
# Display the sentiment label mapping
print("Label mapping:", dict(enumerate(cloths_label_names)))

In [None]:
# One output unit per sentiment label
num_sentiment_labels = len(cloths_label_names)
# Pre-trained BERT with a sequence-classification head, moved to the selected device
cloths_sentiment_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_sentiment_labels)
cloths_sentiment_model = cloths_sentiment_model.to(device)
# Optimizer over all model parameters and the loss used by the training loop
cloths_bert_optimizer = AdamW(params=cloths_sentiment_model.parameters(), lr=1e-5)
cloths_bert_loss_fn = nn.CrossEntropyLoss()

In [None]:
# Batch the datasets: shuffled batches of 8 for training, fixed-order batches of 8 for testing
cloths_train_loader = DataLoader(cloths_train_dataset, batch_size=8, shuffle=True)
cloths_test_loader = DataLoader(cloths_test_dataset, batch_size=8)
# Per-epoch history: training accuracy, test accuracy and mean training loss
cloths_train_accuracies, cloths_test_accuracies, cloths_epoch_losses = [], [], []
# Train for 10 epochs, measuring accuracy on the test set after each one
for epoch in range(10):
    # ---- training pass ----
    cloths_sentiment_model.train()
    # Running totals for this epoch
    cloths_epoch_train_loss = 0
    cloths_epoch_train_correct = 0
    cloths_epoch_train_total = 0
    train_progress = tqdm(cloths_train_loader, desc=f"Epoch {epoch+1} - Training")
    for batch in train_progress:
        # Move the three batch tensors to the device; labels are cast to long for the loss
        cloths_input_ids, cloths_attention_mask, cloths_batch_labels = (t.to(device) for t in batch)
        cloths_batch_labels = cloths_batch_labels.long()
        # Reset gradients left over from the previous step
        cloths_bert_optimizer.zero_grad()
        # Forward pass and loss
        train_logits = cloths_sentiment_model(input_ids=cloths_input_ids,
                                              attention_mask=cloths_attention_mask).logits
        btch_loss = cloths_bert_loss_fn(train_logits, cloths_batch_labels)
        # Backward pass and parameter update
        btch_loss.backward()
        cloths_bert_optimizer.step()
        # Update the running loss and the correct/total counters
        cloths_epoch_train_loss += btch_loss.item()
        cloths_batch_preds = train_logits.argmax(dim=1)
        cloths_epoch_train_correct += (cloths_batch_preds == cloths_batch_labels).sum().item()
        cloths_epoch_train_total += cloths_batch_labels.size(0)

    # Record the epoch's training accuracy and mean loss per batch
    epoch_train_acc = cloths_epoch_train_correct / cloths_epoch_train_total
    cloths_train_accuracies.append(epoch_train_acc)
    cloths_epoch_losses.append(cloths_epoch_train_loss / len(cloths_train_loader))
    # ---- evaluation pass ----
    cloths_sentiment_model.eval()
    cloths_epoch_test_correct = 0
    cloths_epoch_test_total = 0
    # No gradients are needed while measuring accuracy
    with torch.no_grad():
        for batch in cloths_test_loader:
            cloths_input_ids, cloths_attention_mask, cloths_batch_labels = (t.to(device) for t in batch)
            cloths_batch_labels = cloths_batch_labels.long()
            test_logits = cloths_sentiment_model(input_ids=cloths_input_ids,
                                                 attention_mask=cloths_attention_mask).logits
            cloths_batch_preds = test_logits.argmax(dim=1)
            cloths_epoch_test_correct += (cloths_batch_preds == cloths_batch_labels).sum().item()
            cloths_epoch_test_total += cloths_batch_labels.size(0)
    # Record the epoch's test accuracy
    epoch_test_acc = cloths_epoch_test_correct / cloths_epoch_test_total
    cloths_test_accuracies.append(epoch_test_acc)
    # Summary line for the epoch
    print(f"Epoch {epoch+1}: Train Acc = {epoch_train_acc:.4f}, Test Acc = {epoch_test_acc:.4f}, Loss = {cloths_epoch_train_loss / len(cloths_train_loader):.4f}")

In [None]:
# Epoch numbers 1..N for the x axis, one per recorded training accuracy
cloths_epoch_list = [epoch_no for epoch_no in range(1, len(cloths_train_accuracies) + 1)]
# Draw training and testing accuracy on one set of axes
accuracy_fig, accuracy_ax = plt.subplots(figsize=(10, 6))
accuracy_ax.plot(cloths_epoch_list, cloths_train_accuracies, marker='o', label='Training Accuracy')
accuracy_ax.plot(cloths_epoch_list, cloths_test_accuracies, marker='o', label='Testing Accuracy')
# Title, axis labels and one tick per epoch
accuracy_ax.set_title("Training vs Testing Accuracy Over Epochs")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.set_xticks(cloths_epoch_list)
accuracy_ax.legend()
accuracy_ax.grid(True)
plt.show()

In [None]:
# Put the model in evaluation mode before the final evaluation
cloths_sentiment_model.eval()
# Loader over the test dataset: batches of 8 in fixed order
cloths_test_loader = DataLoader(cloths_test_dataset, batch_size=8)
# Predicted and true label ids, collected batch by batch
cloths_final_preds, cloths_final_true = [], []
# Inference only - no gradients are tracked
with torch.no_grad():
    for batch in cloths_test_loader:
        cloths_input_ids, cloths_attention_mask, cloths_batch_labels = (t.to(device) for t in batch)
        cloths_outputs = cloths_sentiment_model(input_ids=cloths_input_ids,
                                                attention_mask=cloths_attention_mask)
        # The predicted class is the index of the largest logit
        cloths_preds = cloths_outputs.logits.argmax(dim=1)
        cloths_final_preds += list(cloths_preds.cpu().numpy())
        cloths_final_true += list(cloths_batch_labels.cpu().numpy())
# Overall accuracy and the per-class classification report
cloths_test_acc = accuracy_score(cloths_final_true, cloths_final_preds)
print(f"Test Accuracy: {cloths_test_acc:.2f}")
cloths_report_text = classification_report(cloths_final_true, cloths_final_preds,
                                           target_names=cloths_label_names)
print("\nClassification Report:\n", cloths_report_text)

In [None]:
# Confusion matrix of true versus predicted label ids from the final evaluation
cloths_conf_matrix = confusion_matrix(y_true=cloths_final_true, y_pred=cloths_final_preds)
# Render it with the sentiment names on both axes
cloths_cm_display = ConfusionMatrixDisplay(confusion_matrix=cloths_conf_matrix,
                                           display_labels=cloths_label_names)
cloths_cm_display.plot(cmap="Blues")
cloths_cm_display.ax_.set_title("Confusion Matrix for Sentiment Prediction")
plt.show()

In [None]:
# Make sure the model is in evaluation mode; do not rely on an earlier cell having set it
cloths_sentiment_model.eval()
# Select 5 random samples from the test set for comparison (or every row when the
# test set has fewer than 5). A locally seeded generator makes the selection
# identical on every run.
cloths_sample_rng = np.random.default_rng(7890)
cloths_sample_indices = cloths_sample_rng.choice(
    len(Cloths_Df_Test), size=min(5, len(Cloths_Df_Test)), replace=False)
# Convert test labels to category codes
cloths_df_Sen_Test_cat = cloths_df_Sen_Test.cat.codes
# Loop through each selected sample and compare predictions
for idx in cloths_sample_indices:
    # Text and label are looked up by position, so both refer to the same test row
    cloths_sample_text = Cloths_Df_Test.iloc[idx]
    true_label_code = cloths_df_Sen_Test_cat.iloc[idx]
    true_label = cloths_label_names[true_label_code]
    # Tokenize the review text for inference
    cloths_sample_inputs = Cloths_Tknzr(cloths_sample_text, return_tensors='pt', truncation=True, padding=True).to(device)
    # Predict sentiment using the trained model
    with torch.no_grad():
        cloths_sample_outputs = cloths_sentiment_model(**cloths_sample_inputs)
    cloths_predicted_label = cloths_label_names[torch.argmax(cloths_sample_outputs.logits, dim=1).item()]
    # Print the review, true label, and predicted label
    print("\nReview Text:", cloths_sample_text)
    print("Actual Sentiment   :", true_label)
    print("Predicted Sentiment:", cloths_predicted_label)