# Sentiment Analysis using BERT with Single Text and CSV Batch Processing

In [28]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from scipy.stats import wasserstein_distance
from sklearn.model_selection import train_test_split

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"./bert_tokenizer")
model = BertForSequenceClassification.from_pretrained(r"./bert_model")
model.eval()

# Function to make predictions
def predict_sentiment(input_data):
    inputs = tokenizer(input_data, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

# Choose input type: text or CSV
input_type = input("Enter 'text' for single text input or 'csv' for batch processing from a CSV file: ").strip().lower()

if input_type == 'text':
    # Single text input
    input_text = input("Enter your review: ")
    predicted_class = predict_sentiment(input_text)
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    print(f"Predicted sentiment: {sentiment} (Class: {predicted_class})")

    # Save the review to a CSV file in append mode
    review_records = [input_text]
    output_review_csv_path = r"./text_review.csv"
    review_df = pd.DataFrame(review_records, columns=['review'])
    review_df.to_csv(output_review_csv_path, mode='a', index=False, header=not pd.io.common.file_exists(output_review_csv_path))
    print(f"Review saved to {output_review_csv_path}.")

elif input_type == 'csv':
    # Batch processing from CSV
    input_csv_path = input("Enter the path to your CSV file: ").strip()
    reviews_df = pd.read_csv(input_csv_path)

    # Check for either 'review' or 'reviews' column
    review_column = 'review' if 'review' in reviews_df.columns else 'reviews' if 'reviews' in reviews_df.columns else None
    if not review_column:
        raise ValueError("The input CSV must contain either a 'review' or 'reviews' column.")

    # Initialize a list to store predictions
    predicted_labels = []
    review_records = []  # To store the reviews for CSV saving

    # Batch prediction
    for review in tqdm(reviews_df[review_column], desc="Processing reviews"):
        predicted_class = predict_sentiment(review)
        predicted_labels.append(predicted_class)
        review_records.append(review)  # Collect the review for saving

    # Add predictions to DataFrame and save
    reviews_df['predicted_label'] = predicted_labels
    reviews_df['predicted_sentiment'] = reviews_df['predicted_label'].apply(lambda x: 'Positive' if x == 1 else 'Negative')

    output_csv_path = r"csv_reviews.csv"
    reviews_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}.")

    # Save only the reviews to a CSV file in append mode
    output_review_csv_path = r".\csv_reviews.csv"
    review_df = pd.DataFrame(review_records, columns=['review'])
    review_df.to_csv(output_review_csv_path, mode='a', index=False, header=not pd.io.common.file_exists(output_review_csv_path))
    print(f"Reviews saved to {output_review_csv_path}.")

Enter 'text' for single text input or 'csv' for batch processing from a CSV file:  csv
Enter the path to your CSV file:  C:\Users\kumar_lf3uub3\Desktop\CloudSEK\PS-1\csv_testing_data.csv


Processing reviews: 100%|████████████████████████████████████████████████████████████| 989/989 [11:04<00:00,  1.49it/s]

Predictions saved to csv_reviews.csv.
Reviews saved to .\csv_reviews.csv.





# Data Drift Detection Between Training Data And New Data ALong With Confidence Scores

In [29]:
import pandas as pd
import torch
from tqdm import tqdm
from scipy.stats import wasserstein_distance

# Function to safely parse embeddings from strings
def parse_embedding(embedding_str):
    # Ensure the string is treated as a list of floats
    return list(map(float, embedding_str.strip("[]").split()))

# Function to generate embeddings
def generate_embeddings(reviews):
    embeddings = []
    for review in tqdm(reviews, desc="Generating embeddings"):
        inputs = tokenizer(review, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            embedding = logits.numpy().flatten()  # Flatten logits as embedding
        embeddings.append(embedding)
    return embeddings

# Load reviews from 'csv_reviews.csv'
input_csv_path = r"csv_reviews.csv"
review_df = pd.read_csv(input_csv_path)

# Check if there are at least 900 reviews
if len(review_df) >= 900:
    print("Generating embeddings and starting data drift calculation...")

    # Generate embeddings for reviews
    review_embeddings = generate_embeddings(review_df['review'].tolist())
    review_df['embedding'] = review_embeddings

    # Save updated 'csv_reviews.csv' with embeddings
    review_df.to_csv(input_csv_path, index=False)
    print(f"Embeddings added to existing file: {input_csv_path}")

    # Load historical embeddings
    historical_embeddings_csv_path = r"embeddings.csv"
    historical_embeddings_df = pd.read_csv(historical_embeddings_csv_path)

    # Parse stored embeddings from strings
    historical_embeddings = historical_embeddings_df['embedding'].apply(parse_embedding).tolist()
    current_embeddings = review_df['embedding'].tolist()

    # Calculate mean embeddings
    historical_mean_embedding = torch.mean(torch.tensor(historical_embeddings), dim=0).numpy()
    current_mean_embedding = torch.mean(torch.tensor(current_embeddings), dim=0).numpy()

    # Calculate Wasserstein Distance for drift detection
    drift_score = wasserstein_distance(historical_mean_embedding, current_mean_embedding)
    print(f"Drift Score: {drift_score}")

    # Optional: Threshold for drift detection
    threshold = 1
    if drift_score > threshold:
        print("Significant drift detected. Generating predictions for reviews.")

        # Initialize lists for predictions and confidence scores
        predicted_labels = []
        confidence_scores = []

        # Predict labels and confidence scores
        for review in tqdm(review_df['review'], desc="Predicting labels"):
            inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                confidence, predicted_class = torch.max(probabilities, dim=1)
                predicted_labels.append(predicted_class.item())
                confidence_scores.append(confidence.item())

        # Add labels and confidence scores to DataFrame
        review_df['predicted_label'] = predicted_labels
        review_df['confidence_score'] = confidence_scores

        # Filter by confidence score >= 0.8
        review_df = review_df[review_df['confidence_score'] >= 0.8]

        # Save the retrained model
        model.save_pretrained("./retrained_bert_model")
        tokenizer.save_pretrained("./retrained_bert_tokenizer")

        # Save the updated CSV
        review_df.to_csv(input_csv_path, index=False)
        print(f"Updated review predictions saved to {input_csv_path}.")
    else:
        print("No significant drift detected. No predictions needed.")
else:
    print(f"Number of reviews: {len(review_df)}. Waiting for at least 900 reviews.")

Generating embeddings and starting data drift calculation...


Generating embeddings: 100%|███████████████████████████████████████████████████████| 1978/1978 [21:36<00:00,  1.53it/s]


Embeddings added to existing file: csv_reviews.csv
Drift Score: 1.1968214809894562
Significant drift detected. Generating predictions for reviews.


Predicting labels: 100%|███████████████████████████████████████████████████████████| 1978/1978 [19:58<00:00,  1.65it/s]


Updated review predictions saved to csv_reviews.csv.


# Model Retraining Script Using New Data for Sentiment Analysis

In [30]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load data
data_path = "csv_reviews.csv"
data_df = pd.read_csv(data_path)

# Ensure the data has the necessary columns
assert 'review' in data_df.columns and 'predicted_label' in data_df.columns, "CSV must contain 'review' and 'predicted_label' columns."

# Split the data into training and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=42)

# Define custom Dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            review,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("./bert_tokenizer")
model = BertForSequenceClassification.from_pretrained("./bert_model")
model.train()

# Prepare datasets and dataloaders
train_dataset = ReviewDataset(train_df['review'].tolist(), train_df['predicted_label'].tolist(), tokenizer)
val_dataset = ReviewDataset(val_df['review'].tolist(), val_df['predicted_label'].tolist(), tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Epoch {epoch + 1} | Training Loss: {total_train_loss / len(train_loader)}")

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        print(f"Epoch {epoch + 1} | Validation Loss: {total_val_loss / len(val_loader)}")

# Train the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

# Save the retrained model and tokenizer
model.save_pretrained("./retrained_bert_model")
tokenizer.save_pretrained("./retrained_bert_tokenizer")
print("Retraining completed and model saved.")


Training Epoch 1: 100%|██████████████████████████████████████████████████████████████| 203/203 [23:43<00:00,  7.01s/it]


Epoch 1 | Training Loss: 0.0879277993966206


Validation: 100%|██████████████████████████████████████████████████████████████████████| 23/23 [00:47<00:00,  2.05s/it]


Epoch 1 | Validation Loss: 0.02114323859669916


Training Epoch 2: 100%|██████████████████████████████████████████████████████████████| 203/203 [23:44<00:00,  7.02s/it]


Epoch 2 | Training Loss: 0.00633560698748947


Validation: 100%|██████████████████████████████████████████████████████████████████████| 23/23 [00:48<00:00,  2.11s/it]


Epoch 2 | Validation Loss: 0.0020671287341468524


Training Epoch 3: 100%|██████████████████████████████████████████████████████████████| 203/203 [23:46<00:00,  7.03s/it]


Epoch 3 | Training Loss: 0.00182285575408906


Validation: 100%|██████████████████████████████████████████████████████████████████████| 23/23 [00:49<00:00,  2.13s/it]


Epoch 3 | Validation Loss: 0.0009471750729616084
Retraining completed and model saved.


In [31]:
# full code


# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from scipy.stats import wasserstein_distance
from sklearn.model_selection import train_test_split

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"./bert_tokenizer")
model = BertForSequenceClassification.from_pretrained(r"./bert_model")
model.eval()

# Function to make predictions
def predict_sentiment(input_data):
    inputs = tokenizer(input_data, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

# Choose input type: text or CSV
input_type = input("Enter 'text' for single text input or 'csv' for batch processing from a CSV file: ").strip().lower()

if input_type == 'text':
    # Single text input
    input_text = input("Enter your review: ")
    predicted_class = predict_sentiment(input_text)
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    print(f"Predicted sentiment: {sentiment} (Class: {predicted_class})")

    # Save the review to a CSV file in append mode
    review_records = [input_text]
    output_review_csv_path = r"./text_review.csv"
    review_df = pd.DataFrame(review_records, columns=['review'])
    review_df.to_csv(output_review_csv_path, mode='a', index=False, header=not pd.io.common.file_exists(output_review_csv_path))
    print(f"Review saved to {output_review_csv_path}.")

elif input_type == 'csv':
    # Batch processing from CSV
    input_csv_path = input("Enter the path to your CSV file: ").strip()
    reviews_df = pd.read_csv(input_csv_path)

    # Check for either 'review' or 'reviews' column
    review_column = 'review' if 'review' in reviews_df.columns else 'reviews' if 'reviews' in reviews_df.columns else None
    if not review_column:
        raise ValueError("The input CSV must contain either a 'review' or 'reviews' column.")

    # Initialize a list to store predictions
    predicted_labels = []
    review_records = []  # To store the reviews for CSV saving

    # Batch prediction
    for review in tqdm(reviews_df[review_column], desc="Processing reviews"):
        predicted_class = predict_sentiment(review)
        predicted_labels.append(predicted_class)
        review_records.append(review)  # Collect the review for saving

    # Add predictions to DataFrame and save
    reviews_df['predicted_label'] = predicted_labels
    reviews_df['predicted_sentiment'] = reviews_df['predicted_label'].apply(lambda x: 'Positive' if x == 1 else 'Negative')

    output_csv_path = r"csv_reviews.csv"
    reviews_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}.")

    # Save only the reviews to a CSV file in append mode
    output_review_csv_path = r".\csv_reviews.csv"
    review_df = pd.DataFrame(review_records, columns=['review'])
    review_df.to_csv(output_review_csv_path, mode='a', index=False, header=not pd.io.common.file_exists(output_review_csv_path))
    print(f"Reviews saved to {output_review_csv_path}.")
import pandas as pd
import torch
from tqdm import tqdm
from scipy.stats import wasserstein_distance

# Function to safely parse embeddings from strings
def parse_embedding(embedding_str):
    # Ensure the string is treated as a list of floats
    return list(map(float, embedding_str.strip("[]").split()))

# Function to generate embeddings
def generate_embeddings(reviews):
    embeddings = []
    for review in tqdm(reviews, desc="Generating embeddings"):
        inputs = tokenizer(review, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            embedding = logits.numpy().flatten()  # Flatten logits as embedding
        embeddings.append(embedding)
    return embeddings

# Load reviews from 'csv_reviews.csv'
input_csv_path = r"csv_reviews.csv"
review_df = pd.read_csv(input_csv_path)

# Check if there are at least 900 reviews
if len(review_df) >= 900:
    print("Generating embeddings and starting data drift calculation...")

    # Generate embeddings for reviews
    review_embeddings = generate_embeddings(review_df['review'].tolist())
    review_df['embedding'] = review_embeddings

    # Save updated 'csv_reviews.csv' with embeddings
    review_df.to_csv(input_csv_path, index=False)
    print(f"Embeddings added to existing file: {input_csv_path}")

    # Load historical embeddings
    historical_embeddings_csv_path = r"embeddings.csv"
    historical_embeddings_df = pd.read_csv(historical_embeddings_csv_path)

    # Parse stored embeddings from strings
    historical_embeddings = historical_embeddings_df['embedding'].apply(parse_embedding).tolist()
    current_embeddings = review_df['embedding'].tolist()

    # Calculate mean embeddings
    historical_mean_embedding = torch.mean(torch.tensor(historical_embeddings), dim=0).numpy()
    current_mean_embedding = torch.mean(torch.tensor(current_embeddings), dim=0).numpy()

    # Calculate Wasserstein Distance for drift detection
    drift_score = wasserstein_distance(historical_mean_embedding, current_mean_embedding)
    print(f"Drift Score: {drift_score}")

    # Optional: Threshold for drift detection
    threshold = 1
    if drift_score > threshold:
        print("Significant drift detected. Generating predictions for reviews.")

        # Initialize lists for predictions and confidence scores
        predicted_labels = []
        confidence_scores = []

        # Predict labels and confidence scores
        for review in tqdm(review_df['review'], desc="Predicting labels"):
            inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                confidence, predicted_class = torch.max(probabilities, dim=1)
                predicted_labels.append(predicted_class.item())
                confidence_scores.append(confidence.item())

        # Add labels and confidence scores to DataFrame
        review_df['predicted_label'] = predicted_labels
        review_df['confidence_score'] = confidence_scores

        # Filter by confidence score >= 0.8
        review_df = review_df[review_df['confidence_score'] >= 0.8]

        # Save the retrained model
        model.save_pretrained("./retrained_bert_model")
        tokenizer.save_pretrained("./retrained_bert_tokenizer")

        # Save the updated CSV
        review_df.to_csv(input_csv_path, index=False)
        print(f"Updated review predictions saved to {input_csv_path}.")
    else:
        print("No significant drift detected. No predictions needed.")
else:
    print(f"Number of reviews: {len(review_df)}. Waiting for at least 900 reviews.")
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load data
data_path = "csv_reviews.csv"
data_df = pd.read_csv(data_path)

# Ensure the data has the necessary columns
assert 'review' in data_df.columns and 'predicted_label' in data_df.columns, "CSV must contain 'review' and 'predicted_label' columns."

# Split the data into training and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=42)

# Define custom Dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            review,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("./bert_tokenizer")
model = BertForSequenceClassification.from_pretrained("./bert_model")
model.train()

# Prepare datasets and dataloaders
train_dataset = ReviewDataset(train_df['review'].tolist(), train_df['predicted_label'].tolist(), tokenizer)
val_dataset = ReviewDataset(val_df['review'].tolist(), val_df['predicted_label'].tolist(), tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Epoch {epoch + 1} | Training Loss: {total_train_loss / len(train_loader)}")

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        print(f"Epoch {epoch + 1} | Validation Loss: {total_val_loss / len(val_loader)}")

# Train the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

# Save the retrained model and tokenizer
model.save_pretrained("./retrained_bert_model")
tokenizer.save_pretrained("./retrained_bert_tokenizer")
print("Retraining completed and model saved.")