In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Load the dataset. The CSV is read with header=None, so the column names are
# assigned explicitly right after loading.
df = pd.read_csv('/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv', header=None)
df.columns = ['user_id', 'prod_id', 'rating', 'timestamp']

# Keep only the leading rows: everything except the last ROWS_TO_DROP rows is
# retained, so the sample size depends on the length of the file.
# .copy() detaches the sample from the full frame; without it the column
# assignments below write into a slice, which pandas flags with
# SettingWithCopyWarning.
ROWS_TO_DROP = 7824382
df = df[:-ROWS_TO_DROP].copy()
if df.empty:
    raise ValueError(
        f"No rows left after dropping the last {ROWS_TO_DROP} rows; "
        "the input file is shorter than expected."
    )

# Preprocess the dataset: the model input text is the rating rendered as a
# string, and the classification target is the product id.
df['text'] = df['rating'].astype(str)
df['label'] = df['prod_id']

# Split the dataset into training and testing sets (fixed seed for reproducibility)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the class labels. Only labels present in the training split are
# listed, so the test split may contain labels that are not in this list.
class_labels = train_df['label'].unique().tolist()

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode the text data for classification
def tokenize_text(df, tokenizer, max_length):
    """Encode the ``text`` column of ``df`` into fixed-length tensors.

    Args:
        df: DataFrame with a ``text`` column; each value is passed through
            ``str()`` before encoding.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors of shape (1, length).
        max_length: Length every encoded sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors, one row per input
        text. For an empty ``df`` both tensors have shape ``(0, max_length)``.
    """
    input_ids = []
    attention_masks = []

    for text in df['text']:
        encoded_dict = tokenizer.encode_plus(
            str(text),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation an over-long text yields a longer tensor and
            # the torch.cat below fails on mismatched shapes.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # torch.cat rejects an empty list, so return correctly shaped empty tensors.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

max_length = 128  # Maximum sequence length
batch_size = 32   # Batch size for training

# Tokenize and encode training and testing data for classification
train_inputs, train_masks = tokenize_text(train_df, tokenizer, max_length)
test_inputs, test_masks = tokenize_text(test_df, tokenizer, max_length)

# Convert labels to PyTorch tensors, handling unknown labels.
# The label -> index mapping is built once so each row is a constant-time
# lookup instead of two linear scans of class_labels. Labels that are not in
# class_labels are encoded as -1.
label_to_index = {label: index for index, label in enumerate(class_labels)}
train_labels = torch.tensor([label_to_index.get(label, -1) for label in train_df['label']])
test_labels = torch.tensor([label_to_index.get(label, -1) for label in test_df['label']])

# Create DataLoader for training and testing sets.
# Training batches are drawn in random order; test batches keep dataset order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Load the pre-trained BERT model for sequence classification
model_classification = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(class_labels),  # Number of output classes
    output_attentions=False,
    output_hidden_states=False,
)

# Single source of truth for the number of training epochs. The scheduler and
# the training loop must agree on it: previously the schedule was sized for 4
# epochs while the loop ran 3, so the linear decay never completed.
# NOTE(review): 3 is the count the loop actually ran; confirm it is the intended one.
total_epochs = 3
epochs = total_epochs  # Alias kept for code that still reads `epochs`

# Define optimizer and learning rate scheduler
# NOTE(review): recent transformers releases deprecate this AdamW in favour of
# torch.optim.AdamW; confirm against the installed version.
optimizer = AdamW(model_classification.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is the total
# number of batches across all epochs.
total_steps = len(train_dataloader) * total_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the classification model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_classification.to(device)

for epoch in range(total_epochs):
    model_classification.train()
    total_train_loss = 0

    # Integrate with tqdm for progress bar
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{total_epochs}', leave=False)

    # `step` counts the batches processed so far in this epoch (1-based).
    for step, batch in enumerate(progress_bar, start=1):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model_classification.zero_grad()

        outputs = model_classification(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model_classification.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Update progress bar with the mean loss over the batches seen so far
        # (dividing by the total batch count would under-report it mid-epoch).
        progress_bar.set_postfix({'Training loss': total_train_loss / step})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}:')
    print(f'  Training loss: {avg_train_loss:.2f}')

print("Training completed!")

# Evaluate the classification model on the test set
model_classification.eval()

predictions = []
true_labels = []

# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    for ids_batch, mask_batch, label_batch in test_dataloader:
        result = model_classification(
            ids_batch.to(device),
            token_type_ids=None,
            attention_mask=mask_batch.to(device),
        )

        # Collect per-example logits and the matching reference labels on the CPU.
        predictions.extend(result.logits.detach().cpu().numpy())
        true_labels.extend(label_batch.to('cpu').numpy())

# Calculate accuracy: the predicted class is the index of the largest logit.
predictions = np.argmax(predictions, axis=1)
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')

# Load the T5 tokenizer and model for summarization
tokenizer_summarization = T5Tokenizer.from_pretrained('t5-base')
model_summarization = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Generate summaries for each class label
for class_label in class_labels:
    # Filter test data for the current class label
    test_data_class = test_df[test_df['label'] == class_label]['text'].tolist()

    # class_labels comes from the training split, so a label may have no test
    # rows at all; there is nothing to encode or summarize in that case.
    if not test_data_class:
        continue

    # Tokenize and encode the text data for T5 input. T5 selects its task from
    # a text prefix, so each input is marked as a summarization request.
    encoded_t5 = tokenizer_summarization.batch_encode_plus(
        ['summarize: ' + text for text in test_data_class],
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='longest',
    )
    input_ids_t5 = encoded_t5.input_ids.to(device)
    # The batch is padded to the longest input, so the mask is passed along to
    # keep generation from attending to pad tokens.
    attention_mask_t5 = encoded_t5.attention_mask.to(device)

    # Generate summaries
    with torch.no_grad():
        output = model_summarization.generate(input_ids=input_ids_t5,
                                              attention_mask=attention_mask_t5,
                                              max_length=150,
                                              num_beams=2,
                                              early_stopping=True)

    # Decode the generated summaries
    summaries = [tokenizer_summarization.decode(summary, skip_special_tokens=True) for summary in output]

    # Print the class label and its corresponding summaries
    print(f"Class Label: {class_label}")
    for summary in summaries:
        print(summary)
    print("-----------------------------------------------------")


In [None]:
# Load the T5 tokenizer and model for summarization
tokenizer_summarization = T5Tokenizer.from_pretrained('t5-base')
model_summarization = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Generate summaries for each class label
for class_label in class_labels:
    # Filter test data for the current class label
    test_data_class = test_df[test_df['label'] == class_label]['text'].tolist()

    # A label may have no rows in the test split; skip it explicitly rather
    # than relying on the encoder to fail on empty input.
    if not test_data_class:
        continue

    try:
        # Tokenize and encode the text data for T5 input. T5 selects its task
        # from a text prefix, so each input is marked as a summarization request.
        encoded_t5 = tokenizer_summarization.batch_encode_plus(
            ['summarize: ' + text for text in test_data_class],
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='longest',
        )
        input_ids_t5 = encoded_t5.input_ids.to(device)
        # The batch is padded to the longest input, so the mask is passed along
        # to keep generation from attending to pad tokens.
        attention_mask_t5 = encoded_t5.attention_mask.to(device)

        # Generate summaries
        with torch.no_grad():
            output = model_summarization.generate(input_ids=input_ids_t5,
                                                  attention_mask=attention_mask_t5,
                                                  max_length=150,
                                                  num_beams=2,
                                                  early_stopping=True)

        # Decode the generated summaries
        summaries = [tokenizer_summarization.decode(summary, skip_special_tokens=True) for summary in output]

        # Print the class label and its corresponding summaries
        print(f"Class Label: {class_label}")
        for summary in summaries:
            print(summary)
        print("-----------------------------------------------------")
    except ValueError as error:
        # Still best-effort (one bad class does not stop the loop), but the
        # failure is reported instead of being swallowed silently.
        print(f"Skipping class label {class_label}: {error}")
        continue