In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated list of `<directory>:<percent-encoded download URL>` entries.
# NOTE(review): the URL is a signed link (X-Goog-Date=20240423T213545Z,
# X-Goog-Expires=259200), so it has presumably expired — regenerate it from
# Kaggle before relying on this cell.
DATA_SOURCE_MAPPING = 'amazon-product-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4725197%2F8019295%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240423%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240423T213545Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1d3206684062f33cc8383a82dd1037707498acac06eacd292333c4d8158790c4d96fa98e3830dc885dd361b268ec578b4b6f7281e8a0433cb068df21cd50e190f4a1a42457c56abefc005d712188521608cf3470bf8d863dbaa52ddcfe3f41fc7d77cecd1ce80e90ecf31640242c58f7d0f21965ea009ade01cbfaad54ba0b9646b431a595b6a5b3fea57da5478786155f6b16f6c09e9dac1b830170ee530b163872d5eaff102564a8900493bf816198ffe6638af3cc437887f6679b162fe1c1e127bc355df09fc0efbc6b287b717a29fb4a385f30bee4e088a52296202a5f43ccac994b268022b7d3a140985d2569c6d6e0c11baae26721ea8678eeeba5693b'

# Root directory the archives are extracted into (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. A source that fails to download is
# reported and skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header may be missing (e.g. chunked responses); fall back to
            # 0, which simply disables the progress bar fill.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every byte is on disk and rewind before the archive
            # readers consume the same file object.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: the original `as tarfile`
                # overwrote the imported `tarfile` module.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:

# Read the ratings CSV (it has no header row) and name its four columns.
df = pd.read_csv(
    '/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv',
    header=None,
    names=['user_id', 'prod_id', 'rating', 'timestamp'],
)

In [None]:

# Report the (rows, columns) of the dataframe as loaded.
print(f" DataFrame Shape: {df.shape}")

 DataFrame Shape: (7824482, 4)


In [None]:
# Keep only the first SAMPLE_ROWS rows so the rest of the notebook runs quickly.
# Selecting from the front (instead of slicing off a hard-coded number of
# trailing rows) gives the same 100 rows on the full file and stays correct
# when the cell is re-run. `.copy()` makes the result an independent frame, so
# adding columns to it later does not raise SettingWithCopyWarning.
SAMPLE_ROWS = 100
df = df.iloc[:SAMPLE_ROWS].copy()

# Check the shape of the reduced dataframe
print("Reduced DataFrame Shape:", df.shape)


Reduced DataFrame Shape: (100, 4)


In [None]:

# Report the (rows, columns) of the dataframe once more after the reduction.
print(f" DataFrame Shape: {df.shape}")

 DataFrame Shape: (100, 4)


In [None]:

# Preprocess the dataset: the rating (as a string) becomes the model's input
# text and the product id becomes the label. `assign` builds a new frame, so
# nothing is written into a slice of the original dataframe (which would raise
# SettingWithCopyWarning).
df = df.assign(
    text=df['rating'].astype(str),
    label=df['prod_id'],
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:

# The classes are the distinct labels seen in the training split, in order of
# first appearance.
class_labels = train_df['label'].drop_duplicates().tolist()

In [None]:
# Fetch the WordPiece tokenizer that matches the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
# Tokenize and encode the text data
def tokenize_text(df, tokenizer, max_length):
    """Encode the ``text`` column of ``df`` into fixed-width tensors.

    Args:
        df: DataFrame with a ``text`` column; every value is passed through
            ``str()`` before encoding.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_length: Width every encoded sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each the row-wise
        concatenation of the per-text tensors returned by the tokenizer. For
        an empty ``df`` both are empty ``(0, max_length)`` long tensors.
    """
    input_ids = []
    attention_masks = []

    for text in df['text']:
        encoded_dict = tokenizer.encode_plus(
            str(text),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation a text longer than max_length yields a wider
            # tensor and the concatenation below fails.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # torch.cat rejects an empty list, so handle "no rows" explicitly.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Shared hyper-parameters for encoding and batching.
max_length = 128  # Maximum sequence length
batch_size = 32   # Batch size for training

# Encode the training split first, then the test split, with identical settings.
(train_inputs, train_masks), (test_inputs, test_masks) = (
    tokenize_text(split_df, tokenizer, max_length)
    for split_df in (train_df, test_df)
)


In [None]:
from tqdm import tqdm  # Import tqdm for progress bar

def tokenize_text(df, tokenizer, max_length):
    """Encode the ``text`` column of ``df`` into fixed-width tensors, with a progress bar.

    Args:
        df: DataFrame with a ``text`` column; every value is passed through
            ``str()`` before encoding.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_length: Width every encoded sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each the row-wise
        concatenation of the per-text tensors returned by the tokenizer. For
        an empty ``df`` both are empty ``(0, max_length)`` long tensors.
    """
    input_ids = []
    attention_masks = []

    # Iterating over tqdm advances and closes the bar automatically, even if
    # encoding raises part-way through.
    for text in tqdm(df['text'], total=len(df), desc="Tokenizing Text"):
        encoded_dict = tokenizer.encode_plus(
            str(text),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation a text longer than max_length yields a wider
            # tensor and the concatenation below fails.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # torch.cat rejects an empty list, so handle "no rows" explicitly.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks


In [None]:
# Encode the training split first, then the test split, using the current
# tokenize_text definition, tokenizer and max_length.
(train_inputs, train_masks), (test_inputs, test_masks) = (
    tokenize_text(split_df, tokenizer, max_length)
    for split_df in (train_df, test_df)
)


Tokenizing Text: 100%|██████████| 80/80 [00:00<00:00, 3104.36it/s]
Tokenizing Text: 100%|██████████| 20/20 [00:00<00:00, 2861.83it/s]


In [None]:
# Convert labels to PyTorch tensors, handling unknown labels.
# A dict gives constant-time lookup per row instead of scanning `class_labels`
# twice (`in` + `.index`) for every label.
label_to_index = {label: index for index, label in enumerate(class_labels)}


def _encode_labels(labels):
    """Map each label to its position in ``class_labels``; unknown labels become -1."""
    return torch.tensor(
        [label_to_index.get(label, -1) for label in labels],
        dtype=torch.long,
    )


# Labels that only occur in the test split are not in `class_labels` and are
# therefore encoded as -1.
train_labels = _encode_labels(train_df['label'])
test_labels = _encode_labels(test_df['label'])


In [None]:
# Create DataLoader for training and testing sets
def _build_loader(inputs, masks, labels, sampler_cls):
    """Bundle the tensors into a dataset and wrap it with the given sampler type."""
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn in random order ...
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler
)

# ... while the test set is walked in its stored order.
test_data, test_sampler, test_dataloader = _build_loader(
    test_inputs, test_masks, test_labels, SequentialSampler
)


In [None]:
# Instantiate BERT with a classification head sized to the number of classes;
# attentions and hidden states are not returned.
bert_classifier_options = dict(
    num_labels=len(class_labels),
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', **bert_classifier_options)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimizer plus a linear learning-rate decay (no warm-up) that reaches zero
# after `total_steps` optimizer steps.
epochs = 4  # Number of training epochs
total_steps = epochs * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)




In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer
from tqdm import tqdm

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# The LR scheduler was sized for `epochs` passes over the data; train for the
# same number so the schedule actually runs to completion.
total_epochs = epochs

for epoch in range(total_epochs):
    model.train()
    total_train_loss = 0

    # Integrate with tqdm for progress bar
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{total_epochs}', leave=False)

    for step, batch in enumerate(progress_bar, start=1):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        # Passing labels makes the model return the classification loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Running mean over the batches processed so far (dividing by the
        # total batch count under-reports the loss until the last batch).
        progress_bar.set_postfix({'Training loss': total_train_loss / step})

    # Guard against an empty dataloader to avoid a ZeroDivisionError.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    print(f'Epoch {epoch + 1}:')
    print(f'  Training loss: {avg_train_loss:.2f}')

print("Training completed!")


                                                                            

Epoch 1:
  Training loss: 2.95


                                                                             

Epoch 2:
  Training loss: 2.79


                                                                            

Epoch 3:
  Training loss: 2.71
Training completed!




In [None]:

# Run the fine-tuned model over the test set and collect, per example, the raw
# logits and the true label index.
model.eval()

predictions, true_labels = [], []

with torch.no_grad():
    for test_input_ids, test_input_mask, test_batch_labels in test_dataloader:
        eval_outputs = model(
            test_input_ids.to(device),
            token_type_ids=None,
            attention_mask=test_input_mask.to(device),
        )

        # Move results to host memory as NumPy arrays; extending the lists
        # adds one entry per example.
        predictions.extend(eval_outputs.logits.detach().cpu().numpy())
        true_labels.extend(test_batch_labels.to('cpu').numpy())

In [None]:
# Calculate accuracy
# `predictions` holds one logit vector per example after evaluation. Convert it
# to class indices only while it is still 2-D, so re-running this cell does not
# fail by taking an argmax over already-reduced predictions.
logits_or_classes = np.asarray(predictions)
if logits_or_classes.ndim == 2:
    predictions = logits_or_classes.argmax(axis=1)
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.3000


In [None]:
# Load the T5 tokenizer and model used for text generation.
tokenizer_t5 = T5Tokenizer.from_pretrained('t5-base')
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Define problematic class labels
problematic_labels = ['0594296420', '0439886341']

# Generate summaries for each predicted class
for class_label in class_labels:
    if class_label in problematic_labels:
        continue  # Skip problematic class labels

    # Filter test data for the current class label
    test_data_class = test_df[test_df['label'] == class_label]['text'].tolist()

    # Check if test data for the current class label is empty
    if not test_data_class:
        continue  # Skip if empty

    # Tokenize and encode the text data for T5 input
    encoded_t5 = tokenizer_t5.batch_encode_plus(
        test_data_class,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='longest',
    ).to(device)
    input_ids_t5 = encoded_t5.input_ids

    # Generate summaries. The batch is padded to its longest member, so the
    # attention mask must be passed along; otherwise the model attends to the
    # padding tokens of the shorter inputs.
    with torch.no_grad():
        output = model_t5.generate(
            input_ids=input_ids_t5,
            attention_mask=encoded_t5.attention_mask,
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )

    # Decode the generated summaries
    summaries = [tokenizer_t5.decode(summary, skip_special_tokens=True) for summary in output]

    # Print the summaries
    print(f"Class Label: {class_label}")
    for summary in summaries:
        print(summary)
    print("-----------------------------------------------------")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Class Label: 0594033896
4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0
-----------------------------------------------------
Class Label: 0594451647
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
-----------------------------------------------------
Class Label: 0528881469
5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0
2.0, 2.0 (2.02.0, 2.0 2.0 ( 2.0) 2.0 ( 2.0 2.0 ( 2.0) 2.0 ( 2.0) 2.0 ( 2.0 ( 2.0) 2.0 ( 2.0 2.0 2.0) 2.0 ( 2.0 ( 2.0 2.0 2.0) 2.0 ( 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 