In [19]:
import torch
from transformers import BertModel, BertTokenizer
import json
import pandas as pd
import gzip
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [21]:
import json
import pandas as pd

# Load the dataset
data = []
with open('/kaggle/input/amazon-fashion/AMAZON_FASHION_5.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [23]:
df = df[df["reviewText"].apply(lambda x: isinstance(x, str))]

In [24]:
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3171,5.0,True,"07 2, 2018",A2077NII5H62R2,B005AGO4LU,"{'Size:': ' 8.5 B(M) US', 'Color:': ' Green Gl...",Amazon Customer,Perfect fit!,Five Stars,1530489600,,
3172,5.0,True,"06 28, 2018",A2IBS6PIPAGAB5,B005AGO4LU,"{'Size:': ' 5 B(M) US', 'Color:': ' Wolf Grey/...",J. Avila,My favorite cross trainers!,Comfortable,1530144000,,
3173,5.0,True,"06 25, 2018",A1GTC5EVSJNCQ8,B005AGO4LU,"{'Size:': ' 8 B(M) US', 'Color:': ' Blue Tint/...",Amazon Customer,Love them fit perfect,Five Stars,1529884800,,
3174,5.0,True,"06 20, 2018",A311XHHLM12MUT,B005AGO4LU,"{'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...",Peter,Favorite Nike shoe ever! The flex sole is exce...,Love them!,1529452800,,


In [25]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to tokenize the text
def tokenize_texts(texts, max_length=128):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

input_texts = df["reviewText"].tolist()

# Tokenize the reviewText column
tokens = tokenize_texts(input_texts)

input_ids = tokens['input_ids']
attention_masks = tokens['attention_mask']

# Convert labels to tensor
labels = torch.tensor(df['overall'].apply(lambda x: 1 if x >= 4 else 0).values)

In [29]:
missing_reviews = df['reviewText'].isnull().sum()
print(f"Number of missing reviews: {missing_reviews}")

# Check for non-string types in the reviewText column
non_string_reviews = df[~df['reviewText'].apply(lambda x: isinstance(x, str))]
print(f"Number of non-string reviews: {len(non_string_reviews)}")

Number of missing reviews: 0
Number of non-string reviews: 0


In [31]:
# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move the model to the appropriate device
model.to(device)

# Create a TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [33]:
# Define the DataLoader for training and validation sets
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [35]:
# Define the optimizer and the learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define the loss function
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Load data to the appropriate device
        batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)
        
        # Zero out gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        
        # Get the loss and logit scores
        loss, logits = outputs.loss, outputs.logits
        
        # Backward pass
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        # Accumulate the total loss
        total_loss += loss.item()
    
    # Average training loss for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}')
    
    # Validation loop
    model.eval()
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
            loss, logits = outputs.loss, outputs.logits
            val_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            val_correct += (predicted == batch_labels).sum().item()
    
    # Average validation loss for the epoch
    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = val_correct / len(val_dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluation metrics
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in val_dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}')

Epoch 1/3, Average Training Loss: 0.1281
Epoch 1/3, Validation Loss: 0.0177, Validation Accuracy: 0.9921
Epoch 2/3, Average Training Loss: 0.0115
Epoch 2/3, Validation Loss: 0.0166, Validation Accuracy: 0.9937
Epoch 3/3, Average Training Loss: 0.0110
Epoch 3/3, Validation Loss: 0.0150, Validation Accuracy: 0.9953
Accuracy: 0.9953, F1 Score: 0.9972


In [36]:
# Example test data
example_reviews = [
    "This product exceeded my expectations. I highly recommend it!",
    "Very disappointed with the quality of this product. Would not buy again.",
    "The color and fit of this shirt are perfect. I love it!",
    "Terrible experience with this company. Will never purchase from them again.",
    "The delivery was fast and the packaging was great. Very satisfied with my purchase."
]

# Tokenize the example reviews
example_tokens = tokenize_texts(example_reviews)

# Extract input IDs and attention masks
example_input_ids = example_tokens['input_ids']
example_attention_masks = example_tokens['attention_mask']

# Move the data to the appropriate device
example_input_ids = example_input_ids.to(device)
example_attention_masks = example_attention_masks.to(device)

# Make predictions
model.eval()
with torch.no_grad():
    example_outputs = model(input_ids=example_input_ids, attention_mask=example_attention_masks)
    example_logits = example_outputs.logits
    _, example_predictions = torch.max(example_logits, 1)

# Decode predictions
decoded_predictions = ["Positive" if pred.item() == 1 else "Negative" for pred in example_predictions]

# Display results
for review, prediction in zip(example_reviews, decoded_predictions):
    print(f"Review: {review}")
    print(f"Sentiment Prediction: {prediction}")
    print()


Review: This product exceeded my expectations. I highly recommend it!
Sentiment Prediction: Positive

Review: Very disappointed with the quality of this product. Would not buy again.
Sentiment Prediction: Negative

Review: The color and fit of this shirt are perfect. I love it!
Sentiment Prediction: Positive

Review: Terrible experience with this company. Will never purchase from them again.
Sentiment Prediction: Positive

Review: The delivery was fast and the packaging was great. Very satisfied with my purchase.
Sentiment Prediction: Positive

