Key Preprocessing Steps Included
1. Text Cleaning:
  Lowercasing, followed by removal of special characters and of excess whitespace, as per the report.
2. Tokenization:
  Reviews are tokenized into subwords using the bert-base-uncased tokenizer.
3. Padding and Truncation:
  Reviews are padded or truncated to a fixed length (128 tokens).
4. Attention Masks:
  Automatically generated by the tokenizer to differentiate padding tokens from meaningful input.
5. Encoding Labels:
  Numerical ratings (1–5) are converted to zero-based indices (0–4) for compatibility with the classification head.
6. Train-Test Split:
  A stratified 80/20 split ensures that the class distribution is preserved across the training and validation subsets.

In [1]:
# Import Libraries
import re
import unicodedata
from collections import Counter

import pandas as pd
import seaborn as sns
import torch
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from transformers import BertTokenizer
from wordcloud import WordCloud

In [None]:
# Load Dataset
# Reads the CSV into the DataFrame `df`; later cells use its 'Review' and
# 'Rating' columns. The path is relative to the notebook's working directory.
df = pd.read_csv('../data/fr_de_it_augmented.csv')
# Dataset Overview
# Print the (rows, columns) shape and the first rows as a quick sanity check.
print("Dataset shape:", df.shape)
print(df.head())


In [None]:
# Bar chart showing how many reviews carry each rating.
plt.figure(figsize=(8, 6))
ax = plt.gca()
sns.countplot(x=df['Rating'], ax=ax)
# Annotate every bar with its count.
for container in ax.containers:
    ax.bar_label(container)
ax.set(title='Distribution of Ratings', xlabel='Rating', ylabel='Count')
plt.show()


In [None]:
# Preprocessing Function
def preprocess_text(text):
    """
    Clean and normalize a single review.

    Steps, in order:
    - Missing values (None / NaN) become an empty string; any other
      non-string value is converted with str().
    - Accented letters are folded to their base letter ("é" -> "e") so the
      character filter below does not delete them outright.
    - Text is converted to lowercase.
    - Every character that is not an ASCII letter, digit or whitespace is
      removed.
    - Runs of whitespace collapse to a single space and the ends are stripped.

    Args:
        text: The raw review text.

    Returns:
        The cleaned review as a str (possibly empty).
    """
    # pandas represents a missing cell as float NaN, which has no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Canonically decompose accented characters, then drop the combining
    # marks. Without this, "café" would be reduced to "caf" by the filter.
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = text.lower()  # Convert to lowercase
    # Uppercase letters are already gone, so only a-z needs to be allowed.
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove excessive whitespaces
    return text

# Apply Preprocessing
# Missing reviews load from CSV as NaN (a float), which has no string methods;
# treat them as empty text so the cleaning step never sees a non-string.
df['Review'] = df['Review'].fillna("").astype(str).apply(preprocess_text)

# View Preprocessed Data
print(df.head())


In [None]:
# Initialize BERT Tokenizer
# Loads the pretrained "bert-base-uncased" tokenizer once at module level;
# tokenize_reviews() below reads this global `tokenizer`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization Function
def tokenize_reviews(reviews, max_length=128):
    """
    Tokenize and encode reviews into BERT-compatible input format.

    Relies on the module-level ``tokenizer``.
    - Subword tokenization
    - Truncation to max_length
    - Padding to max_length

    Args:
        reviews: Review texts as a pandas Series, a list, any other iterable
            of strings, or a single string (treated as a batch of one).
        max_length: Number of tokens every encoded review is padded or
            truncated to.

    Returns:
        The tokenizer's batch encoding holding PyTorch tensors; callers read
        its "input_ids" and "attention_mask" entries.
    """
    if isinstance(reviews, str):
        # list("abc") would split the text into characters.
        texts = [reviews]
    elif hasattr(reviews, "tolist"):
        texts = reviews.tolist()
    else:
        texts = list(reviews)

    # Calling the tokenizer directly is the supported entry point;
    # batch_encode_plus is deprecated in its favour.
    return tokenizer(
        texts,
        max_length=max_length,  # Maximum token length for BERT
        padding="max_length",  # Pad to max length
        truncation=True,  # Truncate long reviews
        return_tensors="pt",  # Return PyTorch tensors
    )

# Tokenize Reviews
# Encodes the whole 'Review' column in one call; the result's "input_ids" and
# "attention_mask" entries are consumed by the train/validation split.
tokenized_data = tokenize_reviews(df['Review'])
print("Tokenization complete.")


In [None]:
# Encode Labels (1 → 0, 2 → 1, ..., 5 → 4)
# The subtraction below mutates df in place, so running this cell twice would
# silently shift the labels to -1..3. Validate the raw range first so the
# common re-run case fails loudly before anything is modified.
if not df['Rating'].between(1, 5).all():
    raise ValueError(
        "Expected every Rating to be in 1..5 before encoding; "
        "reload the dataset if this cell has already been run."
    )
df['Rating'] = df['Rating'] - 1  # Convert ratings to 0-indexed
# Class indices must be integers even if the column was parsed as float.
labels = torch.tensor(df['Rating'].values, dtype=torch.long)

print("Labels encoded. Example:", labels[:5])


In [None]:
# Split Data into Training and Validation Sets
# The three arrays are split with the same shuffled indices, so each review's
# input ids, attention mask and label stay aligned.
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    tokenized_data["input_ids"],
    tokenized_data["attention_mask"],
    labels,
    test_size=0.2,  # 20% for validation
    stratify=labels,  # Stratified split to maintain class distribution
    random_state=42,  # Fixed seed so the saved split is the same on every run
)

# Save Split Data
# Everything downstream training needs is bundled into a single file.
torch.save({
    "train_inputs": train_inputs,
    "val_inputs": val_inputs,
    "train_masks": train_masks,
    "val_masks": val_masks,
    "train_labels": train_labels,
    "val_labels": val_labels
}, "preprocessed_data_aug_fr_it_and_de.pt")

print("Train-Test split complete.")


In [None]:
# Download Preprocessed Data
# google.colab exists only inside Colab. Elsewhere the import would raise even
# though the file has already been written to the working directory, so skip
# the browser download instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of preprocessed_data_aug_fr_it_and_de.pt")
else:
    files.download("preprocessed_data_aug_fr_it_and_de.pt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>