## BERT

In [11]:
import torch
from torch import nn
import pytorch_lightning as pl
from torchmetrics import Accuracy
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import pandas as pd
import numpy as np
from pytorch_lightning.loggers import TensorBoardLogger
import lightning as L

In [3]:
class ReviewDataset(Dataset):
    def __init__(self, reviews, ratings):
        self.reviews = reviews
        self.ratings = ratings
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        rating = self.ratings[idx]
        encoding = self.tokenizer(review, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(rating)


class ReviewDataModule(pl.LightningDataModule):
    def __init__(self, data_file, batch_size=8, val_split=0.2, num_workers=7):
        super().__init__()
        self.data_file = data_file
        self.batch_size = batch_size
        self.val_split = val_split
        self.num_workers = num_workers

    def prepare_data(self):
        # Load data from CSV file
        data = pd.read_csv(self.data_file)
        self.reviews = data['review'].tolist()
        self.ratings = data['rating'].tolist()

    def setup(self, stage=None):
        dataset = ReviewDataset(self.reviews, self.ratings)
        val_size = int(len(dataset) * self.val_split)
        train_size = len(dataset) - val_size
        self.train_dataset, self.val_dataset = random_split(dataset, [train_size, val_size])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

In [9]:
class ReviewRatingModel(pl.LightningModule):
    def __init__(self, num_classes: int, learning_rate: float = 1e-4):
        super().__init__()
        self.save_hyperparameters()

        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.accuracy = Accuracy(task='multiclass', num_classes=num_classes)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        loss = self.criterion(logits, labels)
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(logits, labels), prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        loss = self.criterion(logits, labels)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', self.accuracy(logits, labels), prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [17]:
logger = TensorBoardLogger("logs", name="dev")

dm = ReviewDataModule("data/train_data.csv", batch_size=4, num_workers=16)
model = ReviewRatingModel(num_classes=5)
trainer = pl.Trainer(max_epochs=5, logger=logger)
trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type               | Params
--------------------------------------------------
0 | bert       | DistilBertModel    | 66.4 M
1 | classifier | Linear             | 3.8 K 
2 | criterion  | CrossEntropyLoss   | 0     
3 | accuracy   | MulticlassAccuracy | 0     
--------------------------------------------------
66.4 M    Trainable params
0         Non-trainable params
66.4 M    Total params
265.467   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

## Other methods

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('data/train_data.csv')

vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1500)

train_data_features = vectorizer.fit_transform(df['review'])

train_data_features = train_data_features.toarray()
print('Bag of words completed')

Bag of words completed


## TF-IDF

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = train_data_features
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['rating'], test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.6090271424214699
              precision    recall  f1-score   support

           0       0.69      0.59      0.63       242
           1       0.44      0.41      0.42       277
           2       0.40      0.23      0.29       352
           3       0.52      0.50      0.51       961
           4       0.70      0.82      0.76      1447

    accuracy                           0.61      3279
   macro avg       0.55      0.51      0.52      3279
weighted avg       0.59      0.61      0.60      3279


### characters

In [29]:
review_lengths = df['review'].apply(lambda x: len(x))

min_length = review_lengths.min()
max_length = review_lengths.max()
avg_length = review_lengths.mean()
stddev_length = review_lengths.std()

print(f"min: {min_length}")
print(f"max: {max_length}")
print(f"avg: {avg_length}")
print(f"std: {stddev_length}")

min: 44
max: 13501
avg: 721.2689116642265
std: 676.4062576996707


### words

In [30]:
review_lengths = df['review'].apply(lambda x: len(x.split()))

min_length = review_lengths.min()
max_length = review_lengths.max()
avg_length = review_lengths.mean()
stddev_length = review_lengths.std()

print(f"min: {min_length}")
print(f"max: {max_length}")
print(f"avg: {avg_length}")
print(f"std: {stddev_length}")

min: 7
max: 1931
avg: 103.85474621766716
std: 98.8863525280012


### Bag Of Words

In [16]:
class BoWModel(L.LightningModule):
    def __init__(self, num_classes: int, learning_rate: float = 1e-4):
        super().__init__()
        self.save_hyperparameters()

        self.criterion = nn.CrossEntropyLoss()
        self.accuracy = Accuracy(task='multiclass', num_classes=num_classes)
        self.learning_rate = learning_rate

        self.model = nn.Sequential(
            nn.Linear(1500, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        review, labels = batch
        logits = self(review)
        loss = self.criterion(logits, labels)
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(logits, labels), prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        review, labels = batch
        logits = self(review)
        loss = self.criterion(logits, labels)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', self.accuracy(logits, labels), prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [17]:
train_indices = np.random.rand(len(df)) > 0.2
train_data = torch.from_numpy(train_data_features).float()[train_indices]
train_targets = torch.from_numpy(df["rating"].values[train_indices]).long()

test_data = torch.from_numpy(train_data_features[~train_indices]).float()
test_targets = torch.from_numpy(df["rating"].values[~train_indices]).long()
train_dataset = TensorDataset(train_data, train_targets)
val_dataset = TensorDataset(test_data, test_targets)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [18]:
logger = TensorBoardLogger("logs", name="dev")
bow_model = BoWModel(num_classes=5)
trainer = L.Trainer(
    max_epochs=10, 
    callbacks=[L.pytorch.callbacks.EarlyStopping(monitor='val_loss', patience=7, strict=False, verbose=False, mode='min')],
    logger=logger
)

trainer.fit(bow_model, train_loader, val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | criterion | CrossEntropyLoss   | 0      | train
1 | accuracy  | MulticlassAccuracy | 0      | train
2 | model     | Sequential         | 935 K  | train
---------------------------------------------------------
935 K     Trainable params
0         Non-trainable params
935 K     Total params
3.741     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/marcinjarczewski/Documents/studia/sem6/NLP-reviews/venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/marcinjarczewski/Documents/studia/sem6/NLP-reviews/venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
