<a href="https://colab.research.google.com/github/Iispar/review-summary-backend/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The model

This is the colab file for my machine learning model that predicts the sentiment value (stars) of an review dataset. Here you can see how the model has been made with explanations.

The dataset used is [amazon_reviews_multi](https://huggingface.co/datasets/amazon_reviews_multi). The model is a pretty basic one but 

In [2]:
!pip3 install -q transformers datasets evaluate
import datasets
import sklearn.feature_extraction
import torch
import transformers
import numpy as np
import evaluate

# Preprocessing

The dataset includes reviews from multiple languages so we only import the english ones. The dataset also includes alot of useless data for us, we only need the reviews and their ratings so lets process everything else out.

In [3]:
dataset = datasets.load_dataset('amazon_reviews_multi', name='en'); # imports the dataset.
# check it works
print(dataset);



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})


In [4]:
dataset = dataset.shuffle() # shuffle the dataset for safety.
dataset = dataset.remove_columns(['review_id', 'product_id', 'reviewer_id', 'language', 'product_category']) # removes everything that we don't need
dataset = dataset.rename_column("stars", "label") # rename stars to label so it is a bit more understandable
# an error was coming up because of the labels were 1-5 and not 0-4 so let's change that for all.
# at the same time lets add the title to the start of the review with an :.

def addTitle_and_changeLables(example):
  example["label"] = example["label"] - 1; 
  example["review_body"] = f'{example["review_title"]}: {example["review_body"]}';
  return example
dataset = dataset.map(addTitle_and_changeLables) # map the function to all.
dataset = dataset.remove_columns(['review_title']) # now we can also remove the title
print(dataset) # let's check that it worked.
print(dataset["train"][3])

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'review_body'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['label', 'review_body'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'review_body'],
        num_rows: 5000
    })
})
{'label': 0, 'review_body': 'Never buy again: Dint not came complete I’m missing that cups verry bad products'}


# Vectorization

In [5]:
# vectorizes one item
def vectorize_item(item):
    vectorized = vectorizer.transform([item["review_body"]]); # vectorizes an item. Initialized below...
    non_zero_features = vectorized.nonzero()[1]; # get the nonzeros and we take only the columns of the nonzeros because our matrix is only one row.
    non_zero_features += 1; # index zero is for padding so let's avoid it by adding 1 to all.

    return {"input_ids":non_zero_features} 

In [6]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer( # get the vectorizer. try TfidfVectorizer also... 
    binary = True,
    max_features = 20000, # amount of words
    token_pattern = r"(?u)\b\w+\b", # take in one letter words also
    )

texts=[item["review_body"] for item in dataset["train"]]; # get all texts from train
vectorizer.fit(texts); # fitting the vectorizer

# vectorize the whole dataset.
dset_tokenized = dataset.map(vectorize_item,num_proc=4);

Map (num_proc=4):   0%|          | 0/200000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

# Padding and batching

In [7]:
def collator(list_of_items):
    allLabels = [item["label"] for item in list_of_items]; # list of all labels.
    batch = {"labels": torch.tensor(allLabels)}; # create a tenstor for the item (batch)
    tensors = [];
    max_len = max(len(item["input_ids"]) for item in list_of_items); # longest example in the batch. Pad to here.
    for item in list_of_items:
        ids = torch.tensor(item["input_ids"]); # input ids to tensor
        padded = torch.nn.functional.pad(ids,(0,max_len-ids.shape[0])); # actual padding. Pads ids, from + to max with 0.
        tensors.append(padded); # appended ids to tensors
    batch["input_ids"] = torch.vstack(tensors); # stacks items as they are now same len. Now these are matrixes.
    return batch;

# check it works
batch=collator([dset_tokenized["train"][1],dset_tokenized["train"][16]])
print("Shape of labels:",batch["labels"].shape)
print("Shape of input_ids:",batch["input_ids"].shape)
print(batch["labels"])
print(batch["input_ids"])

Shape of labels: torch.Size([2])
Shape of input_ids: torch.Size([2, 43])
tensor([3, 4])
tensor([[  157,   513,   966,   986,  1842,  1879,  2711,  2936,  3441,  4129,
          4262,  6935,  7785,  8067,  8167,  8342,  8720,  9153,  9383,  9400,
         10269, 10502, 10583, 11988, 11992, 12002, 12410, 12617, 12723, 13442,
         13957, 14008, 14057, 15014, 15520, 15703, 17232, 17700, 17705, 17784,
         17967, 19298, 19930],
        [ 7195,  8720,  9400, 11586, 12613, 13666, 19487,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]])


# Machine learning model

## config

In [8]:
# needs a config, we wil just pass it.
class MLPConfig(transformers.PretrainedConfig):
    pass;

# model
class MLP(transformers.PreTrainedModel):
      config_class = MLPConfig; # sets config
      #initilazition
      def __init__(self,config):
        super().__init__(config); # call the super with out config which is now pass..
        self.vocab_size = config.vocab_size; # embedding matrix row count
        # Build embedding of vocab size +1 x hidden size. +1 again because of padding.
        self.embedding = torch.nn.Embedding(num_embeddings=self.vocab_size+1,embedding_dim=config.hidden_size,padding_idx=0);
        torch.nn.init.uniform_(self.embedding.weight.data,-0.001,0.001); # initialization of the embedding values
        self.output = torch.nn.Linear(in_features=config.hidden_size,out_features=config.nlabels); # output layer is the size of the labels x hidden size.

      # forward
      def forward(self,input_ids,labels=None):
        embedded = self.embedding(input_ids); # sum up all the embeddings
        embedded_summed = torch.sum(embedded,dim=1); # sum up across word dimension
        projected = torch.tanh(embedded_summed); # non-linearity
        logits = self.output(projected); # apply the outer layer
      

        ## calculates the loss
        if labels is not None:
            # calculates the loss.
            loss = torch.nn.CrossEntropyLoss();
            return (loss(logits,labels),logits);
        else:
            # if no labels, just return the logits
            return (logits,);
  
# config
mlp_config = MLPConfig(vocab_size=len(vectorizer.vocabulary_),hidden_size=20,nlabels=5); # hidden size to start 20, and labels 5 (1-5 stars)
     

## training

In [9]:
# training

# Set training arguments
trainer_args = transformers.TrainingArguments(
    "mlp_checkpoints", #save checkpoints here
    evaluation_strategy = "steps",
    logging_strategy = "steps",
    eval_steps = 500,
    logging_steps = 500,
    learning_rate = 1e-4,
    max_steps = 20000,
    load_best_model_at_end = True,
    per_device_train_batch_size = 128
)


# evaluating
accuracy = evaluate.load("accuracy");
def compute_accuracy(outputs_and_labels):
    outputs, labels = outputs_and_labels;
    predictions = np.argmax(outputs, axis=-1); #pick the index of the "winning" label
    return accuracy.compute(predictions=predictions, references=labels);

# actual training
mlp = MLP(mlp_config); # Make a the actual model  
early_stopping = transformers.EarlyStoppingCallback(5); # stop training if the eval loss is not getting better.

# params
trainer = transformers.Trainer(
    model = mlp,
    args = trainer_args,
    train_dataset = dset_tokenized["train"],
    eval_dataset = dset_tokenized["test"],
    compute_metrics = compute_accuracy,
    data_collator = collator,
    callbacks = [early_stopping]
)

# FINALLY!
trainer.train();




Step,Training Loss,Validation Loss,Accuracy
500,1.5337,1.438504,0.5184
1000,1.3455,1.267215,0.5432
1500,1.2087,1.166635,0.5546
2000,1.1203,1.108288,0.5604
2500,1.0733,1.072776,0.567
3000,1.041,1.048276,0.5724
3500,1.0087,1.031868,0.573
4000,0.9938,1.01989,0.5784
4500,0.9867,1.011491,0.5798
5000,0.9679,1.004926,0.5824
