In [1]:
# OPTIONAL: load the IPython "autoreload" extension so edited modules are
# re-imported without restarting the kernel
%load_ext autoreload

# OPTIONAL: mode 2 reloads all modules before each cell runs, so changes made
# to code in src are picked up automatically
%autoreload 2

In [2]:
import os
from pathlib import Path


def find_project_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    The notebook imports from the ``src`` package and reads data through
    relative paths, so it has to run from the repository root. Searching
    upwards for the ``src`` directory avoids hard-coding a machine-specific
    absolute path.

    Args:
        start: Directory from which the upward search begins.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The resolved path of the first matching directory, or the resolved
        ``start`` itself when no ancestor contains ``marker``.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start


# An explicit MLOPS_PROJECT_ROOT environment variable wins; otherwise discover
# the root relative to wherever the notebook was launched from.
project_root = os.environ.get("MLOPS_PROJECT_ROOT") or find_project_root(Path.cwd())
os.chdir(project_root)
print("Current working directory: {0}".format(os.getcwd()))

import gzip, json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizer



from src.data.AmazonReviewData import AmazonReviewsDataset
from src.data.make_dataset_temp import get_pandas_DF, sentiment_map, preprocess_data

# Seed every random number generator used in this notebook for reproducibility
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Set device: prefer CUDA, then Apple-silicon MPS, and fall back to the CPU.
# The MPS backend is looked up defensively because it is absent from older
# torch builds.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# path to raw data (relative to the current working directory)
raw_data_path = "data/raw/"

Current working directory: /Users/madsbirch/Documents/4_semester/mlops/mlops-sentiment-analysis
Using device: mps


### 1 Data exploration

The data can be downloaded from: http://jmcauley.ucsd.edu/data/amazon/links.html

In [3]:
# Keep only the rating and the review text, under more intuitive column names,
# then map each raw rating through `sentiment_map`.
column_names = {'overall': 'sentiment', 'reviewText': 'review'}
df = (
    get_pandas_DF(raw_data_path + 'reviews_Automotive_5.json.gz')
    .loc[:, list(column_names)]
    .rename(columns=column_names)
    .assign(sentiment=lambda frame: frame['sentiment'].apply(sentiment_map))
)
df.head()

Unnamed: 0,sentiment,review
0,2,I needed a set of jumper cables for my new car...
1,2,"These long cables work fine for my truck, but ..."
2,2,Can't comment much on these since they have no...
3,2,I absolutley love Amazon!!! For the price of ...
4,2,I purchased the 12' feet long cable set and th...


Classes are highly imbalanced!

In [4]:
# Number of reviews per mapped sentiment label
df['sentiment'].value_counts()

2    17895
1     1430
0     1148
Name: sentiment, dtype: int64

### 2 Train model

In [5]:
# Tokenizer for the same 'bert-base-cased' checkpoint the classifier loads
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Build the train / test datasets from the raw data directory
train_set, test_set = preprocess_data(
    raw_data_path,
    tokenizer=tokenizer,
    max_len=256,
    train_split=0.7,
)

# Both loaders share batch size and worker count; only training is shuffled
loader_options = dict(batch_size=32, num_workers=0)
train_loader = DataLoader(train_set, shuffle=True, **loader_options)
valid_loader = DataLoader(test_set, **loader_options)

In [6]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        n_classes: Number of output classes (size of the logit vector).
        dropout: Dropout probability applied to the pooled encoder output.
        bert_out_dim: Input width of the linear layer; it must match the
            width of the encoder's pooled output.
    """

    def __init__(self, n_classes: int, dropout: float, bert_out_dim = 768):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.drop = nn.Dropout(dropout)
        self.output = nn.Linear(bert_out_dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the output-layer logits for a batch of tokenized inputs."""
        # With return_dict=False the encoder returns a tuple; the second
        # element is the pooled output that feeds the classification head.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        pooled = encoder_outputs[1]
        logits = self.output(self.drop(pooled))
        return logits


In [7]:
# Three output classes, matching the three sentiment labels seen in the data
model = SentimentClassifier(n_classes=3, dropout=0.2).to(device)

# NOTE(review): the loss is unweighted although the classes are highly
# imbalanced; consider class weights once the training label counts are known.
criterion = nn.CrossEntropyLoss().to(device)

# Fine-tuning all BERT weights needs a small step size. The BERT authors
# recommend 2e-5 to 5e-5 for Adam; the previous 1e-3 is far outside that range
# and tends to wreck the pretrained weights within a few updates.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
epochs = 5

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Memory taken by the model's parameters and buffers
def _nbytes(tensor):
    """Bytes occupied by a tensor: element count times bytes per element."""
    return tensor.nelement() * tensor.element_size()


mem_params = sum(_nbytes(param) for param in model.parameters())
mem_bufs = sum(_nbytes(buf) for buf in model.buffers())
mem = mem_params + mem_bufs  # in bytes
mem

433258508

In [9]:
for epoch in range(epochs):
    print(f'[EPOCH]: {epoch:3d}')

    model.train()
    running_loss = 0.0
    num_batches = 0

    progress = tqdm(train_loader)
    for batch in progress:

        # move data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # get model output and calc loss
        output = model(input_ids, attention_mask)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Surface the running mean loss on the progress bar so divergence or
        # a stalled optimizer is visible while the epoch is still running.
        running_loss += loss.item()
        num_batches += 1
        progress.set_postfix(loss=f"{running_loss / num_batches:.4f}")

    # Guard against an empty loader before averaging
    if num_batches:
        print(f'  mean train loss: {running_loss / num_batches:.4f}')

  1%|          | 4/448 [01:51<3:49:01, 30.95s/it]