In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import os

# Run the notebook from the repository root: the `src.*` imports and the
# relative "data/raw/" path used further down are resolved against the current
# working directory. The location can be overridden with the
# MLOPS_PROJECT_ROOT environment variable; when it is unset we fall back to the
# original checkout path so existing setups behave exactly as before.
PROJECT_ROOT = os.environ.get(
    "MLOPS_PROJECT_ROOT",
    '/Users/madsbirch/Documents/4_semester/mlops/mlops-sentiment-analysis',
)
os.chdir(PROJECT_ROOT)
print("Current working directory: {0}".format(os.getcwd()))

import gzip, json
import random
import numpy as np
import pandas as pd



import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizer


from sklearn.model_selection import train_test_split

from src.data.AmazonReviewData import AmazonReviewsDataset
from src.data.make_dataset_temp import get_pandas_DF

# Seed every random number generator imported above so reruns are repeatable.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)


def select_device():
    """Return the name of the best available torch device.

    Preference order is CUDA, then MPS (Apple-silicon Macs), then CPU.

    Returns:
        One of the strings "cuda", "mps" or "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    # `torch.backends.mps` does not exist on older torch builds, so look it up
    # defensively instead of assuming the attribute is there.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


# Set device (mps is specific to mac with M1 processor)
device = select_device()
print(f"Using device: {device}")

# path to raw data
raw_data_path = "data/raw/"

Current working directory: /Users/madsbirch/Documents/4_semester/mlops/mlops-sentiment-analysis
Using device: mps


### 1 Data exploration

The data can be downloaded from: http://jmcauley.ucsd.edu/data/amazon/links.html

In [6]:
# Load the Automotive 5-core review file, keep only the rating and the review
# text, and give both columns more descriptive names.
reviews_file = raw_data_path + 'reviews_Automotive_5.json.gz'
column_names = {'overall': 'sentiment', 'reviewText': 'review'}

df = (
    get_pandas_DF(reviews_file)
    .loc[:, ['overall', 'reviewText']]
    .rename(columns=column_names)
    # Convert each rating with `sentiment_map`.
    # NOTE(review): `sentiment_map` is not defined in the cells shown here —
    # confirm it is imported/defined before this cell runs.
    .assign(sentiment=lambda frame: frame['sentiment'].apply(sentiment_map))
)
df.head()

Unnamed: 0,sentiment,review
0,2,I needed a set of jumper cables for my new car...
1,2,"These long cables work fine for my truck, but ..."
2,2,Can't comment much on these since they have no...
3,2,I absolutley love Amazon!!! For the price of ...
4,2,I purchased the 12' feet long cable set and th...


Classes are highly imbalanced: label 2 accounts for the vast majority of the reviews (see the counts below).

In [7]:
# Number of reviews per sentiment label.
df['sentiment'].value_counts()

2    17895
1     1430
0     1148
Name: sentiment, dtype: int64

### 2 Train model

In [8]:
# Tunable settings for tokenisation and batching, collected in one place.
MAX_LEN = 256
TRAIN_SPLIT = 0.7
BATCH_SIZE = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): `preprocess_data` is not defined in the cells shown here —
# confirm where it comes from and what the two returned datasets contain.
train_set, test_set = preprocess_data(
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    train_split=TRAIN_SPLIT,
)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (width of the returned logits).
    dropout: dropout probability applied to the pooled representation.
    bert_out_dim: hidden size of the encoder, i.e. input width of the head.
    pretrained_name: checkpoint name handed to `BertModel.from_pretrained`.
      Defaults to "bert-base-uncased", the same checkpoint the notebook's
      tokenizer is loaded from; a DistilBERT checkpoint does not match the
      `BertModel` architecture.
  """

  def __init__(self, n_classes: int, dropout: float, bert_out_dim = 768,
               pretrained_name: str = "bert-base-uncased"):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(pretrained_name)
    self.drop = nn.Dropout(dropout)
    self.output = nn.Linear(bert_out_dim, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class logits of shape (batch, n_classes).

    Args:
      input_ids: token ids, indexed as (batch, sequence).
      attention_mask: mask passed through to the encoder unchanged.
    """
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # First element of the encoder output: per-token hidden states,
    # indexed as (batch, sequence, hidden).
    last_hidden_state = outputs[0]
    # Pool with the first token of every sequence. The previous code
    # concatenated the last four sequence positions, which produced
    # 4 * hidden features and could not be fed to `self.output`.
    pooled_output = last_hidden_state[:, 0]
    # Dropout was constructed in __init__ but never applied before.
    return self.output(self.drop(pooled_output))


In [None]:
# NOTE(review): `epochs`, `model`, `optimizer`, `criterion` and `tqdm` are not
# defined/imported in the cells shown here — confirm they exist before this
# cell runs on a fresh kernel.
for epoch in range(epochs):
  # TRAIN
  model.train()
  train_loop = tqdm(train_loader)
  for batch in train_loop:
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    output = model(input_ids, attention_mask)
    loss = criterion(output, labels)
    loss.backward()
    # Clip the global gradient norm before the optimizer step. Uses the
    # in-place `clip_grad_norm_`; the un-suffixed `clip_grad_norm` is deprecated.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Show the current epoch and the latest batch loss on the progress bar.
    train_loop.set_description(f"Training Epoch: {epoch}")
    train_loop.set_postfix(loss=loss.item())