In [1]:
!pip install pytorch-transformers



In [0]:
import pandas as pd
import os
import torch
from tqdm import tqdm
from torch.utils.data import Dataset
from pytorch_transformers import BertTokenizer, BertConfig, BertForSequenceClassification
%matplotlib inline

In [3]:
print("Loading the tokenizer")
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

print("Loading Twitter Dataset")
df = pd.read_csv('sample_data/train.csv', encoding="ISO-8859-1")
df = df.dropna(how="any").reset_index(drop=True)
df = df.drop(['ItemID'], axis=1)

Loading the tokenizer
Loading Twitter Dataset


In [0]:
def rpad(array, n=70):
    current_len = len(array)
    if current_len > n:
        return array[:n]
    extra = n - current_len
    return array + ([0] * extra)

class TwitterDataset(Dataset):
    def __init__(self, split="train"):
        print(f"Loading Twitter {split} set")
        aplit_at = int(0.9 * len(df))
        if split == "train":
            self.df = df[:aplit_at]
        else:
            self.df = df[aplit_at:]

        print("Tokenizing")
        self.data = [
            (
                rpad(tokenizer.encode("[CLS] " + row["SentimentText"] + " [SEP]"), n=66),
                row["Sentiment"]
            )
            for indx, row in self.df.iterrows()
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        X, y = self.data[index]
        X = torch.tensor(X)
        return X, y

In [0]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train_one_epoch(model, lossfn, optimizer, dataset, batch_size=32):
    print("training...")
    generator = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False
    )
    model.train()
    train_loss, train_acc = 0.0, 0.0
    for batch, labels in tqdm(generator):
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        loss, logits = model(batch, labels=labels)
        err = lossfn(logits, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pred_labels = torch.argmax(logits, axis=1)
        train_acc += (pred_labels == labels).sum().item()
    train_loss /= len(dataset)
    train_acc /= len(dataset)
    print("Done.")
    return train_loss, train_acc

def evaluate_one_epoch(model, lossfn, optimizer, dataset, batch_size=32):
    print("testing...")
    generator = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True
    )
    model.eval()
    loss, acc = 0.0, 0.0
    with torch.no_grad():
        for batch, labels in tqdm(generator):
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)[0]
            error = lossfn(logits, labels)
            loss += error.item()
            pred_labels = torch.argmax(logits, axis=1)
            acc += (pred_labels == labels).sum().item()
    loss /= len(dataset)
    acc /= len(dataset)
    print("Done.")
    return loss, acc

def train(
    bert="bert-large-uncased",
    epochs=30,
    batch_size=32,
    save=True
):
    trainset = TwitterDataset("train")
    testset = TwitterDataset("test")
    
    config = BertConfig.from_pretrained(bert)
    model = BertForSequenceClassification.from_pretrained(bert, config=config)

    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(1, epochs):
        print(f"epoch={epoch}")
        train_loss, train_acc = train_one_epoch(
            model, lossfn, optimizer, trainset, batch_size=batch_size
        )
        test_loss, test_acc = evaluate_one_epoch(
            model, lossfn, optimizer, testset, batch_size=batch_size
        )
        print(f"train_loss={train_loss:.4f}, test_loss={test_loss:.4f}")
        print(f"train_acc={train_acc:.3f}, test_acc={test_acc:.3f}")
        if save:
            torch.save(model, f"{bert}__binary__e{epoch}.pickle")

    print("Done.")

In [6]:
train(epochs=2)

Loading Twitter train set
Tokenizing
Loading Twitter test set
Tokenizing


  0%|          | 0/2813 [00:00<?, ?it/s]

epoch=1
training...


100%|██████████| 2813/2813 [43:04<00:00,  1.33it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

Done.
testing...


100%|██████████| 313/313 [01:25<00:00,  4.23it/s]
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Done.
train_loss=0.0131, test_loss=0.0116
train_acc=0.809, test_acc=0.834


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Done.
