In [1]:
!pip install pytorch-transformers

Collecting pytorch-transformers
  Using cached https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl
Collecting sacremoses (from pytorch-transformers)
  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
Collecting boto3 (from pytorch-transformers)
  Using cached https://files.pythonhosted.org/packages/00/34/75b2d38f0647cfbdfd00c62c1d3e4210f6c40fb8ff66a9a644c439e849ab/boto3-1.11.1-py2.py3-none-any.whl
Collecting requests (from pytorch-transformers)
  Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl
Collecting torch>=1.0.0 (from pytorch-transformers)


  Could not find a version that satisfies the requirement torch>=1.0.0 (from pytorch-transformers) (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)
No matching distribution found for torch>=1.0.0 (from pytorch-transformers)


In [None]:
import pandas as pd
import os
import torch
from tqdm import tqdm
from torch.utils.data import Dataset
from pytorch_transformers import BertTokenizer, BertConfig, BertForSequenceClassification
%matplotlib inline

In [None]:
def rpad(array, n=70):
    """Return a new list of exactly ``n`` items built from ``array``.

    Shorter inputs are right-padded with zeros; longer inputs keep only
    their first ``n`` items.
    """
    shortfall = n - len(array)
    if shortfall >= 0:
        # Fits already: append however many zeros are still missing.
        return array + [0] * shortfall
    # Too long: drop everything past position n.
    return array[:n]

class HumourDataset(Dataset):
    """Fixed-length token-id sequences paired with integer humour grades.

    Reads the notebook-level ``df`` (pandas DataFrame) and ``tokenizer``
    globals, so both must be defined before an instance is created.

    Args:
        split: ``"train"`` selects the first 90% of the rows of ``df``; any
            other value selects the remaining rows.
        model_name: name of the text column of ``df`` to tokenize.
        length: every token-id list is truncated or zero-padded to this size.
    """

    def __init__(self, split="train", model_name="original", length=66):
        print(f"Loading humour {model_name} {split} set")
        split_at = int(0.9 * len(df))
        columns = [model_name, "meanGrade"]
        self.df = df[:split_at][columns] if split == "train" else df[split_at:][columns]

        print("Tokenizing")
        self.data = []
        for _, row in self.df.iterrows():
            token_ids = tokenizer.encode("[CLS] " + row[model_name] + " [SEP]")
            padded = rpad(token_ids, n=length)
            if len(token_ids) > length > 0:
                # rpad cut the tail off, which would drop the closing [SEP];
                # put the last encoded id (the separator) back in the final slot.
                padded[-1] = token_ids[-1]
            # int(...) keeps the label a plain integer class index even if
            # round() were to hand back a float-typed scalar for this value.
            self.data.append((padded, int(round(row["meanGrade"]))))

    def __len__(self):
        """Number of (token_ids, label) examples in this split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_id_tensor, label)`` for the example at ``index``."""
        X, y = self.data[index]
        X = torch.tensor(X)
        return X, y

def train_one_epoch(model, lossfn, optimizer, dataset, batch_size=32):
    """Run one optimisation pass over ``dataset``.

    Uses the notebook-level ``device`` global for tensor placement.

    Args:
        model: classifier whose forward call returns a sequence with the
            logits at index 0 when called with the inputs only.
        lossfn: callable ``(logits, labels) -> scalar loss``; this is the loss
            that is back-propagated, the same criterion evaluation uses.
        optimizer: optimizer bound to the parameters of ``model``.
        dataset: map-style dataset yielding ``(token_ids, label)`` pairs.
        batch_size: number of examples per mini-batch.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over the examples seen
        (``(0.0, 0.0)`` for an empty dataset).
    """
    print("training...")
    # Reshuffle every epoch so mini-batches are not always the same rows in
    # the same order.
    generator = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True
    )
    model.train()
    total_loss, n_correct, n_seen = 0.0, 0, 0
    for batch, labels in tqdm(generator):
        batch, labels = batch.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(batch)[0]
        loss = lossfn(logits, labels)
        loss.backward()
        optimizer.step()

        n_in_batch = labels.size(0)
        # Weight by batch size so the final figure is a per-example mean even
        # when the last batch is smaller. Assumes lossfn returns the batch
        # mean (the default reduction of torch.nn.CrossEntropyLoss).
        total_loss += loss.item() * n_in_batch
        pred_labels = torch.argmax(logits, dim=1)
        n_correct += (pred_labels == labels).sum().item()
        n_seen += n_in_batch
    denom = max(n_seen, 1)  # avoid dividing by zero on an empty dataset
    print("Done.")
    return total_loss / denom, n_correct / denom

def evaluate_one_epoch(model, lossfn, optimizer, dataset, batch_size=32):
    """Score ``model`` on ``dataset`` without updating its weights.

    Uses the notebook-level ``device`` global for tensor placement.

    Args:
        model: classifier whose forward call returns a sequence with the
            logits at index 0.
        lossfn: callable ``(logits, labels) -> scalar loss``.
        optimizer: unused; accepted so the signature mirrors
            ``train_one_epoch``.
        dataset: map-style dataset yielding ``(token_ids, label)`` pairs.
        batch_size: number of examples per mini-batch.

    Returns:
        ``(mean_loss, accuracy)``, both averaged over the examples seen
        (``(0.0, 0.0)`` for an empty dataset).
    """
    print("testing...")
    # Order does not affect the metrics, so keep evaluation deterministic.
    generator = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False
    )
    model.eval()
    total_loss, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for batch, labels in tqdm(generator):
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)[0]
            n_in_batch = labels.size(0)
            # Weight by batch size so the result is a per-example mean.
            # Assumes lossfn returns the batch mean (the default reduction of
            # torch.nn.CrossEntropyLoss).
            total_loss += lossfn(logits, labels).item() * n_in_batch
            pred_labels = torch.argmax(logits, dim=1)
            n_correct += (pred_labels == labels).sum().item()
            n_seen += n_in_batch
    denom = max(n_seen, 1)  # avoid dividing by zero on an empty dataset
    print("Done.")
    return total_loss / denom, n_correct / denom

def train(
    model_name="original",
    bert="bert-large-uncased",
    epochs=30,
    batch_size=32,
    length=66,
    save=True,
    save_dir="models",
):
    """Fine-tune a BERT sequence classifier on one text column of ``df``.

    Builds the train/test ``HumourDataset`` splits, trains for ``epochs``
    epochs with Adam (lr=1e-5) and cross-entropy loss, and prints the loss and
    accuracy of both splits after every epoch.

    Args:
        model_name: text column to train on (passed to ``HumourDataset``).
        bert: pretrained checkpoint name given to ``from_pretrained``.
        epochs: number of passes over the training split.
        batch_size: mini-batch size for both training and evaluation.
        length: token sequence length (passed to ``HumourDataset``).
        save: when true, pickle the whole model after every epoch.
        save_dir: directory the per-epoch checkpoints are written to; it is
            created if missing.
    """
    trainset = HumourDataset(split="train", model_name=model_name, length=length)
    testset = HumourDataset(split="test", model_name=model_name, length=length)

    config = BertConfig.from_pretrained(bert)
    # Labels are round(meanGrade); presumably that yields classes 0..3 --
    # TODO confirm against the grade range in the data.
    config.num_labels = 4
    model = BertForSequenceClassification.from_pretrained(bert, config=config)

    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    if save:
        os.makedirs(save_dir, exist_ok=True)

    for epoch in range(epochs):
        print(f"\nepoch={epoch+1}")
        train_loss, train_acc = train_one_epoch(
            model, lossfn, optimizer, trainset, batch_size=batch_size
        )
        test_loss, test_acc = evaluate_one_epoch(
            model, lossfn, optimizer, testset, batch_size=batch_size
        )
        print(f"train_loss={train_loss:.4f}, test_loss={test_loss:.4f}")
        print(f"train_acc={train_acc:.3f}, test_acc={test_acc:.3f}")
        if save:
            # The file suffix uses the same 1-based epoch number as the log
            # line above, so after N epochs the last checkpoint is "...__eN",
            # which is the name the loading cell looks up under models/.
            checkpoint = os.path.join(
                save_dir, f"{bert}__binary__{model_name}__e{epoch + 1}.pickle"
            )
            torch.save(model, checkpoint)

    print("Done.")

In [None]:
# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernels synchronously, so a device
# error is reported at the call that caused it. Set before the device lookup.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

# Prefer the first GPU; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Run-wide settings shared by the cells below:
# pretrained checkpoint name, number of epochs, mini-batch size.
bert, epoch, batch_size = "bert-large-uncased", 1, 29

In [None]:
# Tokenizer for the checkpoint named in `bert`.
print("Loading the tokenizer")
tokenizer = BertTokenizer.from_pretrained(bert)

# Read the cleaned training CSV, discard every row that has a missing value,
# and renumber the surviving rows from 0.
df = (
    pd.read_csv('sample_data/train_cleaned.csv', encoding="ISO-8859-1")
    .dropna(how="any")
    .reset_index(drop=True)
)

In [None]:
# Train one classifier per text column, in this order; the "edit" column uses
# a shorter sequence length than the other two.
for column, seq_len in (("original", 66), ("edit", 10), ("edited", 66)):
    train(
        bert=bert,
        model_name=column,
        epochs=epoch,
        length=seq_len,
        batch_size=batch_size,
    )

In [None]:
def load_trained_model(model_name):
    """Load one saved classifier, put it in eval mode and move it to ``device``.

    Args:
        model_name: text column the classifier was trained on; it selects the
            checkpoint file under ``models/`` for the current ``bert`` and
            ``epoch`` settings.

    Returns:
        The loaded model, in eval mode, on ``device``.
    """
    print(f"Loading the \"{model_name}\" model")
    # SECURITY: torch.load unpickles arbitrary objects -- only open checkpoint
    # files you produced yourself.
    # map_location lets a checkpoint saved from a GPU be opened when `device`
    # has fallen back to the CPU.
    model = torch.load(
        f"models/{bert}__binary__{model_name}__e{epoch}.pickle",
        map_location=device,
    )
    model.eval()
    return model.to(device)


original_model = load_trained_model("original")
edit_model = load_trained_model("edit")
edited_model = load_trained_model("edited")