In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# import utils
from torch.autograd import Variable
import torch.optim as optim

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import argparse
import copy
from scipy import stats

In [27]:
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
class CNN(nn.Module):
    def __init__(self, **kwargs):
        super(CNN, self).__init__()

        self.MODEL = kwargs["MODEL"]
        self.BATCH_SIZE = kwargs["BATCH_SIZE"]
        self.MAX_SENT_LEN = kwargs["MAX_SENT_LEN"]
        self.WORD_DIM = kwargs["WORD_DIM"]
        self.VOCAB_SIZE = kwargs["VOCAB_SIZE"]
        self.CLASS_SIZE = kwargs["CLASS_SIZE"]
        self.FILTERS = kwargs["FILTERS"]
        self.FILTER_NUM = kwargs["FILTER_NUM"]
        self.DROPOUT_PROB = kwargs["DROPOUT_PROB"]
        self.IN_CHANNEL = 1

        assert (len(self.FILTERS) == len(self.FILTER_NUM))

        # one for UNK and one for zero padding
        self.embedding = nn.Embedding(self.VOCAB_SIZE + 2, self.WORD_DIM, padding_idx=self.VOCAB_SIZE + 1)
        if self.MODEL == "static" or self.MODEL == "non-static" or self.MODEL == "multichannel":
            self.WV_MATRIX = kwargs["WV_MATRIX"]
            self.embedding.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
            if self.MODEL == "static":
                self.embedding.weight.requires_grad = False
            elif self.MODEL == "multichannel":
                self.embedding2 = nn.Embedding(self.VOCAB_SIZE + 2, self.WORD_DIM, padding_idx=self.VOCAB_SIZE + 1)
                self.embedding2.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
                self.embedding2.weight.requires_grad = False
                self.IN_CHANNEL = 2

        for i in range(len(self.FILTERS)):
            conv = nn.Conv1d(self.IN_CHANNEL, self.FILTER_NUM[i], self.WORD_DIM * self.FILTERS[i], stride=self.WORD_DIM)
            setattr(self, f'conv_{i}', conv)

        self.fc = nn.Linear(sum(self.FILTER_NUM), self.CLASS_SIZE)

    def get_conv(self, i):
        return getattr(self, f'conv_{i}')

    def forward(self, inp):
        x = self.embedding(inp).view(-1, 1, self.WORD_DIM * self.MAX_SENT_LEN)
        if self.MODEL == "multichannel":
            x2 = self.embedding2(inp).view(-1, 1, self.WORD_DIM * self.MAX_SENT_LEN)
            x = torch.cat((x, x2), 1)

        conv_results = [
            F.max_pool1d(F.relu(self.get_conv(i)(x)), self.MAX_SENT_LEN - self.FILTERS[i] + 1)
                .view(-1, self.FILTER_NUM[i])
            for i in range(len(self.FILTERS))]

        x = torch.cat(conv_results, 1)
        x = F.dropout(x, p=self.DROPOUT_PROB, training=self.training)
        x = self.fc(x)

        return x

In [30]:
def train(data, params):
    if params["MODEL"] != "rand":
        # load word2vec
        print("loading word2vec...")
        word_vectors = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

        wv_matrix = []
        for i in range(len(data["vocab"])):
            word = data["idx_to_word"][i]
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))

        # one for UNK and one for zero padding
        wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
        wv_matrix.append(np.zeros(300).astype("float32"))
        wv_matrix = np.array(wv_matrix)
        params["WV_MATRIX"] = wv_matrix


    #device = 'cuda:{}'.format(params["GPU"]) if torch.cuda.is_available() and params["GPU"] != -1 else 'cpu'
    #model = CNN(**params).cuda(params["GPU"])
    model = CNN(**params).to(DEVICE)
    #model = CNN(**params).to(DEVICE)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    criterion = nn.CrossEntropyLoss()

    pre_dev_acc = 0
    max_dev_acc = 0
    max_test_acc = 0
    for e in range(params["EPOCH"]):
        data["train_x"], data["train_y"] = shuffle(data["train_x"], data["train_y"])

        for i in range(0, len(data["train_x"]), params["BATCH_SIZE"]):
            batch_range = min(params["BATCH_SIZE"], len(data["train_x"]) - i)

            batch_x = [[data["word_to_idx"][w] for w in sent] +
                       [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(sent))
                       for sent in data["train_x"][i:i + batch_range]]
            batch_y = [data["classes"].index(c) for c in data["train_y"][i:i + batch_range]]

            batch_x = Variable(torch.LongTensor(batch_x)).to(DEVICE)
            batch_y = Variable(torch.LongTensor(batch_y)).to(DEVICE)

            # batch_x = Variable(torch.LongTensor(batch_x))
            # batch_y = Variable(torch.LongTensor(batch_y))


            optimizer.zero_grad()
            model.train()
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=params["NORM_LIMIT"])
            optimizer.step()
            # print(i)

        dev_acc,_,_= test(data, model, params, mode="dev")
        test_acc,_,_ = test(data, model, params)
        print("epoch:", e + 1, "/ dev_acc:", dev_acc, "/ test_acc:", test_acc)

        if params["EARLY_STOPPING"] and (dev_acc-pre_dev_acc) < 0.001:
            print("early stopping by dev_acc!")
            break
        else:
            pre_dev_acc = dev_acc

        if dev_acc > max_dev_acc:
            max_dev_acc = dev_acc
            max_test_acc = test_acc
            best_model = copy.deepcopy(model)

    print("max dev acc:", max_dev_acc, "test acc:", max_test_acc)
    return best_model

In [31]:
def test(data, model, params, mode="test"):
    model.eval()

    if mode == "dev":
        x, y = data["dev_x"], data["dev_y"]
    elif mode == "test":
        x, y = data["test_x"], data["test_y"]

    x = [[data["word_to_idx"][w] if w in data["vocab"] else params["VOCAB_SIZE"] for w in sent] +
         [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(sent))
         for sent in x]

    # x = Variable(torch.LongTensor(x)).cuda(params["GPU"])
    x = Variable(torch.LongTensor(x)).to(DEVICE)
    y = [data["classes"].index(c) for c in y]
    y = Variable(torch.LongTensor(y)).to(DEVICE)

    pred = np.argmax(model(x).cpu().data.numpy(), axis=1)
    acc = sum([1 if p == y else 0 for p, y in zip(pred, y)]) / len(pred)

    with torch.no_grad():
        outputs = model(x)
        _, predicted = torch.max(outputs, 1)

    # Convert to numpy arrays
    y_true = y.cpu().numpy()
    y_pred = predicted.cpu().numpy()

    # Calculate accuracy
    acc = accuracy_score(y_true, y_pred)

    # Calculate weighted F1 score
    report = classification_report(y_true, y_pred, target_names=data["classes"], output_dict=True, zero_division=0)
    #report = classification_report(y_true, y_pred, target_names=data["classes"], output_dict=True)
    weighted_f1 = report['weighted avg']['f1-score']
    marco_f1 = report["macro avg"]['f1-score']
    print(report)
    return acc, weighted_f1, marco_f1

In [32]:
def read_ag_data(seed):
    data = {}

    def read(mode):
        x,y = [], []

        # x_train, y_train = [], []
        # x_test, y_test =[],[]
        with open("drive/MyDrive/CPSC_577_FP/ag_sentences_clean.txt", "r", encoding="utf-8") as f:
            for line in f:
              if line[-1] == "\n":
                line = line[:-1]
              # y.append(line.split()[0].split(":")[0])
              x.append(line.split())

        with open("drive/MyDrive/CPSC_577_FP/ag_labels.txt", "r", encoding="utf-8") as d:
            for line in d:
              if line[-1] == "\n":
                line = line[:-1]
              y.append(line.split()[2])

        x, y = shuffle(x, y, random_state=seed)

        x_train=x[7600:]
        y_train=y[7600:]
        x_test=x[0:7600]
        y_test=y[0:7600]

        x_train, y_train = shuffle(x_train, y_train)
        x_test, y_test = shuffle(x_test, y_test)


        if mode == "train":
            dev_idx = len(x) // 10
            data["dev_x"], data["dev_y"] = x_train[:dev_idx], y_train[:dev_idx]
            data["train_x"], data["train_y"] = x_train[dev_idx:], y_train[dev_idx:]
        else:
            data["test_x"], data["test_y"] = x_test, y_test

    read("train")
    read("test")

    return data

In [33]:
def read_r8_data(seed):
    data = {}

    def read(mode):
        x,y = [], []

        # x_train, y_train = [], []
        # x_test, y_test =[],[]
        with open("drive/MyDrive/CPSC_577_FP/r8_sentences_clean.txt", "r", encoding="utf-8") as f:
            for line in f:
              if line[-1] == "\n":
                line = line[:-1]
              # y.append(line.split()[0].split(":")[0])
              x.append(line.split())

        with open("drive/MyDrive/CPSC_577_FP/r8_labels.txt", "r", encoding="utf-8") as d:
            for line in d:
              if line[-1] == "\n":
                line = line[:-1]
              y.append(line.split()[2])


        x, y = shuffle(x, y, random_state=seed)

        x_train=x[0:5485]
        y_train=y[0:5485]
        x_test=x[5485:]
        y_test=y[5485:]

        x_train, y_train = shuffle(x_train, y_train)
        x_test, y_test = shuffle(x_test, y_test)


        if mode == "train":
            dev_idx = len(x) // 10
            data["dev_x"], data["dev_y"] = x_train[:dev_idx], y_train[:dev_idx]
            data["train_x"], data["train_y"] = x_train[dev_idx:], y_train[dev_idx:]
        else:
            data["test_x"], data["test_y"] = x_test, y_test

    read("train")
    read("test")

    return data

In [34]:
def read_as_data(seed):
    data = {}

    def read(mode):
        x,y = [], []

        # x_train, y_train = [], []
        # x_test, y_test =[],[]
        with open("drive/MyDrive/CPSC_577_FP/twitter_asian_prejudice_sentences_clean.txt", "r", encoding="utf-8") as f:
            for line in f:
              if line[-1] == "\n":
                line = line[:-1]
              # y.append(line.split()[0].split(":")[0])
              x.append(line.split())

        with open("drive/MyDrive/CPSC_577_FP/twitter_asian_prejudice_labels.txt", "r", encoding="utf-8") as d:
            for line in d:
              if line[-1] == "\n":
                line = line[:-1]
              y.append(line.split()[0])




        x, y = shuffle(x, y, random_state=seed)

        x_train=x[0:16000]
        y_train=y[0:16000]
        x_test=x[16000:]
        y_test=y[16000:]

        x_train, y_train = shuffle(x_train, y_train)
        x_test, y_test = shuffle(x_test, y_test)


        if mode == "train":
            dev_idx = len(x) // 10
            data["dev_x"], data["dev_y"] = x_train[:dev_idx], y_train[:dev_idx]
            data["train_x"], data["train_y"] = x_train[dev_idx:], y_train[dev_idx:]
        else:
            data["test_x"], data["test_y"] = x_test, y_test

    read("train")
    read("test")

    return data

In [35]:
def main1(seed):
    # parser = argparse.ArgumentParser(description="-----[CNN-classifier]-----")
    # parser.add_argument("--mode", default="train", help="train: train (with test) a model / test: test saved models")
    # parser.add_argument("--model", default="rand", help="available models: rand, static, non-static, multichannel")
    # parser.add_argument("--dataset", default="TREC", help="available datasets: MR, TREC")
    # parser.add_argument("--save_model", default=False, action='store_true', help="whether saving model or not")
    # parser.add_argument("--early_stopping", default=False, action='store_true', help="whether to apply early stopping")
    # parser.add_argument("--epoch", default=100, type=int, help="number of max epoch")
    # parser.add_argument("--learning_rate", default=1.0, type=float, help="learning rate")
    # parser.add_argument("--gpu", default=-1, type=int, help="the number of gpu to be used")

    # options = parser.parse_args()
    # ag_data = read_ag_data()
    r8_data = read_r8_data(seed)
    data = r8_data
    print(len(data["train_x"]))
    data["vocab"] = sorted(list(set([w for sent in data["train_x"] + data["dev_x"] + data["test_x"] for w in sent])))
    data["classes"] = sorted(list(set(data["train_y"])))
    data["word_to_idx"] = {w: i for i, w in enumerate(data["vocab"])}
    data["idx_to_word"] = {i: w for i, w in enumerate(data["vocab"])}

    params = {
        "MODEL": "rand",
        "DATASET": "",
        #"SAVE_MODEL": options.save_model,
        "EARLY_STOPPING": True,
        "EPOCH": 50, # 50
        "LEARNING_RATE": 2e-2,
        "MAX_SENT_LEN": max([len(sent) for sent in data["train_x"] + data["dev_x"] + data["test_x"]]),
        "BATCH_SIZE": 50,
        "WORD_DIM": 300,
        "VOCAB_SIZE": len(data["vocab"]),
        "CLASS_SIZE": len(data["classes"]),
        "FILTERS": [3, 4, 5],
        "FILTER_NUM": [100, 100, 100],
        "DROPOUT_PROB": 0.5,
        "NORM_LIMIT": 3,
        "GPU": 1
    }

    model = train(data, params)
    test_acc,test_f1, marco_f1 = test(data, model, params)
    print("test acc:", test_acc, "test weighted f1:", test_f1,"Macro F1", marco_f1)
    return test_acc, test_f1, marco_f1


In [36]:
def main2(seed):
    #ag_data = read_ag_data()
    as_data = read_as_data(seed)
    data = as_data
    print(len(data["train_x"]))
    data["vocab"] = sorted(list(set([w for sent in data["train_x"] + data["dev_x"] + data["test_x"] for w in sent])))
    data["classes"] = sorted(list(set(data["train_y"])))
    data["word_to_idx"] = {w: i for i, w in enumerate(data["vocab"])}
    data["idx_to_word"] = {i: w for i, w in enumerate(data["vocab"])}

    params = {
        "MODEL": "rand",
        "DATASET": "",
        #"SAVE_MODEL": options.save_model,
        "EARLY_STOPPING": True,
        "EPOCH": 50, # 50
        "LEARNING_RATE": 1e-3, #2e-2
        "MAX_SENT_LEN": max([len(sent) for sent in data["train_x"] + data["dev_x"] + data["test_x"]]),
        "BATCH_SIZE": 50,
        "WORD_DIM": 300,
        "VOCAB_SIZE": len(data["vocab"]),
        "CLASS_SIZE": len(data["classes"]),
        "FILTERS": [3, 4, 5],
        "FILTER_NUM": [100, 100, 100],
        "DROPOUT_PROB": 0.5,
        "NORM_LIMIT": 3,
        "GPU": 1
    }

    model = train(data, params)
    test_acc,test_f1, marco_f1 = test(data, model, params)
    print("test acc:", test_acc, "test weighted f1:", test_f1,"Macro F1", marco_f1)
    return test_acc, test_f1, marco_f1


In [37]:
def calculate_mean_and_std(data):
    mean = np.mean(data)
    std_dev = np.std(data)

    return mean, std_dev

In [15]:
seeds=[33, 15, 86, 109, 78]

Times = 5
accuracies = []
weighted_f1s = []
marco_f1s = []
for i in range(Times):
  seed=seeds[i]
  test_acc, test_f1, marco_f1 = main1(seed)
  accuracies.append(test_acc)
  weighted_f1s.append(test_f1)
  marco_f1s.append(marco_f1)


test_acc,test_acc_std = calculate_mean_and_std(accuracies)
test_f1,test_f1_std = calculate_mean_and_std(weighted_f1s)
marco_f1,marco_f1_std  = calculate_mean_and_std(marco_f1s)

4718
{'acq': {'precision': 0.5538922155688623, 'recall': 0.8894230769230769, 'f1-score': 0.6826568265682655, 'support': 208}, 'crude': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 35}, 'earn': {'precision': 0.8775981524249422, 'recall': 0.9069212410501193, 'f1-score': 0.892018779342723, 'support': 419}, 'grain': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7}, 'interest': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 26}, 'money-fx': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 30}, 'ship': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}, 'trade': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'accuracy': 0.7366362451108214, 'macro avg': {'precision': 0.17893629599922556, 'recall': 0.22454303974664952, 'f1-score': 0.19683445073887357, 'support': 767}, 'weighted avg': {'precision': 0.6296260843603313, 'recall': 0.7366362451108214, 'f1-score': 0.6724230618915257, 'support': 767}}
{'ac

In [16]:
print("test acc:", test_acc,'std:',test_acc_std ,"test weighted f1:", test_f1,"std:",test_f1_std, "Macro F1:", marco_f1,'std:',marco_f1_std)

test acc: 0.865874828688899 std: 0.07149323439486739 test weighted f1: 0.8399342060878414 std: 0.09634380635460459 Macro F1: 0.546367858863642 std: 0.17507308250350517


In [24]:
seeds=[33, 15, 86, 109, 78]
Times = 5
accuracies = []
weighted_f1s = []
marco_f1s = []
for i in range(Times):
  seed=seeds[i]
  test_acc, test_f1, marco_f1 = main2(seed)
  accuracies.append(test_acc)
  weighted_f1s.append(test_f1)
  marco_f1s.append(marco_f1)


test_acc,test_acc_std = calculate_mean_and_std(accuracies)
test_f1,test_f1_std = calculate_mean_and_std(weighted_f1s)
marco_f1,marco_f1_std  = calculate_mean_and_std(marco_f1s)

14000
{'counter_speech': {'precision': 0.008670520231213872, 'recall': 0.21428571428571427, 'f1-score': 0.016666666666666666, 'support': 14}, 'discussion_of_eastasian_prejudice': {'precision': 0.07017543859649122, 'recall': 0.23076923076923078, 'f1-score': 0.10762331838565022, 'support': 104}, 'entity_directed_criticism': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 147}, 'entity_directed_hostility': {'precision': 0.3151515151515151, 'recall': 0.1489971346704871, 'f1-score': 0.20233463035019458, 'support': 349}, 'none_of_the_above': {'precision': 0.7925021795989537, 'recall': 0.6558441558441559, 'f1-score': 0.7177260165811291, 'support': 1386}, 'accuracy': 0.494, 'macro avg': {'precision': 0.2372999307156348, 'recall': 0.24997924711391759, 'f1-score': 0.20887012639672814, 'support': 2000}, 'weighted avg': {'precision': 0.6079077663046504, 'recall': 0.49400000000000005, 'f1-score': 0.538404601709552, 'support': 2000}}
{'counter_speech': {'precision': 0.00582241630276564

In [25]:
print("test acc:", test_acc,'std:',test_acc_std ,"test weighted f1:", test_f1,"std:",test_f1_std, "Macro F1:", marco_f1,'std:',marco_f1_std)

test acc: 0.6787 std: 0.007327004845091878 test weighted f1: 0.5539507741388505 std: 0.01046713158471731 Macro F1: 0.16588329502285074 std: 0.0033537545549390457
