## Installing requirements

In [None]:
# Install the notebook's dependencies with pip from the file named `requirements`.
# NOTE(review): pip needs the exact file path — confirm the file is not
# actually called `requirements.txt`.
!pip install -r requirements

## - Import

In [None]:
from moduls import *

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('torch version:', torch.__version__)
print('device:', device)

## - Parameters

In [None]:
# Mini-batch size used by every DataLoader in this notebook
batch_size = 16

# Preprocessing: maximum length passed to the dataset wrapper
word_max_len = 128

# Model params: the four sizes handed to the model constructor
h1, h2, h3, h4 = 768, 128, 128, 64

# Dropout probability.
# NOTE(review): the model cell passes a literal 0.5 instead of this value —
# confirm which one is intended.
drop_out_rate = 0.2

# Training
epochs = 30
learning_rate = 3e-6

In [None]:
# data paths
train_data_path1 = 'Data/agr_en_train.csv'
train_data_path2 = 'Data/trac2_eng_train.csv'
valid_data_path = 'Data/agr_en_dev.csv'
test_data_path = 'Data/trac2_eng_dev.csv'
cag_gptData_path = 'Data/gpt_CAG_data.csv'
oag_gptData_path = 'Data/OAG_data.csv'

# reading datasets
# The two generated-data files are read with pandas' default header handling.
cag_gptData = pd.read_csv(cag_gptData_path)
oag_gptData = pd.read_csv(oag_gptData_path)

# The remaining files are read with explicit column names, keeping only the
# text and its aggression class.
df_train1 = pd.read_csv(train_data_path1, names=['SOURCE', 'TEXT', 'AGGRESSION_CLASS'],
                        usecols=['TEXT', 'AGGRESSION_CLASS'])
df_val = pd.read_csv(valid_data_path, names=['SOURCE', 'TEXT', 'AGGRESSION_CLASS'],
                     usecols=['TEXT', 'AGGRESSION_CLASS'])
df_train2 = pd.read_csv(train_data_path2, names=['SOURCE', 'TEXT', 'AGGRESSION_CLASS', "_"],
                        usecols=['TEXT', 'AGGRESSION_CLASS'])
df_test = pd.read_csv(test_data_path, names=['SOURCE', 'TEXT', 'AGGRESSION_CLASS', "_"],
                      usecols=['TEXT', 'AGGRESSION_CLASS'])

# Because `names` is passed, pandas treats the first line of each file as data.
# Drop that first row from the two trac2 frames (presumably their header line
# — confirm against the CSVs).  This must happen BEFORE the filtering below:
# previously the filter ran first, so the dropped row still leaked into
# df_train_for_concat whenever its class value was not "NAG".
df_train2.drop(df_train2.index[0], inplace=True)
df_test.drop(df_test.index[0], inplace=True)

# defining data sets to merge with train data set
df_train_for_concat = df_train2[df_train2["AGGRESSION_CLASS"] != "NAG"]
cag_gpt_for_concat = cag_gptData[cag_gptData["AGGRESSION_CLASS"] == "CAG"]
oag_gpt_for_concat = oag_gptData[oag_gptData["AGGRESSION_CLASS"] == "OAG"]

In [None]:
# Stack the base training frame and the three extra frames, in that order,
# into one frame with a fresh 0..n-1 index.
df_train = pd.concat(
    [df_train1, df_train_for_concat, cag_gpt_for_concat, oag_gpt_for_concat],
    ignore_index=True,
)

In [None]:
# Per-class row counts of each split, computed up front and then reported.
train_class_counts = df_train['AGGRESSION_CLASS'].value_counts()
val_class_counts = df_val['AGGRESSION_CLASS'].value_counts()
test_class_counts = df_test['AGGRESSION_CLASS'].value_counts()

separator = "__________________________________________"

print(f"df_train vale counts: {train_class_counts}")
print(separator)

print(f"\ndf_val vale counts: {val_class_counts}")
print(separator)

print(f"\ndf_test vale counts: {test_class_counts}")
print(separator)


## - Data Preprocessing

In [None]:
# Shuffle data: seed NumPy once, then reorder the rows of each split by a
# random permutation of its index (train first, then valid, then test, so the
# random draws happen in a fixed order).
np.random.seed(41)
train_shuffled, valid_shuffled, test_shuffled = (
    frame.reindex(np.random.permutation(frame.index))
    for frame in (df_train, df_val, df_test)
)


In [None]:
# Integer id of each aggression class; the insertion order (CAG, OAG, NAG) is
# also the order in which the per-class rows are stacked below.
aggression_label_ids = {'CAG': 0, 'OAG': 1, 'NAG': 2}

#------------------Train---------------------
CAG, OAG, NAG = (
    train_shuffled[train_shuffled['AGGRESSION_CLASS'] == class_name]
    for class_name in aggression_label_ids
)
# Rows with any other class value are left out of the concatenation.
concated_train = pd.concat([CAG, OAG, NAG], ignore_index=True)
concated_train['LABEL'] = (
    concated_train['AGGRESSION_CLASS'].map(aggression_label_ids).astype('int64')
)


#------------------Valid---------------------
CAG, OAG, NAG = (
    valid_shuffled[valid_shuffled['AGGRESSION_CLASS'] == class_name]
    for class_name in aggression_label_ids
)
concated_valid = pd.concat([CAG, OAG, NAG], ignore_index=True)
concated_valid['LABEL'] = (
    concated_valid['AGGRESSION_CLASS'].map(aggression_label_ids).astype('int64')
)


#------------------Test---------------------
CAG, OAG, NAG = (
    test_shuffled[test_shuffled['AGGRESSION_CLASS'] == class_name]
    for class_name in aggression_label_ids
)
concated_test = pd.concat([CAG, OAG, NAG], ignore_index=True)
concated_test['LABEL'] = (
    concated_test['AGGRESSION_CLASS'].map(aggression_label_ids).astype('int64')
)

In [None]:
# Show how many rows carry each integer label, for train, valid and test in turn.
for encoded_split in (concated_train, concated_valid, concated_test):
    print(encoded_split["LABEL"].value_counts())

In [None]:
# X data: the TEXT column of each encoded split
X_train, X_valid, X_test = (
    encoded['TEXT'] for encoded in (concated_train, concated_valid, concated_test)
)

In [None]:
# Class info: class names and how many there are
class_list = ['CAG', 'OAG', 'NAG']
class_num = len(class_list)

print('Class list:', class_list)
print('Number of class:', class_num)

In [None]:
# one-hot encoding
# The width of the encoding comes from `class_num` (len(class_list)) rather
# than a separate literal 3, so it cannot drift from the class list.
y_train = to_categorical(concated_train['LABEL'], num_classes=class_num)
y_valid = to_categorical(concated_valid['LABEL'], num_classes=class_num)
y_test = to_categorical(concated_test['LABEL'], num_classes=class_num)

print('Size of train labels:', y_train.shape)
print('Size of valid labels:', y_valid.shape)
print('Size of test labels:', y_test.shape)

## - PyTorch Dataset: Converting BERT Input

In [None]:
# Each split is wrapped in a BigBirdData dataset (texts, one-hot labels,
# maximum length) and batched with that dataset's own collate_fn.

# Train: batches are drawn in random order
train_dataset = BigBirdData(X_train, y_train, word_max_len)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size,
                              collate_fn=train_dataset.collate_fn)

# Valid: fixed order
validate_dataset = BigBirdData(X_valid, y_valid, word_max_len)
validate_sampler = SequentialSampler(validate_dataset)
validate_dataloader = DataLoader(validate_dataset, sampler=validate_sampler, batch_size=batch_size,
                                 collate_fn=validate_dataset.collate_fn)

# Test: fixed order.
# Built from the held-out test split; it was previously built from
# X_valid / y_valid, which made the "test" metrics a repeat of validation.
test_dataset = BigBirdData(X_test, y_test, word_max_len)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size,
                             collate_fn=test_dataset.collate_fn)

In [None]:
# Take the first batch from the training loader and hand its elements to
# `inspect` under the given names.
first_batch = next(iter(train_dataloader))
inspect(*first_batch, names='tokens_ids, labels, masks')

## - BERT-Based Aggression Identification Model

## - Train & Valid

In [None]:
def train(data, model, optimizer, loss_fn):
    """Run one optimisation step on a single batch.

    Args:
        data: Batch that unpacks into ``(tokens_ids, labels, masks)``.
        model: Called as ``model(tokens_ids, masks)``; switched to train mode.
        optimizer: Optimizer whose ``step()`` applies the parameter update.
        loss_fn: Called as ``loss_fn(outputs, labels)``.

    Returns:
        ``(loss, acc)`` — the batch loss as returned by ``loss_fn`` and the
        fraction of rows whose ``argmax(-1)`` prediction equals the
        ``argmax(-1)`` of the label row.
    """
    # Set train
    model.train()

    # Get batch data
    tokens_ids, labels, masks = data

    # Inference
    outputs = model(tokens_ids, masks)

    # Get loss (computed on the raw label rows, before they are reduced below)
    loss = loss_fn(outputs, labels)

    # Get class
    preds = outputs.argmax(-1)
    labels = labels.argmax(-1)

    # Tensor.sum() reduces in a single op; the builtin sum() would iterate the
    # tensor element by element in Python.
    acc = (preds == labels).sum() / len(labels)

    # Init gradient
    model.zero_grad()

    # Backward propagation
    loss.backward()

    # Optimization
    optimizer.step()

    return loss, acc

In [None]:
@torch.no_grad()
def validate(data, model, loss_fn):
    """Evaluate one batch without gradients and record its predictions.

    Besides returning the metrics, this appends the batch's predicted and true
    class ids to the module-level lists ``total_predict`` and ``total_label``,
    which must already exist when the function is called.

    Args:
        data: Batch that unpacks into ``(tokens_ids, labels, masks)``.
        model: Called as ``model(tokens_ids, masks)``; switched to eval mode.
        loss_fn: Called as ``loss_fn(outputs, labels)``.

    Returns:
        ``(loss, acc)`` — the batch loss and the fraction of rows whose
        ``argmax(-1)`` prediction equals the ``argmax(-1)`` of the label row.
    """
    # Set valid
    model.eval()

    # Get batch data
    tokens_ids, labels, masks = data

    # Inference
    outputs = model(tokens_ids, masks)

    # Get loss (computed on the raw label rows, before they are reduced below)
    loss = loss_fn(outputs, labels)

    # Get class
    preds = outputs.argmax(-1)
    labels = labels.argmax(-1)

    # Tensor.sum() reduces in a single op; the builtin sum() would iterate the
    # tensor element by element in Python.
    acc = (preds == labels).sum() / len(labels)

    # Accumulate class ids as plain Python ints for the later F1 score and
    # confusion-matrix cells.
    total_predict.extend(preds.tolist())
    total_label.extend(labels.tolist())

    return loss, acc

In [None]:
@torch.no_grad()
def test(data, model, loss_fn):

    # Set valid
    model.eval()

    # Get batch data
    tokens_ids, labels, masks = data

    # Inference
    outputs = model(tokens_ids, masks)

    # Get loss
    loss = loss_fn(outputs, labels)

    # Get class
    preds = outputs.argmax(-1)
    labels = labels.argmax(-1)

    acc = (sum(preds==labels) / len(labels))

    return loss, acc

In [None]:
# Loss function
loss_fn = nn.CrossEntropyLoss()

# Build the model and move it to the selected device.
# NOTE(review): the dropout rate is passed as a literal 0.5 here, while the
# parameters cell defines drop_out_rate = 0.2 — confirm which is intended.
model = Bert_Aggression_Identification_Model(
    h1, h2, h3, h4, class_num,
    drop_out_rate=0.5,
).to(device)

# Collects the per-batch metrics recorded during training and validation
log = Logger()

# 


In [None]:
# Optimizer: Adam over all model parameters
optimizer = optim.Adam(
    model.parameters(),
    lr=learning_rate,
)

# Learning-rate scheduler: multiplies the rate by `gamma` every `step_size`
# scheduler steps; with gamma=1 the rate stays unchanged.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=5,
    gamma=1,
)

In [None]:
for epoch in range(epochs):
    # ---- Training pass: one optimisation step per batch ----
    n_train_batches = len(train_dataloader)
    for step, data in enumerate(train_dataloader, start=1):
        train_loss, train_acc = train(data, model, optimizer, loss_fn)
        # Fractional epoch position of this batch
        log.record(pos=epoch + (step / n_train_batches), train_loss=train_loss,
                   train_acc=train_acc, end='\r')

    # validate() appends to these module-level lists; start them empty each
    # epoch so they hold only the current epoch's validation results.
    total_predict = []
    total_label = []

    # ---- Validation pass ----
    n_valid_batches = len(validate_dataloader)
    for step, data in enumerate(validate_dataloader, start=1):
        val_loss, val_acc = validate(data, model, loss_fn)
        log.record(pos=epoch + (step / n_valid_batches), val_loss=val_loss,
                   val_acc=val_acc, end="\r")

    # Advance the scheduler once per epoch, then report this epoch's averages
    scheduler.step()
    log.report_avgs(epoch+1)

In [None]:
# Save only the model's state_dict.  The target directory is created first so
# that a missing `models/` folder cannot make the save fail after training.
import os

os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), 'models/model.pt')

In [None]:
# Ask the logger to plot what it recorded; `epochs` is passed through.
# NOTE(review): presumably draws the train/validation curves per epoch —
# confirm against Logger.plot_epochs.
log.plot_epochs(epochs)

## - F1 Score & Confusion Matrix

In [None]:
# F1 score, one value per class (average=None), computed from the labels and
# predictions collected by validate() in the last epoch that ran.
label_tensor = torch.tensor(total_label)
predict_tensor = torch.tensor(total_predict)
print('F1 scores:', f1_score(label_tensor, predict_tensor, average=None))

In [None]:
# Confusion Matrix over the same collected labels and predictions,
# with raw counts (normalize=False).
true_labels = np.array(total_label)
predicted_labels = np.array(total_predict)
plot_confusion_matrix(true_labels, predicted_labels, class_list=class_list,
                      normalize=False, title='Confusion Matrix')

## Testing

In [None]:
# Evaluate every batch of the test loader and report dataset-level metrics.
test_loss_list = []
test_acc_list = []
test_batch_sizes = []

for data in test_dataloader:
    test_loss, test_acc = test(data, model, loss_fn)
    test_loss_list.append(test_loss.cpu().numpy())
    test_acc_list.append(test_acc.cpu().numpy())
    # Remember how many rows this batch held (same unpacking as in test()).
    _, batch_labels, _ = data
    test_batch_sizes.append(len(batch_labels))

# Each batch value is already a per-batch mean, so weight it by the batch size:
# a plain mean over batches would over-weight a smaller final batch.
print(f"Test accuracy: {np.average(test_acc_list, weights=test_batch_sizes)}")
print(f"Test loss: {np.average(test_loss_list, weights=test_batch_sizes)}")