# Aggression Detection

https://sites.google.com/view/trac1/shared-task

## Imports

In [None]:
import csv
import nltk
import numpy as np
import time
import matplotlib.pyplot as plt 

from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from google.cloud import translate
from sklearn.metrics import classification_report


## Parameters

In [None]:
# --- Text vectorisation ---------------------------------------------------
# Passed to Tokenizer(num_words=...) and used as the Embedding input size.
MAX_VOCABULARY_SIZE = 20000
# Length every token sequence is padded/truncated to by pad_sequences.
MAX_SEQ_LENGTH = 150
# Width of the embedding vectors learned by the model.
EMBEDDING_DIM = 16

# --- Training schedule ----------------------------------------------------
EPOCHS = 10
BATCH_SIZE = 32

# --- Data augmentation ----------------------------------------------------
# Pivot languages for back-translation. Each entry is passed to the
# translation client and is also the suffix of the augmented CSV file name.
augmentation_languages = [
    'ger',
    'fre',
    'ru',
]

# --- Dataset locations ----------------------------------------------------
training_data = 'train/agr_en_train.csv'
validation_data = 'train/agr_en_dev.csv'
test_data = 'test/agr_en_fb_gold.csv'

## Constants

In [None]:
# One-hot target vector for every label string that appears in column 2 of
# the data files.
categories = {
    'NAG': [1, 0, 0],
    'CAG': [0, 1, 0],
    'OAG': [0, 0, 1],
}

# English stop-word list, held as a set so the membership test in
# preprocess_data is a constant-time lookup.
STOPWORDS = set(stopwords.words('english'))

# Single lemmatizer instance shared by all preprocess_data calls.
wnl = nltk.WordNetLemmatizer()

## Define helper functions

In [None]:
def augument_and_save(row, language, filename):
    """Back-translate one dataset row and append the result to a CSV file.

    The text in ``row[1]`` is translated to ``language`` and then back to
    English with the module-level ``translate_client``. The resulting
    paraphrase is appended, together with ``row[0]`` and ``row[2]``, to
    ``augmented/<filename>_<language>.csv``.

    Args:
        row: Sequence with at least three items; ``row[1]`` is the sentence,
            ``row[0]`` and ``row[2]`` are copied to the output unchanged.
        language: Code of the pivot language, passed to the translation
            client and used as the output file suffix.
        filename: Base name (without extension) of the output file.

    Returns:
        The back-translated sentence.

    Raises:
        Whatever the translation client or the file system raises; nothing is
        caught here so the caller can decide whether to retry.
    """
    # Drop quote characters wrapping the sentence.
    sentence = row[1].strip('"').strip("'")

    # NOTE(review): the Cloud Translation documentation lists two-letter
    # codes such as 'en'; confirm that 'eng' is accepted by the API version
    # in use.

    # Translate sentence to target language.
    # format_='text' asks for plain text; the default response is
    # HTML-escaped (e.g. "&#39;" for an apostrophe), which would leak
    # entity fragments into the augmented data.
    translation = translate_client.translate(
        sentence,
        source_language='eng',
        target_language=language,
        format_='text')['translatedText']

    # Translate sentence from target language back to english.
    augmentation = translate_client.translate(
        translation,
        source_language=language,
        target_language='eng',
        format_='text')['translatedText']

    # Save to disk. newline='' lets the csv module control line endings, as
    # its documentation requires (avoids blank rows on Windows).
    out_path = 'augmented/' + filename + "_" + language + ".csv"
    with open(out_path, 'a', newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow([row[0], augmentation, row[2]])

    return augmentation

def preprocess_data(sentence):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Each token produced by ``nltk.word_tokenize`` is kept only if it is
    purely alphabetic; it is then lower-cased, lemmatized with the shared
    ``wnl`` lemmatizer and finally dropped if the lemma is in ``STOPWORDS``.

    Args:
        sentence: The text to process.

    Returns:
        List of surviving lemmas, in their original order.
    """
    cleaned = []
    for raw_token in nltk.word_tokenize(sentence):
        # Discard punctuation, numbers and mixed alphanumeric tokens.
        if not raw_token.isalpha():
            continue

        lemma = wnl.lemmatize(raw_token.lower())

        # The stop-word check runs on the lemma, not on the surface form.
        if lemma in STOPWORDS:
            continue

        cleaned.append(lemma)

    return cleaned

## Augment data

In [None]:
# Client used by augument_and_save for every translation request.
translate_client = translate.Client()

# Base name of the training file (directory and '.csv' removed). It is the
# same for every row, so compute it once instead of once per row.
filename = training_data.split("/")[-1].replace('.csv', '')

# Back-translate every training row through every pivot language. Each
# request gets up to three attempts with a one-minute pause in between.
with open(training_data, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if not row:
            # Blank line in the CSV: there is nothing to translate.
            continue

        for lang in augmentation_languages:
            for attempt in range(1, 4):
                try:
                    augument_and_save(row, lang, filename)
                    break
                except Exception as err:
                    # `except Exception` (not a bare except) so Ctrl-C and
                    # SystemExit still stop the run immediately.
                    if attempt == 3:
                        # Give up without another sleep and keep the root
                        # cause attached to the raised error.
                        raise RuntimeError("Google Translate API request failed 3 times in a row.") from err
                    print("Google Translate API req failed, sleeping for 1 min before trying again..")
                    time.sleep(60)


## Import training, validation, and test data

In [None]:
def _read_split(path, label_counter=None):
    """Load one CSV file as (token lists, one-hot labels).

    Args:
        path: CSV file whose rows carry the sentence in column 1 and the
            label string in column 2.
        label_counter: Optional dict mapping label -> count; when given, the
            entry of every label read is incremented in place.

    Returns:
        Tuple ``(texts, labels)``: ``texts`` holds the output of
        ``preprocess_data`` per row, ``labels`` the matching ``categories``
        vector.

    Raises:
        KeyError: If a row carries a label that is not known.
    """
    texts = []
    labels = []
    # newline='' is what the csv module documentation asks for, so quoted
    # fields containing line breaks are parsed correctly.
    with open(path, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if not row:
                # Blank line: skip instead of failing on row[1].
                continue
            sentence = row[1].strip('"').strip("'")
            label = row[2]
            if label_counter is not None:
                label_counter[label] += 1
            texts.append(preprocess_data(sentence))
            labels.append(categories[label])
    return texts, labels


# Number of training examples per label, original plus augmented.
counter = {'NAG': 0, 'CAG': 0, 'OAG': 0}

# Training files: the original file plus one augmented file per language.
filename = training_data.split("/")[-1].replace('.csv', '')
training_paths = [training_data]
training_paths.extend(
    'augmented/' + filename + '_' + language + '.csv'
    for language in augmentation_languages
)

# Import training data
train_X = []
train_y = []
for path in training_paths:
    split_X, split_y = _read_split(path, counter)
    train_X.extend(split_X)
    train_y.extend(split_y)

# Import validation data
val_X, val_y = _read_split(validation_data)

# Import test data
test_X, test_y = _read_split(test_data)

print(counter)

## Vectorize dataset

In [None]:
# Build the word -> integer index from the training split only; the
# validation and test splits are not passed to fit_on_texts.
# num_words=MAX_VOCABULARY_SIZE limits the vocabulary used when texts are
# later converted to sequences (see the Keras Tokenizer docs for the exact
# cut-off semantics).
tokenizer = Tokenizer(num_words=MAX_VOCABULARY_SIZE)
# train_X holds lists of tokens (output of preprocess_data), not raw strings.
tokenizer.fit_on_texts(train_X)
# word_index contains every token seen during fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
def _vectorize(token_lists):
    """Map token lists to integer ids and pad them to MAX_SEQ_LENGTH."""
    id_sequences = tokenizer.texts_to_sequences(token_lists)
    return pad_sequences(id_sequences, maxlen=MAX_SEQ_LENGTH)


# All three splits go through the same fitted tokenizer and padding length.
train_X_vec = _vectorize(train_X)
val_X_vec = _vectorize(val_X)
test_X_vec = _vectorize(test_X)

print('train_X_vec shape:', train_X_vec.shape)
print('val_X_vec shape:', val_X_vec.shape)
print('test_X_vec shape:', test_X_vec.shape)

## Convert to Numpy array

In [None]:
# Convert every split (inputs and one-hot targets) to NumPy arrays so they
# can be passed straight to model.fit / model.evaluate.
train_X_vec, train_y = np.array(train_X_vec), np.array(train_y)
val_X_vec, val_y = np.array(val_X_vec), np.array(val_y)
test_X_vec, test_y = np.array(test_X_vec), np.array(test_y)

## Define model

In [None]:
# Sequence classifier: embedding -> LSTM -> small dense head -> softmax over
# the three classes defined in `categories`.
model = Sequential([
    # Learned embedding for token ids produced by the tokenizer.
    Embedding(MAX_VOCABULARY_SIZE, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.5, recurrent_dropout=0.5),
    Dense(25, activation='relu'),
    # One output unit per entry of the one-hot label vectors.
    Dense(3, activation='softmax'),
])

# NOTE(review): newer Keras releases spell this argument `learning_rate`;
# `lr` is kept for the TensorFlow version this notebook was written against.
# Confirm before upgrading.
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.0005), metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()


## Train model

In [None]:
# Save a checkpoint whenever the validation loss reaches a new minimum.
# The monitored quantity has to agree with `mode`: watching validation
# accuracy with mode="min" would keep the *worst* model. val_loss is also the
# value written into the file name, and its history key is the same in every
# Keras version.
checkpoint_val = ModelCheckpoint(
    'models/{epoch:02d}_val_{val_loss:.4f}.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Train on the (augmented) training set and evaluate on the validation set
# after every epoch; `history` is used for the plots.
history = model.fit(
    train_X_vec,
    train_y,
    validation_data=(val_X_vec, val_y),
    callbacks=[checkpoint_val],
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

## Test model

In [None]:
# NOTE: despite its name, `accuracy` holds everything evaluate() returns
# (for this compiled model: the loss followed by the accuracy metric).
# The name is kept so later cells that use it keep working.
accuracy = model.evaluate(x=test_X_vec, y=test_y)

# Class index of every gold label (position of the 1 in the one-hot vector).
Y_test = np.argmax(test_y, axis=1)

# Sequential.predict_classes was removed from tf.keras. For a softmax output
# it was defined as the argmax over the predicted probabilities, so this is
# the same result and works on both old and new versions.
y_pred = np.argmax(model.predict(test_X_vec), axis=-1)
print(classification_report(Y_test, y_pred))

## Plots

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('categorical cross-entropy loss')
plt.xlabel('epoch')
plt.legend(['training set', 'validation set'], loc='upper right')
plt.show()

# Training vs. validation accuracy per epoch.
# Older Keras versions store the metric under 'acc', newer ones under
# 'accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Accuracy')
# The plotted metric is the accuracy the model was compiled with, not an
# F1 score.
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['training set', 'validation set'], loc='upper right')
plt.show()