# Import necessary libraries

In [22]:
import datetime
import json
import math
import os
import re
import shutil
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

nltk.download('stopwords')

# Mount drive - if you are using Google Colab

In [2]:
# Execution environment switch: 'local' or 'colab'.
# It is read again further down to choose the base folder for all file paths.
run = 'local' # local, colab
if run == 'colab':
    # Imported only on Colab; the google.colab package is not needed for local runs.
    from google.colab import drive
    drive.mount('/content/drive')

# Load data

In [3]:
# Base folder for every file read or written by this notebook:
# the mounted Drive project folder on Colab, the working directory otherwise.
path = '/content/drive/My Drive/Hatefulle Ytringer Models/' if run == 'colab' else './'


In [23]:
# Load the tweets; only the raw text and its class label are kept.
Text_df = pd.read_csv(
    path + 'data/tweets_data.csv',
    usecols=['text', 'classification'],
    sep=',',
    encoding="UTF-8",
)
Text_df

# Check data records

In [24]:
# Number of cases in each class (rows per distinct 'classification' value),
# shown to check how balanced the labels are.
Text_df['classification'].value_counts()

# Define function to preprocess data

In [6]:
def process_tweet(df):
    '''
    Clean the raw tweets in df['text'] and store the result in df['tweet_proc'].

    Cleaning steps, in order: strip URLs, strip retweet markers and
    @mentions, decode the HTML entities &amp; / &lt; / &gt;, collapse runs of
    spaces, put a space between words and adjacent punctuation, lower-case,
    and trim surrounding whitespace.

    Input:
        df: a dataframe containing a column 'text' of strings of tweets
    Output:
        the same dataframe object (modified in place) with an added
        column 'tweet_proc' holding the cleaned text
    '''
    # Every pattern below is a regular expression, so regex=True is passed
    # explicitly: since pandas 2.0 str.replace treats patterns as literal
    # text by default, which would silently disable all of the cleaning.
    tweets = df['text']

    # remove URLs, then any leftover 'http' fragments
    tweets = tweets.str.replace(r'http\S+', '', regex=True)
    # NOTE(review): the dots are regex wildcards, so this also removes the
    # three characters following 'http ' - confirm whether a literal
    # 'http ...' (truncated link) was intended.
    tweets = tweets.str.replace(r'http ...', '', regex=True)
    tweets = tweets.str.replace(r'http', '', regex=True)

    # remove retweet markers ('RT @user') and remaining @mentions; the word
    # boundary keeps 'rt' at the end of an ordinary word (e.g. 'start @x')
    # from being treated as a retweet marker
    tweets = tweets.str.replace(r'\b(?:RT|rt)[ ]*@[ ]*\S+', '', regex=True)
    tweets = tweets.str.replace(r'@\S+', '', regex=True)

    # decode HTML entities: '&' becomes the Norwegian word 'og' (and)
    tweets = tweets.str.replace(r'&amp;?', 'og', regex=True)
    tweets = tweets.str.replace(r'&lt;', '<', regex=True)
    tweets = tweets.str.replace(r'&gt;', '>', regex=True)

    # collapse runs of spaces ('{2,}' must not contain a space, otherwise it
    # is matched as literal text instead of acting as a quantifier)
    tweets = tweets.str.replace(r'[ ]{2,}', ' ', regex=True)

    # insert a space between words and adjacent punctuation marks
    tweets = tweets.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', regex=True)
    tweets = tweets.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', regex=True)

    # lower case and strip white spaces at both ends
    tweets = tweets.str.lower().str.strip()

    df['tweet_proc'] = tweets
    return df

In [25]:
# clean text: adds the 'tweet_proc' column with the preprocessed tweets
Text_df = process_tweet(Text_df)
Text_df

# List the most common and least common words

In [26]:
# Word frequencies over the whole corpus. The tweets are joined with a space
# so the last word of one tweet is not fused with the first word of the next.
data = Counter(' '.join(Text_df['tweet_proc'].values).split())

# most_common() sorts the full counter, so do it once and slice both ends.
ranked_words = data.most_common()
print('Most occurance word: \n\n', ranked_words[:10])
print('\n\nLeast occurance word: \n\n', ranked_words[-10:])

We can observe that the dataset contains a lot of non-alphabetic words. These words do not add value to the model.

# List the number of words with a single occurrence

In [27]:
# Count the words that appear exactly once in the corpus.
count = sum(1 for num in data.values() if num == 1)
print('{} number of words occur only one time'.format(count))

# Further cleaning of data

In [10]:
# The emoticons are matched as literal text (regex=False), which avoids
# having to escape the parentheses; the last step is a real regular
# expression and says so explicitly, because the pandas default for
# str.replace changed from regex to literal matching in pandas 2.0.

# convert smile emoji to smile word
Text_df['tweet_proc'] = Text_df['tweet_proc'].str.replace(':-)', 'smile', regex=False)

# convert sad emoji to sad word ('trist' is Norwegian for sad)
Text_df['tweet_proc'] = Text_df['tweet_proc'].str.replace(':-(', 'trist', regex=False)

# remove all non-alphabetic characters (keeps a-z, the letters å ø æ and spaces)
Text_df['tweet_proc'] = Text_df['tweet_proc'].str.replace(r'[^a-zåøæ ]', '', regex=True)

# Remove stop words

In [11]:
# Norwegian stop words, held in a set so the per-word membership test below
# is O(1) instead of a scan through a list.
stop_words = set(stopwords.words('norwegian'))
# Keep the negations 'ikke' / 'ikkje' in the text (presumably because
# negation changes the meaning of a comment - confirm with the author).
stop_words -= {'ikke', 'ikkje'}

# Drop every stop word and re-join the remaining words with single spaces.
Text_df['tweet_proc'] = Text_df['tweet_proc'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_words))

# Remove comments that are shorter than four words after cleaning

In [28]:
# Keep only the rows whose cleaned text has more than three words.
has_enough_words = Text_df['tweet_proc'].map(lambda x: len(x.split()) > 3)
Text_df = Text_df[has_enough_words]
Text_df

# Count total number of words (vocabulary)

In [29]:
# Number of distinct words across all cleaned tweets.
all_words = ' '.join(Text_df['tweet_proc'].values).split()
vocab_length = len(set(all_words))
vocab_length

# Convert vocabulary into tokenizer and sequences

In [30]:
# Keras keeps only word indices strictly below num_words (index 0 is
# reserved), so num_words must be vocab_length + 1 to retain every word;
# with num_words=vocab_length the least frequent word is silently dropped.
tokenizer = Tokenizer(num_words=vocab_length + 1)
tokenizer.fit_on_texts(Text_df['tweet_proc'].values)

# Turn each tweet into a sequence of word indices, then pad/truncate at the
# end so every sequence has exactly 128 entries.
X = tokenizer.texts_to_sequences(Text_df['tweet_proc'].values)
X = pad_sequences(X, maxlen=128, padding='post', truncating='post')
Y = Text_df['classification']

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
print('vocab_size: ', vocab_size)

# Save tokenizer

In [15]:
import io

# Create the output folder if needed so the write below cannot fail on a
# fresh checkout.
os.makedirs(path + 'tokenizer', exist_ok=True)

tokenizer_json_2 = tokenizer.to_json()
# NOTE(review): to_json() already returns a JSON string, so json.dumps encodes
# it a second time. Kept as is because whatever loads this file presumably
# json.load()s it before calling tokenizer_from_json - confirm before changing.
with io.open(path + 'tokenizer/tokenizer.json_16102021_v1', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json_2, ensure_ascii=False))

# Split the data into train, validation and test sets

In [16]:
# 70 % train; the remaining 30 % is halved into validation and test (15 % each).
# Both splits are stratified on the label and use a fixed seed so the three
# sets are identical on every run.
X_train, X2, y_train, y2 = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
X_val, X_test, y_val, y_test = train_test_split(X2, y2, test_size=0.5, random_state=42, stratify=y2)

# Replace class label 99 with 1

In [17]:
# Map label 99 to 1 in all three label sets so the target is binary (0/1).
y_train, y_val, y_test = (
    labels.replace(99, 1) for labels in (y_train, y_val, y_test)
)

# Count number of cases used in train, test and validation sets

In [31]:
# Report the size and the class distribution of each split.
for split_name, split_X, split_y in (
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
):
    print(f'{split_name} data shape {split_X.shape}, {split_y.shape}, {Counter(split_y)}')

# Build  an Artificial Neural Network (ANN) model

In [19]:
def create_ann_model():
    """Build and compile the baseline classifier.

    Architecture: 28-dimensional embedding over `vocab_size` tokens, global
    average pooling, two ReLU dense layers (64 and 16 units) and a single
    sigmoid output unit. Compiled with Adam, binary cross-entropy loss and
    the accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential([
        Embedding(vocab_size, 28),
        GlobalAveragePooling1D(),
        Dense(64, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

def create_ann_model_2():
    """Build and compile the larger classifier variant with dropout.

    Architecture: 32-dimensional embedding over `vocab_size` tokens, global
    average pooling, ReLU dense layers of 64, 32 and 16 units with dropout
    (rate 0.2 after the first, 0.1 after the second) and a single sigmoid
    output unit. Compiled with Adam, binary cross-entropy loss and the
    accuracy metric.

    Requires `Dropout` from tensorflow.keras.layers to be imported at the
    top of the file.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential([
        Embedding(vocab_size, 32),
        GlobalAveragePooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.1),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# define checkpoint to store model that yields highest accuracy

In [20]:
# create folder to store current model
# The run timestamp is formatted explicitly: slicing str(datetime) to drop
# the microseconds removes part of the time whenever microsecond == 0.
# NOTE(review): the ':' characters in the name are not valid in Windows paths.
dt = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

# makedirs also creates the parent 'checkpoint/' folder when it is missing.
model_dir = path + 'checkpoint/' + f'model_{dt}'
os.makedirs(model_dir, exist_ok=True)
# Start from an empty folder so stale checkpoints cannot be picked up later.
for m in os.listdir(model_dir):
    os.remove(model_dir + '/' + m)

# Keras fills in {epoch} and {val_accuracy} when it writes a checkpoint.
checkpoint_filepath = path+f'checkpoint/model_{dt}/'+'ANN-{epoch:02d}-{val_accuracy:.4f}.hdf5'
# Save the weights only, and only when validation accuracy improves.
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [32]:
# create model
ann = create_ann_model()
# ann = create_ann_model_2()

# train and validate with default model
ann.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=15, verbose=0, batch_size=10, callbacks=[model_checkpoint_callback])
_, acc = ann.evaluate(X_test, y_test)
print('Test accuracy of default model: ', acc)

# find the best checkpoint in the saved folder
# File names look like 'ANN-<epoch>-<val_accuracy>.hdf5'. The best score seen
# so far is tracked while scanning; on a tie in the rounded accuracy the later
# epoch wins, since a checkpoint is only written when the score improved.
checkpoint_dir = path+f'checkpoint/model_{dt}/'
best_model = None
best_rank = None
for mpath in os.listdir(checkpoint_dir):
    if not mpath.endswith('.hdf5'):
        continue
    name_parts = mpath.split('-')
    rank = (float(name_parts[-1].replace('.hdf5', '')), int(name_parts[1]))
    if best_rank is None or rank > best_rank:
        best_model, best_rank = mpath, rank

if best_model is None:
    raise FileNotFoundError(f'No checkpoint files found in {checkpoint_dir}')

# load the best model
checkpoint_filepath = checkpoint_dir+best_model
ann.load_weights(checkpoint_filepath)

# test the model
yhat_prob = ann.predict(X_test)
_, acc = ann.evaluate(X_test, y_test)
print('Test accuracy of saved model: ', acc)

# Set the probability threshold for classification

In [66]:
# Predicted class per test sample: 1 when the probability is above 0.5, else 0.
yhat = (np.asarray(yhat_prob).ravel() > 0.5).astype(int).tolist()

# Create function to print model metrics

In [67]:
def print_metrics(real, predict):
    """Print the classification metrics for a set of predictions.

    Prints accuracy, precision, recall and F1 score, followed by the
    confusion matrix (rows and columns labelled 0 and 1) and the
    scikit-learn classification report.

    Args:
        real: the true labels.
        predict: the predicted labels.
    """
    scalar_metrics = (
        ('Accuracy: ', accuracy_score),
        ('\nPrecision: ', precision_score),
        ('\nrecall: ', recall_score),
        ('\nf1_score: ', f1_score),
    )
    for heading, metric in scalar_metrics:
        print(heading, metric(real, predict))

    class_labels = [0, 1]
    matrix = pd.DataFrame(confusion_matrix(real, predict), index=class_labels, columns=class_labels)
    print('\nconfusion_matrix:\n ', matrix)
    print('\nclassification_report:\n ', classification_report(real, predict))

In [33]:
# Metrics on the test set for the predictions currently held in yhat.
print_metrics(y_test, yhat)

# AUC ROC curve
<br>

True Positive Rate = True Positives / (True Positives + False Negatives)

False Positive Rate = False Positives / (False Positives + True Negatives)

In [34]:
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the diagonal reference line.

    Args:
        fpr: false positive rates, one per threshold.
        tpr: true positive rates, one per threshold.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label='ROC', color='orange')
    plt.plot(diagonal, diagonal, linestyle='--', color='darkblue')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

# Predicted probabilities as an array, used as the ranking score.
y_score = np.array(yhat_prob)

fpr, tpr, thresholds = roc_curve(y_test, y_score)
auc_value = roc_auc_score(y_test, y_score)
print('ROC_AUC Score: ', auc_value)

# find optimal threshold: the point where (TPR - FPR) is largest
tpr_minus_fpr = tpr - fpr
optimal_idx = tpr_minus_fpr.argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold value is:", optimal_threshold)
plot_roc_curve(fpr, tpr)

# Model accuracy based on optimal threshold


In [35]:
# roc_curve counts a sample as positive when score >= threshold, so the same
# comparison is used here; with '>' the sample sitting exactly on the chosen
# threshold would be classified as negative and the resulting metrics would
# not match the selected point on the ROC curve.
yhat = (np.asarray(yhat_prob).ravel() >= optimal_threshold).astype(int).tolist()
print_metrics(y_test, yhat)