In [1]:
import numpy as np
import pandas as pd
import pickle
from urllib.parse import urlparse
from nltk.corpus import stopwords

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing as tfkp

In [2]:
import string

def load_data():
    filename = "../../data/twitter_data_fixed.pkl"
    print("Loading data from file: " + filename)
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    ids = []
    for i in range(len(data)):
        text = "".join(l for l in data[i]['text'] if l not in string.punctuation)
        x_text.append((data[i]['text']).encode('utf-8'))
        labels.append(data[i]['label'])
    return x_text,labels

In [3]:
def is_url(url):
  try:
    result = urlparse(url)
    return all([result.scheme, result.netloc])
  except ValueError:
    return False

In [19]:
# embedding stuff

def get_embeddings(dim, dataset):
    sep = " "
    if dataset == "wiki":
        vector_file = f"glove_embeddings/glove.6B.{dim}d.txt"
    else:
        vector_file = f"glove_embeddings/glove.twitter.27B.{dim}d.txt"

    print("Loading data from file: " + vector_file)
    embed_dict = {}
    with open(vector_file, "r", encoding="utf8") as file:
        for line in file.readlines():
            row = line.strip().split(sep)
            embed_dict[row[0]] = row[1:]    # word : weights

    return embed_dict

def map_weights(embed_dict, vocab, embed_size): # embed size is embedding dim
    vocab_size = len(vocab)
    weights = np.zeros((vocab_size, embed_size))

    n_missed = 0
    words_missed = []
    for k,v in vocab.items():
        try:
            weights[v] = embed_dict[k]  # weights[v] is an index, embed_dict[k] is the list of weights
        except:
            n_missed += 1
            words_missed.append(k)
    print(f"{n_missed} embeddings missed of {vocab_size}")
    return weights, words_missed

### Loading Data

In [5]:
x_text, labels_og = load_data()
labels, uniques = pd.factorize(labels_og)

Loading data from file: ../../data/twitter_data_fixed.pkl


### Intitial Twitter-Specific Pre-processing

In [6]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

# decode to UTF-8
comments['comment'] = comments['comment'].str.decode("utf-8")

# remove missing rows
comments['comment'].dropna(inplace=True)

# remove usernames
comments['comment'] = comments['comment'].str.replace('(\@\w+.*?)',"", regex=True)

# lower case everything
comments['comment'] = comments['comment'].str.lower()

# remove URLs
comments['comment'] = [' '.join(y for y in x.split() if not is_url(y)) for x in comments['comment']]

# remove stop words
comments['comment'] = comments['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

# traditionally, would also lemmatize but this was not done in the main data

### Train-Test Split and Tokenization

In [7]:
dim = 25 #Using smaller dimensions due to twitter being heaps smaller

In [8]:
X_train_pretoken, X_midway_pretoken, y_train, y_midway = train_test_split(comments['comment'], comments['attack'], random_state = 42, test_size=0.2)

In [10]:
tokenizer = tfkp.text.Tokenizer(oov_token="<UNK>", filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)
tokenizer.fit_on_texts(X_train_pretoken)

convert = lambda x: tfkp.sequence.pad_sequences(tokenizer.texts_to_sequences(x),
                                                    maxlen=dim,
                                                    padding='post', truncating='post')

X_train = convert(X_train_pretoken)
X_midway = convert(X_midway_pretoken)

In [11]:
X_test, X_val, y_test, y_val = train_test_split(X_midway, y_midway, random_state = 42, test_size=0.5)

### Oversampling

In [12]:
oversample = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Glove Embedding Weights

In [20]:
embed_dict = get_embeddings(dim, 'twitter')
glove_weights, glove_words_missed = map_weights(embed_dict, tokenizer.word_index, dim)

Loading data from file: glove_embeddings/glove.twitter.27B.25d.txt


FileNotFoundError: [Errno 2] No such file or directory: 'glove_embeddings/glove.twitter.27B.25d.txt'

In [23]:
from os import walk

filenames = next(walk(''), (None, None, []))[2]  # [] if no file
print(filenames)

[]


#### Word2vec Weights

In [None]:
#space here for word2vec

#### Checks

In [14]:
print(np.shape(X_train))
print(np.shape(X_train_over))
print(np.shape(X_val))
print(np.shape(X_test))
print(np.shape(y_train))
print(np.shape(y_train_over))
print(np.shape(y_val))
print(np.shape(y_test))
print(np.shape(weights))

(12872, 25)
(26508, 25)
(1609, 25)
(1609, 25)
(12872,)
(26508,)
(1609,)
(1609,)


NameError: name 'weights' is not defined

In [15]:
from collections import Counter
print(Counter(y_train))
print(Counter(y_train_over))

Counter({0: 8836, 2: 2503, 1: 1533})
Counter({2: 8836, 0: 8836, 1: 8836})


### Write Embedders

In [None]:
from keras.layers import Embedding

vocab_size = len(tokenizer.word_index)
embed_size = dim
inp_dim = 25

random_embedder = Embedding(vocab_size, embed_size, input_length=inp_dim, trainable=True)
glove_embedding = Embedding(vocab_size, embed_size, embeddings_initializer=keras.initializers.Constant(glove_weights),
                            trainable=False)
word2vec_embedding = Embedding(vocab_size, embed_size, embeddings_initializer=keras.initializers.Constant(word2vec_weights),
                               trainable=False)