In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing as tfkp
import pickle
import preprocessing as p

In [35]:
# embedding stuff

def get_embeddings(dim, dataset):
    sep = " "
    if dataset == "wiki":
        vector_file = f"glove_embeddings/glove.6B.{dim}d.txt"
    else:
        vector_file = f"glove_embeddings/glove.twitter.27B.{dim}d.txt"

    embed_dict = {}
    with open(vector_file, "r", encoding="utf8") as file:
        for line in file.readlines():
            row = line.strip().split(sep)
            embed_dict[row[0]] = row[1:]    # word : weights

    return embed_dict

def map_weights(embed_dict, vocab, embed_size): # embed size is embedding dim
    vocab_size = len(vocab)
    weights = np.zeros((vocab_size, embed_size))

    n_missed = 0
    words_missed = []
    for k,v in vocab.items():
        try:
            weights[v] = embed_dict[k]  # weights[v] is an index, embed_dict[k] is the list of weights
        except:
            n_missed += 1
            words_missed.append(k)
    print(f"{n_missed} embeddings missed of {vocab_size}")
    return weights, words_missed

In [36]:
# data stuff

def load_data(filename):
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    for i in range(len(data)):
        if(HASH_REMOVE):
            x_text.append(p.tokenize((data[i]['text']).encode('utf-8')))
        else:
            x_text.append(data[i]['text'])
        labels.append(data[i]['label'])
    return x_text,labels

def get_filename(dataset):
    global N_CLASS, HASH_REMOVE
    if(dataset=="twitter"):
        filename = "../data/twitter_data.pkl"
        N_CLASS = 3
        HASH_REMOVE = False
    elif(dataset=="formspring"):
        N_CLASS = 2
        filename = "../data/formspring_data.pkl"
        HASH_REMOVE = False
    elif(dataset=="wiki"):
        N_CLASS = 2
        filename = "../data/wiki_data.pkl"
        HASH_REMOVE = False
    return filename

def get_data_train_test(dataset, dim):
    x_text, labels = load_data(get_filename(dataset))
    trainx, testx, trainy, testy = train_test_split(x_text, labels, random_state = 42, test_size=0.1)

    post_length = np.array([len(x.split(" ")) for x in trainx])

    # tfkp = tensorflow.keras.preprocessing
    tokenizer = tfkp.text.Tokenizer(oov_token="<UNK>")
    tokenizer.fit_on_texts(trainx)

    convert = lambda x: tfkp.sequence.pad_sequences(tokenizer.texts_to_sequences(x),
                                                    maxlen=dim,
                                                    padding='post', truncating='post')
    trainx = convert(trainx)
    trainy = np.array(trainy)
    testx = convert(testx)
    testy = np.array(testy)

    return dataset, trainx, trainy, testx, testy, tokenizer

In [37]:
# put it all together

def get_weights_and_data(dataset, dim=50):
    dataset, trainx, trainy, testx, testy, tokenizer = get_data_train_test(dataset, dim)
    embed_dict = get_embeddings(50, dataset)

    weights, words_missed = map_weights(embed_dict, tokenizer.word_index, dim)
    return trainx, trainy, testx, testy, weights   # ready for use

In [38]:
trainx, trainy, testx, testy, weights = get_weights_and_data("wiki")

80076 embeddings missed of 160395


In [40]:
print(np.shape(trainx))
print(np.shape(trainy))
print(np.shape(testx))
print(np.shape(testy))
print(np.shape(weights))

(104277, 50)
(104277,)
(11587, 50)
(11587,)
(160395, 50)
