In [54]:
import random
from os import listdir

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

SEED = 6
EPOCHS = 20

First, we need to prepare embeddings to map our words into vectors. We'll do so using GloVe's pretrained vectors. We elect to use the 50-dimensional vectors, the smallest available, to save memory.

In [2]:
words = list()
idx = 0
word2idx = dict()
vectors = list()

with open(f'./glove/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
        
vectors = np.stack(vectors)
print(vectors[0])

[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


In [3]:
glove = {w: vectors[word2idx[w]] for w in words}

In [4]:
glove['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

And done. Now let's collect every word used in our dataset of blogs.

In [5]:
vocab = set()

for path in listdir('./blogs/'):
    path = "./blogs/" + path
    with open(path, 'rb') as f:
        for l in f:
            line = l.split()

            # we skip lines of size 0, whitespace
            # and of size 1, almost always a date or meaningless title
            if len(line) > 1:
                for word in line:
                    try:
                        vocab.add(word.decode().lower())
                    except UnicodeDecodeError:
                        continue

In [6]:
print(len(vocab))

2836096


That's a lot of unique words. A decent number are likely very similar words that have additional punctuation but actually mean the same thing. Probably a lot we can cut done with a little preprocessing later.

Finally, we create a weights matrix which will be used to turn our words into vectors. This will appear in our model as an Embedding layer. Each word gets its pretrained vector if found in the glove vocab. Otherwise, we initialize a random vector.

In [7]:
word2idx = dict()
weights_matrix = np.zeros((len(vocab), 50))
words_found = 0

for i, word in enumerate(vocab):
    word2idx[word] = i
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(50,))

In [8]:
print(weights_matrix[0])

[-1.26805844  0.17542932  0.09946987  0.42838395 -0.03506367 -0.41414996
  0.58659506  0.06062696 -0.18226738  0.03094984  0.49504917  0.2409048
 -1.17906241  0.37113817 -0.75156576  0.06108673 -0.89887277  1.16876693
 -0.02722298 -0.03428933  0.43090654  0.69852335 -0.49249889  0.899596
 -0.22459108  0.02736743  0.46385264 -0.60646151  0.34905923  0.21263014
  0.5476872  -0.64117071 -0.33807404 -0.09580423 -1.26641996  0.22297221
  0.65254555 -0.03121035 -0.62946253 -0.21396468  0.80200424  0.84071385
  0.42730024 -1.39784548 -0.6425851  -0.66098923 -0.17915093 -0.0311042
 -0.0955524  -1.19184747]


In [9]:
print(float(words_found / len(vocab)))

0.06377957586767162


We're done, but unfortunately we're only getting about 6.4% of our words a pretrained embedding. Bloggers don't like to be consistent with things like punctuation. Hopefully, the missing words are mostly one-offs that we wouldn't be able to extract any value out of anyway and what we have is good enough. We'll see.

Now to load in our data itself. For each blog entry, we'll represent the text as a sequence of integers. Those will be our features. For now, we'll do our gender model and have our target be 0 for male and 1 for female.

In [31]:
random.seed(6)
files = listdir('./blogs/')
random.shuffle(files)

partition = dict()
partition['train'] = files[:int(len(files) * 0.75)]
partition['test'] = files[int(len(files) * 0.75):]

In [36]:
print(len(partition['train']))
print(len(partition['test']))
assert len(partition['train']) + len(partition['test']) == len(files)

14490
4830


In [37]:
# maps each file name to an integer ID
labels = {files[i]: i for i in range(len(files))}

In [50]:
class BlogDataset(Dataset):
    
    def __init__(self, file_names, labels, word2idx):
        self.labels = labels
        self.file_names = file_names
        self.word2idx = word2idx
        
    def __len__(self):
        return len(self.file_names)
    
    def __getitem__(self, index):
        file = self.file_names[index]
        
        idxs = list()
        
        path = "./blogs/" + file
        with open(path, 'rb') as f:
            for l in f:
                line = l.split()

                # we skip lines of size 0, whitespace
                # and of size 1, almost always a date or meaningless title
                if len(line) > 1:
                    for word in line:
                        idxs.append(word2idx[word])
        
        gender = file.split('.')[1]
        y = 1 if gender == 'female' else 0
        
        return idxs, y
        

In [53]:
train_set = BlogDataset(partition['train'], labels, word2idx)
train_loader = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=4)

test_set = BlogDataset(partition['test'], labels, word2idx)
test_loader = DataLoader(train_set, batch_size=256, shuffle=False, num_workers=4)

In [55]:
for epoch in range(EPOCHS):
    for local_batch, local_labels in train_loader:
        print(local_batch)
        print(local_labels)
        break
    break

OSError: [Errno 12] Cannot allocate memory