In [366]:
import torch
from torch.autograd import Variable

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

%load_ext autoreload
%autoreload 2

In [7]:
df = pd.read_csv('clean_data/clean_data.csv', index_col=0)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,1.0,Thousands,NNS,O
1,1.0,of,IN,O
2,1.0,demonstrators,NNS,O
3,1.0,have,VBP,O
4,1.0,marched,VBN,O


In [18]:
for r in df.values[:10]:
    print(r[1])

Thousands
of
demonstrators
have
marched
through
London
to
protest
the


In [31]:
# we'll first turn our dataframe to a list of lists of tuples, since this is the infinitely
# more convenient data structure for neural nets and torch

df.Word.to_list()

def df_to_torch_list(df):
    """Function takes in dataframe with four columns:
    Sentence #; Word; POS; Tag.
    -------------------------------------------------
    Returns: 
    - input_data as a list of lists (each a sentence) of tuples
    where each tuple is (word; POS)
    - target_data - list of lists (each a sentence) of Named 
    Entity Tags (e.g. 'O', 'B-geo', 'I-art', etc)
    """
    
    input_data = []
    target_data = []
    data = df.copy()
    for sent_ind in range(1,len(data['Sentence #'].unique().astype(int))):
        sent_df = data.loc[data['Sentence #'] == sent_ind]
        sent_lst = []
        sent_target_lst = []
        for row in sent_df.values:
            sent_lst.append((row[1], row[2]))
            sent_target_lst.append(row[3])
        input_data.append(sent_lst)
        target_data.append(sent_target_lst)
    return input_data, target_data
        
        

In [32]:
%time
input_data, target_data = df_to_torch_list(df)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [39]:
input_data[1]
# target_data[0]

[('Families', 'NNS'),
 ('of', 'IN'),
 ('soldiers', 'NNS'),
 ('killed', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('conflict', 'NN'),
 ('joined', 'VBD'),
 ('the', 'DT'),
 ('protesters', 'NNS'),
 ('who', 'WP'),
 ('carried', 'VBD'),
 ('banners', 'NNS'),
 ('with', 'IN'),
 ('such', 'JJ'),
 ('slogans', 'NNS'),
 ('as', 'IN'),
 ('"', '.'),
 ('Bush', 'NNP'),
 ('Number', 'NN'),
 ('One', 'CD'),
 ('Terrorist', 'NN'),
 ('"', '.'),
 ('and', 'CC'),
 ('"', '.'),
 ('Stop', 'VB'),
 ('the', 'DT'),
 ('Bombings', 'NNS'),
 ('.', '.'),
 ('"', '.')]

In [43]:
len(target_data)

2998

In [44]:
len(input_data)

2998

In [50]:
len(input_data[2548])

70

We know what lenght of our longest sentence is. This is important since our neural net will require all inputs to be of equal length and we'll pad shorter sentences to length. 

In [85]:
len(glove_dict)

400000

In [114]:
def load_glove_vects(file = 'glove/glove.6B.50d.txt', vdim=None):
    """Function that loads the Global representation Vectors
    and returns them as a dictionary. 
    -----------------
    Returns:
    glove_dict - (dict) key - word (str), value - n-dimensional np array """
    glove_dict = {}
#     total_vocab = vocab
    if type(vdim)==int:
        file = f'glove/glove.6B.{vdim}d.txt'
    avg_vect = np.zeros((vdim,))
    with open(file, 'rb') as f:
        for line in f:
            parts = line.split()
            word = parts[0].decode('utf-8')
            vector = np.array(parts[1:], dtype=np.float32)
            glove_dict[word] = vector
            avg_vect += vector
        # creating the vector for new, UNKnown words in the vocabulary
        # NOTE, this is NOT the same as the word "unk", which is 
        # present in glove's vocabulary
        glove_dict['UNK'] = avg_vect/len(glove_dict)
        glove_dict['PAD'] = np.zeros((vdim,))
    return glove_dict
    

In [115]:
%time
glove_dict = load_glove_vects(vdim=50);

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 24.8 µs


In [117]:
glove_dict['UNK']

array([-0.12920061, -0.28866239, -0.01224894, -0.05676689, -0.20211109,
       -0.08389026,  0.33359737,  0.16045146,  0.03867495,  0.17833092,
        0.0469662 , -0.00285779,  0.29099851,  0.04613723, -0.20923842,
       -0.066131  , -0.06822448,  0.07665885,  0.31339918,  0.17848512,
       -0.12257719, -0.09916928, -0.07495973,  0.06413206,  0.14441256,
        0.608946  ,  0.17463101,  0.05335403, -0.01273826,  0.03474108,
       -0.81239567, -0.04688727,  0.20193533,  0.20311115, -0.03935654,
        0.06967518, -0.01553655, -0.03405275, -0.06528025,  0.12250092,
        0.13992005, -0.17446305, -0.08011841,  0.08495219, -0.01041645,
       -0.13704901,  0.20127088,  0.10069294,  0.00653007,  0.0168515 ])

In [67]:
from sklearn.model_selection import train_test_split

In [76]:
train_sents, test_sents, train_labels, test_labels = train_test_split(input_data, target_data, test_size=.2, shuffle=False)
train_sents, valid_sents, train_labels, valid_labels = train_test_split(train_sents, train_labels, test_size=.15, shuffle=False)

In [79]:
len(valid_labels)

360

In [80]:
valid_sents[0]

[('China', 'NNP'),
 ('rejects', 'VBZ'),
 ('the', 'DT'),
 ('criticism', 'NN'),
 (',', '.'),
 ('saying', 'VBG'),
 ('internal', 'JJ'),
 ('affairs', 'NNS'),
 ('should', 'MD'),
 ('be', 'VB'),
 ('handled', 'VBN'),
 ('by', 'IN'),
 ('China', 'NNP'),
 ("'s", 'POS'),
 ('government', 'NN'),
 ('and', 'CC'),
 ('citizens', 'NNS'),
 (',', '.'),
 ('not', 'RB'),
 ('outsiders', 'NNS'),
 ('.', '.')]

To do:
* create input layer, matching incoming words to their respective glove_dict arrays
* 

In [98]:
train_sents[0]

[('Thousands', 'NNS'),
 ('of', 'IN'),
 ('demonstrators', 'NNS'),
 ('have', 'VBP'),
 ('marched', 'VBN'),
 ('through', 'IN'),
 ('London', 'NNP'),
 ('to', 'TO'),
 ('protest', 'VB'),
 ('the', 'DT'),
 ('war', 'NN'),
 ('in', 'IN'),
 ('Iraq', 'NNP'),
 ('and', 'CC'),
 ('demand', 'VB'),
 ('the', 'DT'),
 ('withdrawal', 'NN'),
 ('of', 'IN'),
 ('British', 'JJ'),
 ('troops', 'NNS'),
 ('from', 'IN'),
 ('that', 'DT'),
 ('country', 'NN'),
 ('.', '.')]

In [99]:
def generate_tag_set(targets_list):
    """Function takes in a list of NE tags (which should include at least one instance of 
    every possible NE tag) and returns a dict matching each NE tag to a unique int.
    Returns:
    tag_map - (dict) of NE tag - associated int pairs"""
    ne_dict = {}
    i = 0
    for sublist in targets_list:
        for ne in sublist:
            if ne in ne_dict.keys():
                continue
            else:
                ne_dict[ne] = i
                i += 1
    return ne_dict

In [103]:
ne_dict = generate_tag_set(train_labels)
ne_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [311]:
def sent_to_vect(feature_list : list, targets_list : list, vocab : dict, ne_dict : dict):
    """Function takes in list of lists of dictionaries (input data), target NE labels,
    a vocabulary (dictionary) and a dict of NE tags and their corresponding identifiers;
    Returns a list of vectorised input data"""
    vect_sentences = []        
    vect_label_sentences = []
    
    for sentence in feature_list:     
        #replace each token by its index if it is in vocab
        #else use index of UNK
        sentence_vectors = [vocab[token[0].lower()] if token[0].lower() in vocab.keys() 
             else vocab['UNK']
             for token in sentence]
        vect_sentences.append(sentence_vectors)
        
    for sentence in targets_list:
        #replace each label by its index
        try:
            label_sent = [ne_dict[label] for label in sentence]
        except:
            generate_tag_set(targets_list)
            label_sent = [ne_dict[label] for label in sentence]
        vect_label_sentences.append(label_sent) 
        
    return vect_sentences, vect_label_sentences

In [312]:
train_vect_sent , train_label_sent = sent_to_vect(train_sents, train_labels, glove_dict, ne_dict)

In [313]:
len(train_label_sent[200]) == len(train_vect_sent[200])

True

In [314]:
glove_dict['PAD']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [360]:
class DataGenerator():
    def __init__(self, )

    def prep_batch(self, batch_sentences : list, batch_sentences_labels : list, vocab = self.vocab, word_vect_dim = 50):
        """Function takes in a list of lists (each sublist a sentence of n-dimension
        numpy arrays), the associated list of lists of NE labels and a vocabulary (dict)"""
        
        #compute length of longest sentence in batch
        batch_max_len = max([len(sentence) for sentence in batch_sentences_labels])
        #prepare a numpy array with the data, initializing the data with 'PAD' 
        #and all labels with -1; initializing labels to -1 differentiates tokens 
        #with tags from 'PAD' tokens
        #note the dimensional change here as we are effectively about to 
        # concatenate the sentences along the 2nd dimension
        batch_data = np.zeros((len(batch_sentences), batch_max_len, word_vect_dim))
        batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))
        #copy the data to the numpy array
        for j in range(len(batch_sentences)):
            #accessing individual sentence below
            cur_len = len(batch_sentences[j])

            for k in range(len(batch_sentences[j])):
                #accessing individual word vectors below
                batch_data[j,k, :] = batch_sentences[j][k].reshape(1,-1)

            batch_labels[j][:cur_len] = batch_sentences_labels[j]

        #since all data are indices, we convert them to torch LongTensors
        batch_data, batch_labels = torch.Tensor(batch_data), torch.Tensor(batch_labels)

        #convert Tensors to Variables
        # Torch tensors and torch Variables are almost the same, the latter being a wrapper fn
        # that allows for additional methods to be called onto the underlying tensor. 
        # So we're reassigning them as Variables for extra future flexibility
    #     batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
        yield batch_data, batch_labels
    

In [362]:
import preprocessing_for_torch as prep

In [369]:
%time
train_batch_1_sent, train_batch_1_labels = prep.prep_batch(train_vect_sent[:10], train_label_sent[:10], glove_dict)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [356]:
train_sents[0]

[('Thousands', 'NNS'),
 ('of', 'IN'),
 ('demonstrators', 'NNS'),
 ('have', 'VBP'),
 ('marched', 'VBN'),
 ('through', 'IN'),
 ('London', 'NNP'),
 ('to', 'TO'),
 ('protest', 'VB'),
 ('the', 'DT'),
 ('war', 'NN'),
 ('in', 'IN'),
 ('Iraq', 'NNP'),
 ('and', 'CC'),
 ('demand', 'VB'),
 ('the', 'DT'),
 ('withdrawal', 'NN'),
 ('of', 'IN'),
 ('British', 'JJ'),
 ('troops', 'NNS'),
 ('from', 'IN'),
 ('that', 'DT'),
 ('country', 'NN'),
 ('.', '.')]

In [357]:
glove_dict['thousands']

array([ 1.1515e+00, -3.9703e-01,  9.7350e-01, -8.3455e-01, -1.4785e-01,
       -4.7469e-01, -9.8629e-01,  4.4072e-01,  1.0985e-01,  7.3914e-03,
       -4.5690e-01, -1.2794e+00,  1.0253e+00, -5.3370e-01,  1.0906e+00,
       -3.6994e-01, -1.7323e-03, -1.2934e-02, -2.0921e-01, -8.0484e-01,
        3.0218e-01,  2.9622e-01,  4.3949e-02, -6.2642e-02, -1.1756e-02,
       -1.2806e+00, -2.3914e-01, -5.0524e-01,  2.8103e-01, -3.1305e-01,
        3.0938e+00,  6.8201e-01, -3.8915e-01, -5.9624e-01, -6.8694e-01,
        7.9195e-01, -1.5878e-01, -7.9453e-01, -2.0664e-01,  4.5275e-01,
       -4.2613e-01,  3.5096e-01,  5.5050e-01,  2.5910e-01,  7.1832e-01,
       -5.3633e-02, -1.0610e+00, -4.6405e-01, -9.2481e-01, -1.6236e+00],
      dtype=float32)

In [358]:
train_batch_1_sent[0][0]

tensor([ 1.1515e+00, -3.9703e-01,  9.7350e-01, -8.3455e-01, -1.4785e-01,
        -4.7469e-01, -9.8629e-01,  4.4072e-01,  1.0985e-01,  7.3914e-03,
        -4.5690e-01, -1.2794e+00,  1.0253e+00, -5.3370e-01,  1.0906e+00,
        -3.6994e-01, -1.7323e-03, -1.2934e-02, -2.0921e-01, -8.0484e-01,
         3.0218e-01,  2.9622e-01,  4.3949e-02, -6.2642e-02, -1.1756e-02,
        -1.2806e+00, -2.3914e-01, -5.0524e-01,  2.8103e-01, -3.1305e-01,
         3.0938e+00,  6.8201e-01, -3.8915e-01, -5.9624e-01, -6.8694e-01,
         7.9195e-01, -1.5878e-01, -7.9453e-01, -2.0664e-01,  4.5275e-01,
        -4.2613e-01,  3.5096e-01,  5.5050e-01,  2.5910e-01,  7.1832e-01,
        -5.3633e-02, -1.0610e+00, -4.6405e-01, -9.2481e-01, -1.6236e+00])

In [359]:
train_batch_1_labels[0]

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0., -1., -1., -1., -1.,
        -1., -1.])