### Libraries

In [15]:
# Importing os.
import os

# Importing urllib.request.
import urllib.request

# Importing zipfile.
import zipfile

# Importing pandas.
import pandas as pd

# Importing numpy.
import numpy as np

# Importing train_test_split.
from sklearn.model_selection import train_test_split

# Importing pad_sequences.
from keras_preprocessing.sequence import pad_sequences

# Importing Sequential.
from keras.models import Sequential

# Importing Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation.
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation

# Importing Adam.
from keras.optimizers import Adam

### Data Preprocessing

In [2]:
# Function used to download .zips.
def downloader(url, folder_name, filename):

  # Defining data folder path.
  data_path = os.path.join(os.getcwd(), folder_name)

  # Creating data folder.
  if not os.path.exists(data_path):
      os.makedirs(data_path)

  # Defining .zip file path.
  zip_path = os.path.join(os.getcwd(), folder_name, filename)

  # Requesting .zip file.
  if not os.path.exists(zip_path):
      urllib.request.urlretrieve(url, zip_path)

  # Extracting data from .zip.
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
      zip_ref.extractall(path = data_path)

  # Returning data_path and zip_path.
  return data_path, zip_path

In [3]:
# Downloading dataset.
data_path, _ = downloader(url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip", folder_name = "data", filename = "dependency_treebank.zip")

# Downloading glove.
glove_path, _ = downloader(url = "https://nlp.stanford.edu/data/glove.6B.zip", folder_name = "glove", filename = "glove.6B.zip")

In [4]:
# Defining the dataset name.
dataset_name = "dependency_treebank"

# Defining path to first training sample.
file_path = os.path.join(data_path, dataset_name, "wsj_0001.dp")

# Reading first training sample.
if os.path.isfile(file_path):

  # Printing file.
  with open(file_path, mode = "r") as text_file: print(text_file.read())

Pierre	NNP	2
Vinken	NNP	8
,	,	2
61	CD	5
years	NNS	6
old	JJ	2
,	,	2
will	MD	0
join	VB	8
the	DT	11
board	NN	9
as	IN	9
a	DT	15
nonexecutive	JJ	15
director	NN	12
Nov.	NNP	9
29	CD	16
.	.	8

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3



In [5]:
# Defining embedding size.
EMBEDDING_SIZE = 50

# Defining specific glove's file path.
glove_file = os.path.join(os.getcwd(), glove_path, f"glove.6B.{str(EMBEDDING_SIZE)}d.txt")

# Reading lines of file.
with open(glove_file, encoding = "utf8" ) as text_file: 
  lines = text_file.readlines()

# Defining initial vocabulary.
embedding_vocabulary = {}

# Reading single lines.
for line in lines:

  # Splitting line.
  splits = line.split()

  # Storing line into vocabulary.
  embedding_vocabulary[splits[0]] = np.array([float(val) for val in splits[1:]])

# Printing one entry of the vocabulary.
print("The embedding for 'the' is:\n{}.".format(embedding_vocabulary["the"]))

The embedding for 'the' is:
[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01].


In [6]:
# List containing dataframe rows.
dataframe_rows = []

# List containing words of a single sentence.
row_words = []

# List containing tags of a single sentence.
row_tags = []

# Defining data folder path.
folder = os.path.join(data_path, dataset_name)

# Storing rows.
for filename in sorted(os.listdir(folder)):

  # Computing path to file.
  file_path = os.path.join(folder, filename)

  # Checking existance of file.
  if os.path.isfile(file_path):

    # Opening the file.
    with open(file_path, mode = "r") as text_file:

      # Reading lines.
      while True:

        # Reading next line.
        line = text_file.readline()

        # Checking that line is different from "\n" (empty line) and from last line (EOF).
        if line and line != "\n":

          # Storing the word.
          row_words.append(line.split()[0])

          # Storing the POS tag.
          row_tags.append(line.split()[1])

        else:

          # Creating a row.
          dataframe_row = {"file_id": filename.split(".")[0], "sentence": row_words, "tags": row_tags}

          # Appending row.
          dataframe_rows.append(dataframe_row)

          # Resetting row_words list so to store a new sentence.
          row_words = []

          # Resetting row_tags list so to store a new sentence.
          row_tags = []

          # If, in particular, EOF is reached, then break the inner loop.
          if not line: break

# Creating pandas dataframe.
dataframe = pd.DataFrame(dataframe_rows)

# Printing dataframe head.
dataframe.head()

Unnamed: 0,file_id,sentence,tags
0,wsj_0001,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,wsj_0001,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,wsj_0002,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,wsj_0003,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,wsj_0003,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."


In [7]:
# Defining dataframe path.
dataframe_path = os.path.join(folder, dataset_name + ".pkl")

# Saving dataframe.
dataframe.to_pickle(dataframe_path)

In [8]:
# Dummy split. TODO: modify split according to given information.
train, test = train_test_split(dataframe, test_size = 0.2)

# Defining empty words and tags sets.
words, tags = set([]), set([])

# Creating set of words.
for sentence in train["sentence"].tolist():
  for word in sentence:
    words.add(word.lower())

# Creating set of tags
for tags_list in train["tags"].tolist():
  for tag in tags_list:
    tags.add(tag)

# Building vocabulary for words.
word_to_index = {word: i + 2 for i, word in enumerate(list(words))}
word_to_index["PAD"] = 0
word_to_index["OOV"] = 1

# Building vocabulary for tags.
tag_to_index = {tag: i + 1 for i, tag in enumerate(list(tags))}
tag_to_index["PAD"] = 0

In [9]:
word_to_index["the"]

5271

In [10]:
# Tokenising words and  by their indexes in vocabulary
train_sentences, test_sentences, train_tags, test_tags = [], [], [], []

# Tokenising sentences.
for sentence in train["sentence"].tolist():

  # List to store the words' indexes.
  sentence_indexes = []

  # Corventing each word in sentence.
  for word in sentence:

    # Computing index.
    sentence_indexes.append(word_to_index[word.lower()])

  # Appending list of indexes to training set.
  train_sentences.append(sentence_indexes)

# Tokenising sentences.
for sentence in test["sentence"].tolist():

  # List to store the words' indexes.
  sentence_indexes = []

  # Corventing each word in sentence.
  for word in sentence:

    try:

      # Computing index.
      sentence_indexes.append(word_to_index[word.lower()])

    except KeyError:

      # Computing index.
      sentence_indexes.append(word_to_index["OOV"])

  # Appending list of indexes to training set.
  test_sentences.append(sentence_indexes)

# Tokenising tags.
for tags_list in train["tags"].tolist():

  # Computing index.
  train_tags.append([tag_to_index[tag] for tag in tags_list])

# Tokenising tags.
for tags_list in test["tags"].tolist():

  # Computing tags.
  test_tags.append([tag_to_index[tag] for tag in tags_list])

In [11]:
print(train_sentences[0])
print(train_tags[0])

[5271, 7737, 3052, 1728, 1232, 4375, 7859, 3677, 9158, 7399, 8783, 809, 8085, 4506, 6098, 583]
[1, 30, 30, 15, 26, 40, 4, 4, 42, 20, 15, 42, 37, 37, 15, 31]


In [12]:
# Computing length of longest sentence.
MAX_LENGTH = len(max(dataframe["sentence"].tolist(), key = len))

# Padding train and test sets.
train_sentences = pad_sequences(train_sentences, maxlen = MAX_LENGTH, padding = "post")
test_sentences = pad_sequences(test_sentences, maxlen = MAX_LENGTH, padding = "post")
train_tags = pad_sequences(train_tags, maxlen = MAX_LENGTH, padding = "post")
test_tags = pad_sequences(test_tags, maxlen = MAX_LENGTH, padding = "post")

In [13]:
# Number of converted words.
hits = 0

# Number of OOV words.
misses = 0

# Preparing embedding matrix so that OOV words will be assigned a vector of zeros.
embedding_matrix = np.zeros((len(word_to_index), int(EMBEDDING_SIZE)))

# Retrieving embedding vector for each word.
for word, i in word_to_index.items():

  # Computing vector.
  vector = embedding_vocabulary.get(word)

  # Checking vector.
  if vector is not None:
    
    # Populating the embedding matrix.
    embedding_matrix[i] = vector

    # Increasing number of hits.
    hits += 1

  else:

    # Increasing number of misses.
    misses += 1

# Printing number of hits and missed.
print(f"Converted {hits} words. Missed {misses} words.")

Converted 9195 words. Missed 565 words.


In [14]:
embedding_matrix[word_to_index["the"]]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

In [16]:
# Model sketch. TODO: modify the model.
model = Sequential()
model.add(InputLayer(input_shape = (MAX_LENGTH,))) 
model.add(Embedding(len(word_to_index), EMBEDDING_SIZE, weights = [embedding_matrix], trainable = False))
model.add(Bidirectional(LSTM(256, return_sequences = True)))
model.add(TimeDistributed(Dense(len(tag_to_index))))
model.add(Activation("softmax"))

# Compiling.
model.compile(loss = "categorical_crossentropy", optimizer = Adam(), metrics = ["accuracy"])

# Printing summary.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 249, 50)           488000    
                                                                 
 bidirectional (Bidirectiona  (None, 249, 512)         628736    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 249, 46)          23598     
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 249, 46)           0         
                                                                 
Total params: 1,140,334
Trainable params: 652,334
Non-trainable params: 488,000
_________________________________________________________________
