### Libraries

In [2]:
# Importing os.
import os

# Importing urllib.request.
import urllib.request

# Importing zipfile.
import zipfile

# Importing pandas.
import pandas as pd

### Data Preprocessing

In [46]:
# Function used to download and extract .zips.
def downloader(url, folder_name, filename):
  """Download a .zip archive (unless it is already cached) and extract it.

  The archive is stored as ``<cwd>/<folder_name>/<filename>`` and its content
  is extracted into ``<cwd>/<folder_name>``. If the archive file already
  exists, it is not requested again; the extraction is always performed.

  Args:
    url: Address the archive is retrieved from.
    folder_name: Folder (relative to the current working directory) that
      holds both the archive and the extracted data. Created if missing.
    filename: Name given to the archive inside ``folder_name``.

  Returns:
    A ``(data_path, zip_path)`` tuple: the folder the archive was extracted
    into and the path of the archive itself.

  Raises:
    urllib.error.URLError: If the archive cannot be retrieved.
    zipfile.BadZipFile: If the stored file is not a valid .zip archive.
  """

  # Defining data folder path.
  data_path = os.path.join(os.getcwd(), folder_name)

  # Creating data folder (no error if it already exists).
  os.makedirs(data_path, exist_ok = True)

  # Defining .zip file path.
  zip_path = os.path.join(data_path, filename)

  # Requesting .zip file.
  if not os.path.exists(zip_path):

    # Downloading to a temporary name first, so that an interrupted transfer
    # never leaves a truncated archive at zip_path that later runs would
    # mistake for a complete download.
    partial_path = zip_path + ".part"
    try:
      urllib.request.urlretrieve(url, partial_path)
      os.replace(partial_path, zip_path)
    finally:
      # After a successful os.replace the partial file no longer exists;
      # after a failure, whatever was written is discarded.
      if os.path.exists(partial_path):
        os.remove(partial_path)

  # Extracting data from .zip.
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(path = data_path)

  # Returning data_path and zip_path.
  return data_path, zip_path

In [47]:
# Fetching and extracting the dependency treebank corpus into ./data.
data_path, _ = downloader(
    url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip",
    folder_name = "data",
    filename = "dependency_treebank.zip",
)

# Fetching and extracting the GloVe archive into ./glove.
glove_path, _ = downloader(
    url = "https://nlp.stanford.edu/data/glove.6B.zip",
    folder_name = "glove",
    filename = "glove.6B.zip",
)

In [51]:
# Name of the corpus (also the name of the extracted sub-folder).
dataset_name = "dependency_treebank"

# Path to the first training sample.
file_path = os.path.join(data_path, dataset_name, "wsj_0001.dp")

# Showing the raw content of the first training sample, if it is there.
if os.path.isfile(file_path):

    # Loading the whole file in memory.
    with open(file_path, mode = "r") as text_file:
        text = text_file.read()

    # Printing it as-is: one "word<TAB>tag<TAB>number" line per token,
    # with an empty line between sentences.
    print(text)

Pierre	NNP	2
Vinken	NNP	8
,	,	2
61	CD	5
years	NNS	6
old	JJ	2
,	,	2
will	MD	0
join	VB	8
the	DT	11
board	NN	9
as	IN	9
a	DT	15
nonexecutive	JJ	15
director	NN	12
Nov.	NNP	9
29	CD	16
.	.	8

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3



In [54]:
# List containing dataframe rows (one dictionary per sentence).
dataframe_rows = []

# List containing words of a single sentence.
row_words = []

# List containing tags of a single sentence.
row_tags = []

# Defining data folder path.
folder = os.path.join(data_path, dataset_name)

# Storing rows.
for filename in sorted(os.listdir(folder)):

  # Skipping everything that is not a treebank file. The pickled dataframe is
  # saved into this same folder, so when the notebook is re-run it would
  # otherwise be opened and parsed as if it were a text sample.
  if not filename.endswith(".dp"):
    continue

  # Computing path to file.
  file_path = os.path.join(folder, filename)

  # Checking existence of file.
  if os.path.isfile(file_path):

    # Opening the file.
    with open(file_path, mode = "r") as text_file:

      # Reading lines.
      while True:

        # Reading next line ("" means EOF).
        line = text_file.readline()

        # Splitting once; the result is empty for EOF, for "\n" and for
        # whitespace-only lines, which all act as sentence separators.
        fields = line.split()

        if fields:

          # Storing the word.
          row_words.append(fields[0])

          # Storing the POS tag.
          row_tags.append(fields[1])

          continue

        # A separator closes the current sentence. Nothing is stored when no
        # token was collected (consecutive or trailing empty lines), so the
        # dataframe never contains empty sentences.
        if row_words:

          # Creating a row.
          dataframe_row = {"file_id": filename.split(".")[0], "sentence": row_words, "tags": row_tags}

          # Appending row.
          dataframe_rows.append(dataframe_row)

          # Resetting row_words list so to store a new sentence.
          row_words = []

          # Resetting row_tags list so to store a new sentence.
          row_tags = []

        # If, in particular, EOF is reached, then break the inner loop.
        if not line: break

# Creating pandas dataframe.
dataframe = pd.DataFrame(dataframe_rows)

# Printing dataframe head.
dataframe.head()

Unnamed: 0,file_id,sentence,tags
0,wsj_0001,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,wsj_0001,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,wsj_0002,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,wsj_0003,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,wsj_0003,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."


In [55]:
# The pickle is stored next to the corpus files, named after the dataset.
dataframe_path = os.path.join(folder, f"{dataset_name}.pkl")

# Serializing the dataframe to disk.
dataframe.to_pickle(dataframe_path)