### Libraries

In [4]:
# Importing os.
import os

# Importing urllib.request.
import urllib.request  #  download files

# Importing zipfile.
import zipfile

# Importing pandas.
import pandas as pd

### Data Preprocessing

In [2]:
# URL of the zipped dependency treebank corpus (hosted in the NLTK data repository).
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

# Data folder, located under the current working directory.
data_path = os.path.join(os.getcwd(), "data")

# Creating the data folder. `exist_ok` makes this a no-op when the folder is
# already there, so the cell can be re-run safely without a separate check.
os.makedirs(data_path, exist_ok = True)

# Local path of the downloaded .zip archive.
data_zip = os.path.join(data_path, "dependency_treebank.zip")

# Downloading the archive only when it is not cached locally.
# The download goes to a temporary name and is renamed only after it completes:
# an interrupted download therefore never leaves a truncated file at `data_zip`
# that later runs would mistake for a valid cached archive.
if not os.path.exists(data_zip):
    partial_zip = data_zip + ".part"
    urllib.request.urlretrieve(url, partial_zip)
    os.replace(partial_zip, data_zip)

# Extracting the archive content into the data folder.
with zipfile.ZipFile(data_zip, "r") as zip_ref:
    zip_ref.extractall(path = data_path)

In [3]:
# Name of the dataset, which is also the name of its folder inside the data folder.
dataset_name = "dependency_treebank"

# Location of the first sample of the dataset.
file_path = os.path.join(data_path, dataset_name, "wsj_0001.dp")

# Displaying the raw content of the first sample (nothing is shown if the file is missing).
if os.path.isfile(file_path):
    with open(file_path, mode = "r") as sample_file:
        text = sample_file.read()
    print(text)

Pierre	NNP	2
Vinken	NNP	8
,	,	2
61	CD	5
years	NNS	6
old	JJ	2
,	,	2
will	MD	0
join	VB	8
the	DT	11
board	NN	9
as	IN	9
a	DT	15
nonexecutive	JJ	15
director	NN	12
Nov.	NNP	9
29	CD	16
.	.	8

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3



In [35]:
# List containing dataframe rows (one dict per token).
dataframe_rows = []

# Folder holding the extracted dataset files.
folder = os.path.join(data_path, dataset_name)

# Storing rows, visiting the files in sorted name order.
for filename in sorted(os.listdir(folder)):

    # Only the ".dp" documents are parsed. The folder may hold other entries as
    # well - in particular the ".pkl" file this notebook saves into the same
    # folder - and reading those as text would fail or produce garbage rows.
    if not filename.endswith(".dp"):
        continue

    # Computing path to file.
    file_path = os.path.join(folder, filename)

    # Checking that the entry is a regular file.
    if not os.path.isfile(file_path):
        continue

    # Identifier of the document: the file name up to its first dot.
    file_id = filename.split(".")[0]

    # Opening the file.
    with open(file_path, mode = "r") as text_file:

        # Iterating over lines.
        for line in text_file:

            # Splitting the line on whitespace once.
            fields = line.split()

            # Skipping lines without at least a word and a tag: empty or
            # whitespace-only separator lines, and malformed lines.
            if len(fields) < 2:
                continue

            # Appending a row holding the word (first field) and its tag (second field).
            dataframe_rows.append({"file_id": file_id, "word": fields[0], "tag": fields[1]})

# Creating pandas dataframe. The columns are given explicitly so that they
# exist even when no row was collected.
dataframe = pd.DataFrame(dataframe_rows, columns = ["file_id", "word", "tag"])

# Displaying dataframe head.
dataframe.head()

Unnamed: 0,file_id,word,tag
0,wsj_0001,Pierre,NNP
1,wsj_0001,Vinken,NNP
2,wsj_0001,",",","
3,wsj_0001,61,CD
4,wsj_0001,years,NNS


In [36]:
# Destination of the pickled dataframe: "<dataset_name>.pkl", inside the dataset folder.
dataframe_path = os.path.join(folder, f"{dataset_name}.pkl")

# Serializing the dataframe to disk.
dataframe.to_pickle(path = dataframe_path)

In [None]:
# Sample ranges of the three splits (upper bounds are exclusive):
#   training   -> 1 to 100
#   validation -> 101 to 150
#   test       -> 151 to 199
# NOTE(review): presumably these index the numeric part of the "wsj_XXXX"
# file ids - confirm against the code that consumes them.
TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT = (
    range(1, 101),
    range(101, 151),
    range(151, 200),
)