# Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [2]:
# variables
path_train = '../data/input/srl_univprop_en.train.conll'
path_dev   = '../data/input/srl_univprop_en.dev.conll'
path_example   = '../data/input/srl_univprop_en.example.conll'

In [3]:
# first exploration
c = 0
with open(path_train) as file:
    for line in file:
        print(line)
        c += 1
        
        if c >= 10:
            break

# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001

# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.

1	Al	Al	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No	_	_

2	-	-	PUNCT	HYPH	_	1	punct	1:punct	SpaceAfter=No	_	_

3	Zaman	Zaman	PROPN	NNP	Number=Sing	1	flat	1:flat	_	_	_

4	:	:	PUNCT	:	_	1	punct	1:punct	_	_	_

5	American	american	ADJ	JJ	Degree=Pos	6	amod	6:amod	_	_	_

6	forces	force	NOUN	NNS	Number=Plur	7	nsubj	7:nsubj	_	_	ARG0

7	killed	kill	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_	kill.01	V



## Conll Description

"Sentences consist of one or more word lines, and word lines contain the following fields:

ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0). <br>
FORM: Word form or punctuation symbol. <br>
LEMMA: Lemma or stem of word form. <br>
UPOS: Universal part-of-speech tag. <br>
XPOS: Language-specific part-of-speech tag; underscore if not available. <br>
FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available. <br>
HEAD: Head of the current word, which is either a value of ID or zero (0). <br>
DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one. <br>
DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs. <br>
MISC: Any other annotation.

The fields DEPS and MISC replace the obsolete fields PHEAD and PDEPREL of the CoNLL-X format. In addition, we have modified the usage of the ID, FORM, LEMMA, XPOS, FEATS and HEAD fields as explained below.

The fields must additionally meet the following constraints:

Fields must not be empty.
Fields other than FORM, LEMMA, and MISC must not contain space characters.
Underscore (_) is used to denote unspecified values in all fields except ID. Note that no format-level distinction is made for the rare cases where the FORM or LEMMA is the literal underscore – processing in such cases is application-dependent. Further, in UD treebanks the UPOS, HEAD, and DEPREL columns are not allowed to be left unspecified except in multiword tokens, where all must be unspecified, and empty nodes, where UPOS is optional and HEAD and DEPREL must be unspecified. The enhanced DEPS annotation is optional in UD treebanks, but if it is provided, it must be provided for all sentences in the treebank. "

*** taken from https://universaldependencies.org/format.html

In [4]:
conll_header = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']

In [5]:
# retrieve longest line
# -> required for the creation of the dataframe later
def retrieveLength(path_to_file):
    c = 0
    max_line_length = -1
    sentences = 0
    tokens = 0
    with open(path_to_file) as file:
        for line in file:


            if line.startswith('# text'):
                sentences += 1
            elif line.startswith('#') or line.startswith('\n'):
                pass
            else:
                values = line.split('\t')
                line_length = len(values)
                if line_length > max_line_length:
                    max_line_length = line_length

                tokens += 1

            c += 1   
    
    print(f'# Sentences in file: {sentences}')
    print(f'# Tokens in file: {tokens}')
    print(f'Maxium of columns in file: {max_line_length}')
    
    return max_line_length

In [6]:
# conversion into dataframe
def createDataFrame(path_to_file, sentence_limit=None):

    max_line_length = retrieveLength(path_to_file)
    sentences = 0

    # create empty dataframe with known columns and fillers for remaining collumns
    headers_df = np.full(max_line_length, np.str)
    headers_df[:len(conll_header)] = conll_header
    headers_df[len(conll_header):] = '_'
    df = pd.DataFrame(columns=headers_df)


    # loop through file
    with open(path_to_file) as file:
        for line in file:

            # pass all other lines
            if line.startswith('# text'):
                sentences += 1
                
            elif line.startswith('#') or line.startswith('\n'):
                pass
            
            # only go into token lines
            else:

                if line.endswith('\n'):
                    line = line.replace('\n', '')
                values = np.array(line.split('\t'))

                array  = np.full(max_line_length, np.str)
                array[:len(values)] = values
                array[len(values):] = '_'

                df_entry = pd.DataFrame(columns=headers_df, data=[array])

                df = pd.concat([df, df_entry], axis = 0, ignore_index=True)

            if type(sentence_limit) and sentences > sentence_limit:
                break
                
        print(f'\n ## {sentences-1} sentences were added to dataframe.')

    return df

In [7]:
# call function with path to file and a integer that set the limit of sentences to include
df = createDataFrame(path_example, 5)

df.head(40)

# Sentences in file: 5
# Tokens in file: 46
Maxium of columns in file: 14

 ## 4 sentences were added to dataframe.


Unnamed: 0,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,_,_.1,_.2,_.3
0,1,Really,really,ADV,RB,_,2,advmod,2:advmod,_,_,ARGM-EXT,_,_
1,2,enjoyed,enjoy,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,0:root,_,enjoy.01,V,_,_
2,3,it,it,PRON,PRP,Case=Nom|Gender=Neut|Number=Sing|Person=3|Pron...,2,obj,2:obj,SpaceAfter=No,_,ARG1,_,_
3,4,.,.,PUNCT,.,_,2,punct,2:punct,_,_,_,_,_
4,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,compare.01,V,_,ARGM-ADV
5,2,to,to,ADP,IN,_,4,case,4:case,_,_,_,_,_
6,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,_,_,_,_
7,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,_,ARG2,_,_
8,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,_,_,_,_
9,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,_,_,_,ARG0


In [8]:
%%time
# call function with path to file and a integer that set the limit of sentences to include
df = createDataFrame(path_train, 100)

df.iloc[:,0:20].head(40)

# Sentences in file: 12530
# Tokens in file: 204235
Maxium of columns in file: 46

 ## 100 sentences were added to dataframe.
CPU times: user 14.3 s, sys: 197 ms, total: 14.5 s
Wall time: 14.7 s


Unnamed: 0,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,_,_.1,_.2,_.3,_.4,_.5,_.6,_.7,_.8,_.9
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,_,_,_,_,_,_,_,_,_
1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,_,_,_,_,_,_,_,_,_
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,_,_,_,_,_,_,_,_,_
3,4,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_,_,_,_,_,_,_,_,_
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,_,_,_,_,_,_,_,_,_
5,6,forces,force,NOUN,NNS,Number=Plur,7,nsubj,7:nsubj,_,_,ARG0,_,_,_,_,_,_,_,_
6,7,killed,kill,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,parataxis,1:parataxis,_,kill.01,V,_,_,_,_,_,_,_,_
7,8,Shaikh,Shaikh,PROPN,NNP,Number=Sing,7,obj,7:obj,_,_,ARG1,_,_,_,_,_,_,_,_
8,9,Abdullah,Abdullah,PROPN,NNP,Number=Sing,8,flat,8:flat,_,_,_,_,_,_,_,_,_,_,_
9,10,al,al,PROPN,NNP,Number=Sing,8,flat,8:flat,SpaceAfter=No,_,_,_,_,_,_,_,_,_,_
