# Data Exploration

In [11]:
import pandas as pd
import numpy as np

In [12]:
# variables
#path_train = '../data/input/srl_univprop_en.train.conll'
#path_dev   = '../data/input/srl_univprop_en.dev.conll'
path_example   = '../data/input/srl_univprop_en.example.conll'

path_train = '../data/input/en_ewt-up-train.conllu' 
path_test  = '../data/input/en_ewt-up-test.conllu'
path_dev   = '../data/input/en_ewt-up-dev.conllu'

In [13]:
def printLines(path):

    # first exploration
    c = 0
    with open(path) as file:
        for line in file:
            print(line)
            c += 1

            if c >= 10:
                break
                
        

In [14]:
import os

directory = os.fsencode('../data/input/')
    
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)

srl_univprop_en.dev.conll
.DS_Store
srl_univprop_en.example.conll
en_ewt-up-test.conllu
srl_univprop_en.train.conll
en_ewt-up-dev.conllu
en_ewt-up-train.conllu


In [15]:
#printLines(path_train)

print('\n#######\n')

printLines(path_train)


#######

# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001

# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.

1	Al	Al	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No	_	_

2	-	-	PUNCT	HYPH	_	1	punct	1:punct	SpaceAfter=No	_	_

3	Zaman	Zaman	PROPN	NNP	Number=Sing	1	flat	1:flat	_	_	_

4	:	:	PUNCT	:	_	1	punct	1:punct	_	_	_

5	American	american	ADJ	JJ	Degree=Pos	6	amod	6:amod	_	_	_

6	forces	force	NOUN	NNS	Number=Plur	7	nsubj	7:nsubj	_	_	ARG0

7	killed	kill	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_	kill.01	V



## Conll Description

"Sentences consist of one or more word lines, and word lines contain the following fields:

ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0). <br>
FORM: Word form or punctuation symbol. <br>
LEMMA: Lemma or stem of word form. <br>
UPOS: Universal part-of-speech tag. <br>
XPOS: Language-specific part-of-speech tag; underscore if not available. <br>
FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available. <br>
HEAD: Head of the current word, which is either a value of ID or zero (0). <br>
DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one. <br>
DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs. <br>
MISC: Any other annotation.

The fields DEPS and MISC replace the obsolete fields PHEAD and PDEPREL of the CoNLL-X format. In addition, we have modified the usage of the ID, FORM, LEMMA, XPOS, FEATS and HEAD fields as explained below.

The fields must additionally meet the following constraints:

Fields must not be empty.
Fields other than FORM, LEMMA, and MISC must not contain space characters.
Underscore (_) is used to denote unspecified values in all fields except ID. Note that no format-level distinction is made for the rare cases where the FORM or LEMMA is the literal underscore – processing in such cases is application-dependent. Further, in UD treebanks the UPOS, HEAD, and DEPREL columns are not allowed to be left unspecified except in multiword tokens, where all must be unspecified, and empty nodes, where UPOS is optional and HEAD and DEPREL must be unspecified. The enhanced DEPS annotation is optional in UD treebanks, but if it is provided, it must be provided for all sentences in the treebank. "


*** taken from https://universaldependencies.org/format.html

In [43]:
# retrieved header according to documentation
conll_header = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']

# header from lecture form 25.02.
conll_header = ['id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space', 'predicate', 'label']

conll_header_adapted = ['sentenceId', 'id', 'form', 'lemma', 'upos', 'xpos', 'morph', 'head', 'dep', 'head_dep', 'space', 'predicate', 'label']


In [44]:
# retrieve longest line
# -> required for the creation of the dataframe later
def retrieveLength(path_to_file):
    c = 0
    max_line_length = -1
    sentences = 0
    tokens = 0
    with open(path_to_file) as file:
        for line in file:


            if line.startswith('# text'):
                sentences += 1
            elif line.startswith('#') or line.startswith('\n'):
                pass
            else:
                values = line.split('\t')
                line_length = len(values)
                if line_length > max_line_length:
                    max_line_length = line_length

                tokens += 1

            c += 1   
    
    print(f'# Sentences in file: {sentences}')
    print(f'# Tokens in file: {tokens}')
    print(f'Maxium of columns in file: {max_line_length}')
    
    return max_line_length

In [47]:
# conversion into dataframe
def createDataFrame(path_to_file, sentence_limit=None):

    max_line_length = retrieveLength(path_to_file)
    sentences = 0

    ### create header
    
    # create empty dataframe with known columns and fillers for remaining collumns
    headers_df = np.full(max_line_length + 1, np.str)  #  + 1 to add sentence column
    
    # add sentence column to header
    #headers_df[1] = 
    
    # add columns from identified columns
    headers_df[:len(conll_header_adapted)] = conll_header_adapted
    
    # fill remaining column headers with '_'
    headers_df[len(conll_header_adapted):] = '_'
    
    
    ### create dataframe
    df = pd.DataFrame(columns=headers_df)

    
    ### fill dataframe

    # loop through file
    with open(path_to_file) as file:
        for line in file:

            # pass all other lines
            if line.startswith('# text'):
                sentences += 1
                
            elif line.startswith('#') or line.startswith('\n'):
                pass
            
            # only go into token lines
            else:
                
                # omit linebreaks from some lines
                if line.endswith('\n'):
                    line = line.replace('\n', '')
                
                # split input line
                values = np.array(line.split('\t'))

                array  = np.full(max_line_length+1, np.str)
                
                # add sentenceId
                array[0] = sentences
                # add retrieved information from conll file
                array[1:len(values)+1] = values
                # fill remaining columns   !!** use np.nan ?! **!! 
                array[len(values):] = '_'
    
                # create new entry
                df_entry = pd.DataFrame(columns=headers_df, data=[array])

                # concatenate to large dataframe
                df = pd.concat([df, df_entry], axis = 0, ignore_index=True)

            if type(sentence_limit) and sentences > sentence_limit:
                break
                
        print(f'\n ## {sentences-1} sentences were added to dataframe.')

    return df

In [49]:
# call function with path to file and a integer that set the limit of sentences to include
df = createDataFrame(path_example, 5)

df.head(20)



# Sentences in file: 5
# Tokens in file: 46
Maxium of columns in file: 14

 ## 4 sentences were added to dataframe.


Unnamed: 0,sentenceId,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate,label,_,_.1
0,1,1,Really,really,ADV,RB,_,2,advmod,2:advmod,_,_,_,_,_
1,1,2,enjoyed,enjoy,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,0,root,0:root,_,enjoy.01,_,_,_
2,1,3,it,it,PRON,PRP,Case=Nom|Gender=Neut|Number=Sing|Person=3|Pron...,2,obj,2:obj,SpaceAfter=No,_,_,_,_
3,1,4,.,.,PUNCT,.,_,2,punct,2:punct,_,_,_,_,_
4,2,1,Compare,compare,VERB,VBN,Tense=Past|VerbForm=Part,8,advcl,8:advcl,_,compare.01,V,_,_
5,2,2,to,to,ADP,IN,_,4,case,4:case,_,_,_,_,_
6,2,3,last,last,ADJ,JJ,Degree=Pos,4,amod,4:amod,_,_,_,_,_
7,2,4,decade,decade,NOUN,NN,Number=Sing,1,obl,1:obl:to,_,_,ARG2,_,_
8,2,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,_,_,_,_
9,2,6,University,University,PROPN,NNP,Number=Sing,8,nsubj,8:nsubj,_,_,_,_,_


In [88]:
%%time
# call function with path to file and a integer that set the limit of sentences to include
df = createDataFrame(path_train, 1)

df.iloc[:,0:20].head(40)

# Sentences in file: 12543
# Tokens in file: 204609
Maxium of columns in file: 46

 ## 1 sentences were added to dataframe.
CPU times: user 567 ms, sys: 21.2 ms, total: 589 ms
Wall time: 628 ms


Unnamed: 0,sentenceId,id,form,lemma,upos,xpos,morph,head,dep,head_dep,space,predicate,label,_,_.1,_.2,_.3,_.4,_.5,_.6
0,1,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,_,_,_,_,_,_,_,_
1,1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,_,_,_,_,_,_,_,_
2,1,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,_,_,_,_,_,_,_,_
3,1,4,:,:,PUNCT,:,_,1,punct,1:punct,_,_,_,_,_,_,_,_,_,_
4,1,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,_,_,_,_,_,_,_,_
5,1,6,forces,force,NOUN,NNS,Number=Plur,7,nsubj,7:nsubj,_,_,_,_,_,_,_,_,_,_
6,1,7,killed,kill,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,parataxis,1:parataxis,_,kill.01,_,_,_,_,_,_,_,_
7,1,8,Shaikh,Shaikh,PROPN,NNP,Number=Sing,7,obj,7:obj,_,_,_,_,_,_,_,_,_,_
8,1,9,Abdullah,Abdullah,PROPN,NNP,Number=Sing,8,flat,8:flat,_,_,_,_,_,_,_,_,_,_
9,1,10,al,al,PROPN,NNP,Number=Sing,8,flat,8:flat,SpaceAfter=No,_,_,_,_,_,_,_,_,_


In [58]:
### Looping through sentences & assigning new values

# assign df to variable for this loop
df = df  

# creat dummy columns for each new variable
# use this format:  df['columnName'] = np.nan
df['aFeature'] = np.nan

# loop through sentences
for s_id in df.sentenceId.unique():
    
    # filter for only this sentence
    df_sentence = df[df.sentenceId == s_id]
    
    # assign value to all features
    #df_sentence.aFeature = ... # uncomment
    
    # loop through lines if necesarry
    
    #display(df_sentence)

In [87]:
### split datasets for predicate prediction


# use test set 
df = createDataFrame(path_test, 2)

# cutt of predicate and argument rows
df_x = df.iloc[:,:11]

df_y_true = df.iloc[:,11]
y_true = [True if x != '_' else False for x in df_y]
y_true # maybe need to convert to np.array
y_true

# Sentences in file: 2077
# Tokens in file: 25097
Maxium of columns in file: 29

 ## 2 sentences were added to dataframe.


[False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]