In [14]:
import math
import numpy as np
import pandas as pd

In [132]:
# Load the annotated English corpus and show its (rows, columns) shape.
df = pd.read_csv("data/full_data_EN.csv")
df.shape

(18218, 4)

In [90]:
# Preview the first rows to see the column layout.
df.head()

Unnamed: 0,Sentence,Word,hierarchical,non-hierarchical
0,0,active,DEFINIENDUM,
1,0,cave,DEFINIENDUM,
2,0,:,,
3,0,Cave,GENUS,
4,0,containing,,HAS_FUNCTION


In [106]:
# Number of rows (words) carrying each non-hierarchical tag;
# dropna=False also reports how many rows have no such tag.
df['non-hierarchical'].value_counts(dropna=False)

NaN                   7936
HAS_CAUSE             2158
HAS_LOCATION          1832
HAS_FORM              1741
COMPOSITION_MEDIUM     923
HAS_FUNCTION           885
CONTAINS               604
HAS_SIZE               581
DEFINED_AS             527
HAS_ATTRIBUTE          356
HAS_RESULT             200
OCCURS_IN_TIME         154
MEASURES               123
HAS_POSITION           105
STUDIES                 73
AFFECTS                 17
OCCURS_IN_MEDIUM         3
Name: non-hierarchical, dtype: int64

In [107]:
# Number of missing values in each column, smallest count first.
df.isna().sum().sort_values()

Sentence                0
Word                    0
non-hierarchical     7936
hierarchical        15253
dtype: int64

In [108]:
# Number of distinct sentence IDs in the corpus.
df['Sentence'].nunique()

745

In [109]:
# For every hierarchical tag, count the distinct sentences it appears in,
# most widespread tag first.
(
    df.groupby('hierarchical')['Sentence']
    .nunique()
    .sort_values(ascending=False)
)

hierarchical
DEFINIENDUM    741
GENUS          615
DEFINITOR      275
Name: Sentence, dtype: int64

In [110]:
# For every non-hierarchical tag, count the distinct sentences it appears in,
# most widespread tag first.
(
    df.groupby('non-hierarchical')['Sentence']
    .nunique()
    .sort_values(ascending=False)
)

non-hierarchical
HAS_FORM              288
HAS_LOCATION          282
HAS_CAUSE             222
COMPOSITION_MEDIUM    157
HAS_FUNCTION          101
HAS_SIZE              100
CONTAINS               75
HAS_ATTRIBUTE          56
DEFINED_AS             48
HAS_RESULT             21
HAS_POSITION           18
OCCURS_IN_TIME         16
MEASURES               10
STUDIES                 7
AFFECTS                 2
OCCURS_IN_MEDIUM        1
Name: Sentence, dtype: int64

In [114]:
# Locate sentences that contain no DEFINIENDUM tag. Of the 745 sentences only
# 741 have one, so 4 are expected here; show their text for inspection.
has_definiendum = df['hierarchical'].eq('DEFINIENDUM').groupby(df['Sentence']).any()
# `idx` holds the IDs of the sentences lacking a definiendum.
idx = has_definiendum[~has_definiendum].index
df[df['Sentence'].isin(idx)].groupby('Sentence')['Word'].agg(lambda words: ' '.join(words))

Sentence
51     Dolines are also sometimes known as sinkholes ...
433    In the first edition of their book ( Ford & Wi...
589    column : A flowstone formation , generally cyl...
741        creating coatings , waterfalls and canopies .
Name: Word, dtype: object

In [32]:
# Drop every row that belongs to a sentence listed in `idx`
# (the sentences found to have no DEFINIENDUM).
df = df.loc[~df['Sentence'].isin(idx)]

## Check if hierarchical tag coincides with non-hierarchical

In [125]:
# Rows where a word carries both a hierarchical and a non-hierarchical tag.
cc = df[df['hierarchical'].notna() & df['non-hierarchical'].notna()]

In [126]:
# Among the words in `cc` (both tag columns filled), count words per hierarchical tag.
cc['hierarchical'].value_counts()

GENUS          415
DEFINIENDUM      7
DEFINITOR        7
Name: hierarchical, dtype: int64

In [127]:
# Number of distinct sentences in which each hierarchical tag coincides
# with a non-hierarchical tag, largest first.
(
    cc.groupby('hierarchical')['Sentence']
    .nunique()
    .sort_values(ascending=False)
)

hierarchical
GENUS          200
DEFINITOR        5
DEFINIENDUM      3
Name: Sentence, dtype: int64

Its mostly just GENUS that coincides with another non-hierarchical tag

In [128]:
# Show the words on which DEFINIENDUM coincides with one of the four most
# frequent non-hierarchical tags. The original list named 'HAS_CAUSE' twice
# and therefore never checked 'HAS_FORM'.
top_nonhier_tags = ['HAS_CAUSE', 'HAS_LOCATION', 'HAS_FORM', 'COMPOSITION_MEDIUM']
cc[(cc['hierarchical'] == 'DEFINIENDUM') & (cc['non-hierarchical'].isin(top_nonhier_tags))]

Unnamed: 0,Sentence,Word,hierarchical,non-hierarchical
7072,287,bowlshaped,DEFINIENDUM,HAS_CAUSE
7073,287,doline,DEFINIENDUM,HAS_CAUSE


In [131]:
# Print the full text of sentence 287, the one where DEFINIENDUM coincides
# with a non-hierarchical tag.
print(' '.join(df.loc[df['Sentence'] == 287, 'Word']))

In instances where dissolution is the prevailing mechanism , a bowlshaped doline will probably form .


# Prepare dataframe for training

One data frame will have hierarchical tags (DEFINIENDUM , GENUS, DEFINITOR),
the other frame will have non-hierarchical tags (most frequent - HAS_FORM, HAS_CAUSE, HAS_LOCATION, COMPOSITION_MEDIUM) + DEFINIENDUM
Two frames are needed, because sometimes GENUS and some hierarchical tag are coinciding.

## Hierarchical frame

In [34]:
# Hierarchical frame: discard the non-hierarchical column and expose the
# hierarchical one under the name 'Tag'; then report missing values per column.
df_h = df.drop(columns=['non-hierarchical']).rename(columns={'hierarchical': 'Tag'})
df_h.isna().sum()

Sentence        0
Word            0
Tag         15149
dtype: int64

In [36]:
# fill nans with 'O' - Other
# NOTE(review): fillna is applied to the whole frame, so a missing value in any
# column (not only 'Tag') would also become 'O'. The isna() summary above shows
# only 'Tag' has missing values for this data.
df_h = df_h.fillna('O')

In [38]:
# Distribution of tags in the hierarchical frame.
df_h['Tag'].value_counts()

O              15149
DEFINIENDUM     1245
GENUS           1078
DEFINITOR        640
Name: Tag, dtype: int64

In [39]:
# Preview the hierarchical frame before saving it.
df_h.head()

Unnamed: 0,Sentence,Word,Tag
0,0,active,DEFINIENDUM
1,0,cave,DEFINIENDUM
2,0,:,O
3,0,Cave,GENUS
4,0,containing,O


In [40]:
# Save the hierarchical frame as CSV; index=False keeps the row index out of the file.
df_h.to_csv('data/karst_hierarchical_EN.csv', index=False)

## Non-hierarchical dataframe

In [43]:
df_nh = df.copy()
# Tag is 'DEFINIENDUM' wherever the hierarchical column says so; every other
# row keeps its non-hierarchical tag (which may be NaN).
# Vectorised with Series.where instead of a row-wise DataFrame.apply.
df_nh['Tag'] = df_nh['non-hierarchical'].where(
    df_nh['hierarchical'] != 'DEFINIENDUM', 'DEFINIENDUM'
)

In [56]:
# Keep only DEFINIENDUM and the most common non-hierarchical tags; any other
# tag is blanked out here and turned into 'O' by the later fillna step.
# The original filter was a no-op: the missing commas concatenated the three
# literals into one string (with a stray leading space) that matches no tag,
# and the un-negated condition would have blanked the tags meant to be kept.
# COMPOSITION_MEDIUM is included because the notebook text lists it among the
# tags to keep.
KEPT_TAGS = ['DEFINIENDUM', 'HAS_FORM', 'HAS_CAUSE', 'HAS_LOCATION', 'COMPOSITION_MEDIUM']
df_nh.loc[~df_nh['Tag'].isin(KEPT_TAGS), 'Tag'] = np.nan

In [59]:
# Remove the two source tag columns now that 'Tag' exists; errors='ignore'
# makes the cell safe to re-run after the columns are already gone.
df_nh = df_nh.drop(['hierarchical', 'non-hierarchical'], axis=1, errors='ignore')

In [60]:
# Untagged words get the label 'O' (Other).
df_nh = df_nh.assign(Tag=df_nh['Tag'].fillna('O'))

In [61]:
# Sanity check: count missing values per column (all zeros expected after the fill).
df_nh.isna().sum()

Sentence    0
Word        0
Tag         0
dtype: int64

In [62]:
# Save the non-hierarchical frame as CSV without the row index.
df_nh.to_csv('data/karst_nonhierarchical_EN.csv', index=False)

# NEW DATA

In [69]:
# Load the new data set and show its shape.
# NOTE(review): this rebinds `df`, so the frame loaded from full_data_EN.csv
# is no longer reachable under that name from here on.
df = pd.read_csv("data/full_data_new.csv")
df.shape

(2189, 4)

In [70]:
# Distinct values of the hierarchical column in the new data (NaN included).
df['hierarchical'].unique()

array(['DEFINIENDUM', 'DEFINITOR', 'GENUS', nan], dtype=object)

In [71]:
# For the new data: distinct sentences per non-hierarchical tag, largest first.
(
    df.groupby('non-hierarchical')['Sentence']
    .nunique()
    .sort_values(ascending=False)
)

non-hierarchical
HAS_CAUSE             31
HAS_LOCATION          26
COMPOSITION_MEDIUM    24
HAS_FORM              20
HAS_RESULT            15
HAS_FUNCTION          14
HAS_ATTRIBUTE         12
HAS_SIZE              11
CONTAINS               6
MEASURES               5
DEFINED_AS             4
OCCURS_IN_TIME         2
AFFECTS                2
STUDIES                1
Name: Sentence, dtype: int64

In [None]:
import pandas as pd

In [None]:
# Load annotation.csv from the scibert experiment directory into df1.
df1= pd.read_csv('data/experiments/EN_def+gen_btag/allenai_scibert_scivocab_cased/annotation.csv')

In [None]:
# Load the experiment's test.csv into df2 for comparison with df1.
df2 = pd.read_csv('data/experiments/EN_def+gen_btag/test.csv')

In [None]:
# For each sentence ID in df1, report the first position at which its words
# differ from the same sentence in df2 (prints: sentence id, df1 word, df2 word).
# zip stops at the shorter word list, so a pure length difference is not reported.
for sid in df1['Sentence'].unique():
    left_words = df1.loc[df1['Sentence'] == sid, 'Word']
    right_words = df2.loc[df2['Sentence'] == sid, 'Word']
    first_mismatch = next(
        ((a, b) for a, b in zip(left_words, right_words) if a != b),
        None,
    )
    if first_mismatch is not None:
        print(sid, *first_mismatch)

In [None]:
# Rows of df1 from sentence 65 onwards.
df1.loc[df1['Sentence'] >= 65]

In [None]:
# Rows of df2 from sentence 65 onwards.
df2.loc[df2['Sentence'] >= 65]

In [3]:
# Load annotation.csv from the scibert experiment directory into df1.
df1= pd.read_csv('data/experiments/EN_def+gen_btag/allenai_scibert_scivocab_cased/annotation.csv')

In [4]:
# Load the experiment's test.csv into df2 for comparison with df1.
df2 = pd.read_csv('data/experiments/EN_def+gen_btag/test.csv')

In [38]:
# Walk through the sentences of df1 and print, per sentence, the first word
# pair that differs between df1 and df2 (sentence id, df1 word, df2 word).
# Only positions present in both word lists are compared (zip truncates).
for sid in df1['Sentence'].unique():
    pairs = zip(
        df1.loc[df1['Sentence'] == sid, 'Word'],
        df2.loc[df2['Sentence'] == sid, 'Word'],
    )
    differing = [(a, b) for a, b in pairs if a != b]
    if differing:
        print(sid, differing[0][0], differing[0][1])

63 0 0.002
65 15 15%
67 ##outh baymouth
73 sediments-mainly sediments
91 earth earth's


In [32]:
# Rows of df1 from sentence 65 onwards, to inspect the reported mismatches.
df1.loc[df1['Sentence'] >= 65]

Unnamed: 0,Sentence,Word,Tag
1354,65,The,O
1355,65,term,O
1356,65,arenite,B-DEFINIENDUM
1357,65,applies,O
1358,65,to,O
...,...,...,...
2185,102,metamorphism,O
2186,102,of,O
2187,102,volcanic,O
2188,102,rocks,O


In [31]:
# Rows of df2 from sentence 65 onwards, to inspect the reported mismatches.
df2.loc[df2['Sentence'] >= 65]

Unnamed: 0,Sentence,Word,Tag
1352,65,The,O
1353,65,term,O
1354,65,arenite,B-DEFINIENDUM
1355,65,applies,O
1356,65,to,O
...,...,...,...
2184,102,metamorphism,O
2185,102,of,O
2186,102,volcanic,O
2187,102,rocks,O
