In [2]:
#fasttext installation
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 21.8 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 27.5 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 32.8 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 37.5 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 2.8 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.7.1-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3094357 sha256=a63d44734e4931ea534fa03cca846d011617e73e0458c1bdc9dd7d4d2d1f60ec
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a

In [3]:
# Importing libraries
# splitting in train-validation-test sets in a stratified manner.
from sklearn.model_selection import train_test_split
import numpy as np, pandas as pd
# NLP Preprocessing
from gensim.utils import simple_preprocess
import fasttext
import csv
from sklearn import metrics

In [4]:
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data = load_corpus('dataset_aueb_structure_v2.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data)} abstracts')
data.sample(5)

Dataset length: 1014 abstracts


Unnamed: 0,document,sentences,labels
963,doi: 10.5194/acp-19-11315-2019,[Retrieval of ice-nucleating particle concentr...,"[NEITHER, OBJECTIVE, BACKGROUND, BACKGROUND, B..."
519,doi: 10.1039/c6tc01409g,"[Design, synthesis, chemical stability, packin...","[NEITHER, OBJECTIVE, METHOD, RESULT, RESULT, R..."
199,doi: 10.1016/j.hal.2019.101655,[Climate change and harmful benthic microalgae...,"[NEITHER, BACKGROUND, RESULT, RESULT, RESULT, ..."
907,doi: 10.3389/fphys.2018.00213,[Distinct ECG Phenotypes Identified in Hypertr...,"[NEITHER, OBJECTIVE, OBJECTIVE, METHOD, METHOD..."
235,doi: 10.1016/j.matbio.2018.04.006,[Liver fibrosis: Direct antifibrotic agents an...,"[NEITHER, BACKGROUND, BACKGROUND, BACKGROUND, ..."


## Split Documents
For the cases we want the sentences separated, the following splits the documents. I keep the same document index in a new column in order to re-group the sentences to a document (e.g., after predictions).

In [5]:
#title Split to sentences
sentences = data['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences


Unnamed: 0,doc_id,sentence
0,0,Concordance Between Different Amyloid Immunoas...
1,0,Importance Visual assessment of amyloid positr...
2,0,Several immunoassays have been developed to me...
3,0,The agreement between CSF Aβ42 measures from d...
4,0,Objective To determine the concordance between...
...,...,...
10543,1013,"Instead, SBPs sample a range of conformations ..."
10544,1013,Certain non-transported ligands leave the stru...
10545,1013,"Intriguingly, in some cases, similar SBP confo..."
10546,1013,"In this case, the inability for transport aris..."


In [6]:
# and the corresponding labels
labels = data['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})
labels

Unnamed: 0,doc_id,label
0,0,NEITHER
1,0,BACKGROUND
2,0,BACKGROUND
3,0,BACKGROUND
4,0,OBJECTIVE
...,...,...
10543,1013,METHOD
10544,1013,RESULT
10545,1013,RESULT
10546,1013,RESULT


In [7]:
sentences['sentence'] = sentences['sentence'].astype("string")
# remove any blanks from the start and the and of the string
sentences['sentence'] = sentences['sentence'].str.strip()
sentences["label"]=labels["label"]
data=sentences

In [8]:
#create new train and test set for fasttext
X_train_val, X_test, y_train_val, y_test = train_test_split(data['sentence'],
                                                            data['label'],
                                                            test_size=0.15,
                                                            random_state=42,
                                                            stratify=data['label'])

In [9]:
#create new train and validation set
X_train,X_val, y_train, y_val = train_test_split(X_train_val,
                                                y_train_val,
                                                test_size=0.15,
                                                random_state=42,
                                                stratify=y_train_val)

In [10]:
# Text Classification with fastText
# Importing libraries
import numpy as np, pandas as pd

# NLP Preprocessing
from gensim.utils import simple_preprocess

# Importing the dataset
df_train=pd.concat([X_train,y_train], axis=1)
df_val =pd.concat([X_val,y_val], axis=1)
df_test =pd.concat([X_test,y_test], axis=1)

# NLP Preprocess
df_train.iloc[:, 0] = df_train.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
df_val.iloc[:, 0] = df_val.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
df_test.iloc[:, 0] = df_test.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))


# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop = stopwords.words('english')
# df_train['sentence'] = df_train['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# df_val['sentence'] = df_val['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# df_test['sentence'] = df_test['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Prefixing each row of the category column with '__label__'
df_train.iloc[:, 1] = df_train.iloc[:, 1].apply(lambda x: '__label__' + x)
df_val.iloc[:, 1] = df_val.iloc[:, 1].apply(lambda x: '__label__' + x)
df_test.iloc[:, 1] = df_test.iloc[:, 1].apply(lambda x: '__label__' + x)

In [11]:

df_train[['label', 'sentence']].to_csv('train.txt', index = False, sep = ' ', header = None, 
                                          quoting = csv.QUOTE_NONE, quotechar = "",escapechar = " ")

df_test[['label', 'sentence']].to_csv('test.txt', index = False, sep = ' ', header = None, 
                                     quoting = csv.QUOTE_NONE, quotechar = "", escapechar = " ")

df_val[['label', 'sentence']].to_csv('valid.txt', index = False, sep = ' ', header = None, 
                                     quoting = csv.QUOTE_NONE, quotechar = "", escapechar = " ")



# Training the fastText classifier
model = fasttext.train_supervised('train.txt',autotuneValidationFile='valid.txt', autotuneDuration=600)

# Evaluating performance on train file

model.test('train.txt')#overfitting

(7620, 0.7161417322834646, 0.7161417322834646)

In [None]:
#find hyperprameters for optimized model
args_obj = model.f.getArgs()
for hparam in dir(args_obj):
    if not hparam.startswith('__'):
        print(f"{hparam} -> {getattr(args_obj, hparam)}")

autotuneDuration -> 300
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> 
bucket -> 0
cutoff -> 0
dim -> 110
dsub -> 2
epoch -> 5
input -> train.txt
label -> __label__
loss -> loss_name.softmax
lr -> 0.12
lrUpdateRate -> 100
maxn -> 0
minCount -> 1
minCountLabel -> 0
minn -> 0
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7fd9f13017b0>>
t -> 0.0001
thread -> 1
verbose -> 2
wordNgrams -> 1
ws -> 5


In [12]:
#set manually the hyperparameters to reduce overfitting
model = fasttext.train_supervised('train.txt',epoch=5,dim=110,lr=0.12)

In [13]:
#create a list with sentence and labels for train and test set
with open("train.txt") as file_in:
    lines_tr = []
    for line in file_in:
        line = line.rstrip('\n')
        lines_tr.append(line)

with open("test.txt") as file_in:
    lines_te = []
    for line in file_in:
        line = line.rstrip('\n')
        lines_te.append(line)

In [14]:
#predictions for training set
train_pred=[]
for i in lines_tr:
  x=model.predict(i)
  train_pred.append(x)# append tuple to list

tr_pred=[]
for a,b in train_pred:#tuple unpacking
   a=''.join(a)
   tr_pred.append(a)

#predictions for test set
test_pred=[]
for i in lines_te:
  x=model.predict(i)
  test_pred.append(x)#append tuple to list

ts_pred=[]
for a,b in test_pred:#tuple unpacking
   a=''.join(a)
   ts_pred.append(a)


In [15]:

print(metrics.classification_report(tr_pred,df_train['label']))#print precision,recall, f1-score
print()
print(metrics.confusion_matrix(tr_pred,df_train['label']))#print confusion matrix

                     precision    recall  f1-score   support

__label__BACKGROUND       0.81      0.63      0.71      1986
__label__CONCLUSION       0.13      0.66      0.22       176
    __label__METHOD       0.47      0.68      0.55       792
   __label__NEITHER       0.88      0.80      0.84       806
 __label__OBJECTIVE       0.71      0.74      0.72      1287
    __label__RESULT       0.85      0.65      0.74      2573

           accuracy                           0.68      7620
          macro avg       0.64      0.69      0.63      7620
       weighted avg       0.76      0.68      0.71      7620


[[1247  320  108   38  131  142]
 [  13  117   18    3   17    8]
 [  24   52  540    3   93   80]
 [  49   49   21  643   34   10]
 [  50   74  141   24  948   50]
 [ 155  285  330   22  117 1664]]


In [16]:

print(metrics.classification_report(ts_pred,df_test['label']))#print precision,recall, f1-score
print()
print(metrics.confusion_matrix(ts_pred,df_test['label']))#print confusion matrix

                     precision    recall  f1-score   support

__label__BACKGROUND       0.74      0.56      0.64       420
__label__CONCLUSION       0.11      0.68      0.19        31
    __label__METHOD       0.41      0.60      0.49       165
   __label__NEITHER       0.70      0.68      0.69       155
 __label__OBJECTIVE       0.62      0.62      0.62       277
    __label__RESULT       0.77      0.58      0.66       535

           accuracy                           0.60      1583
          macro avg       0.56      0.62      0.55      1583
       weighted avg       0.68      0.60      0.62      1583


[[236  68  33  11  31  41]
 [  4  21   2   0   1   3]
 [  3   8  99   6  24  25]
 [ 10  11   9 106  11   8]
 [ 18  17  36  15 173  18]
 [ 49  61  61  14  39 311]]
