In [None]:
#fasttext installation
!pip install fasttext

In [None]:
# Importing libraries
# splitting in train-validation-test sets in a stratified manner.
from sklearn.model_selection import train_test_split
import numpy as np, pandas as pd
# NLP Preprocessing
from gensim.utils import simple_preprocess
import fasttext
import csv
from sklearn import metrics

In [1]:
import json
import pandas as pd

label2id = {
    'NONE': 0,
    'EVIDENCE': 1,
    'CLAIM': 2}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

data_v1 = load_corpus('dataset.json') #, label_mapping=label2id)
data_v3= load_corpus('dataset_aueb_argument_v3.json') #, label_mapping=label2id)
print(f'Dataset length: {len(data_v1)+ len(data_v3)} abstracts')
data_v1.head(5)

Dataset length: 2686 abstracts


Unnamed: 0,document,sentences,labels
0,DEI_G2B1_15.txt,[Gender Differences in Anxiety and Depression ...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
1,DEI_G2B1_23.txt,"[Women's economic empowerment, participation i...","[NONE, NONE, NONE, NONE, NONE, NONE, NONE, EVI..."
2,DEI_G2B1_24.txt,[Forced sterilization of women as discriminati...,"[NONE, NONE, NONE, NONE, NONE, NONE, NONE, NON..."
3,DEI_G2B1_31.txt,[Relationship of gender differences in prefere...,"[NONE, NONE, NONE, NONE, NONE, EVIDENCE, CLAIM]"
4,DEI_G2B1_39.txt,"[Women’s Assessments of Gender Equality, Abstr...","[NONE, NONE, NONE, NONE, EVIDENCE, EVIDENCE, C..."


## Split Documents
For the cases we want the sentences separated, the following splits the documents. I keep the same document index in a new column in order to re-group the sentences to a document (e.g., after predictions).

In [None]:
#@title Split to sentences
sentences_v1 = data_v1['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})

sentences_v3 = data_v3['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})




In [None]:
#@title and the corresponding labels
labels_v1 = data_v1['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})

labels_v3 = data_v3['labels'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'labels': 'label'})


In [None]:

#conver the sentence column from object to string
sentences_v1['sentence'] = sentences_v1['sentence'].astype("string")
sentences_v3['sentence'] = sentences_v3['sentence'].astype("string")
# remove any blanks from the start and the and of the string
sentences_v1['sentence'] = sentences_v1['sentence'].str.strip()
sentences_v3['sentence'] = sentences_v3['sentence'].str.strip()
sentences_v1["label"]=labels_v1["label"]
sentences_v3["label"]=labels_v3["label"]
#Coancatenate the two dataframes
data = sentences_v1.append(sentences_v3, ignore_index=True)
#repalce the none labels with neither
data['label'] = data['label'].str.replace('NONE','NEITHER')
data.head()

Unnamed: 0,doc_id,sentence,label
0,0,Gender Differences in Anxiety and Depression b...,NEITHER
1,0,Abstract,NEITHER
2,0,Background/aims: The aim of this prospective s...,NEITHER
3,0,"Methods: AUD severity, state and trait anxiety...",NEITHER
4,0,Follow-up assessments were performed at 6 and ...,NEITHER


In [None]:
#create new train and test set for fasttext
X_train_val, X_test, y_train_val, y_test = train_test_split(data['sentence'],
                                                            data['label'],
                                                            test_size=0.15,
                                                            random_state=42,
                                                            stratify=data['label'])

In [None]:
#create new train and validation set
X_train,X_val, y_train, y_val = train_test_split(X_train_val,
                                                y_train_val,
                                                test_size=0.15,
                                                random_state=42,
                                                stratify=y_train_val)

In [None]:

# Importing the dataset
df_train=pd.concat([X_train,y_train], axis=1)
df_val =pd.concat([X_val,y_val], axis=1)
df_test =pd.concat([X_test,y_test], axis=1)

# NLP Preprocess
df_train.iloc[:, 0] = df_train.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
df_val.iloc[:, 0] = df_val.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))
df_test.iloc[:, 0] = df_test.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))

# #import nlp package for stopwords
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop = stopwords.words('english')
# #Remove stopwords
# df_train['sentence'] = df_train['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# df_val['sentence'] = df_val['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# df_test['sentence'] = df_test['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Prefixing each row of the category column with '__label__'
df_train.iloc[:, 1] = df_train.iloc[:, 1].apply(lambda x: '__label__' + x)
df_val.iloc[:, 1] = df_val.iloc[:, 1].apply(lambda x: '__label__' + x)
df_test.iloc[:, 1] = df_test.iloc[:, 1].apply(lambda x: '__label__' + x)

In [None]:
# Saving the CSV file as a text file to train/valid the classifier

df_train[['label', 'sentence']].to_csv('train.txt', index = False, sep = ' ', header = None, 
                                          quoting = csv.QUOTE_NONE, quotechar = "",escapechar = " ")

df_test[['label', 'sentence']].to_csv('test.txt', index = False, sep = ' ', header = None, 
                                     quoting = csv.QUOTE_NONE, quotechar = "", escapechar = " ")

df_val[['label', 'sentence']].to_csv('valid.txt', index = False, sep = ' ', header = None, 
                                     quoting = csv.QUOTE_NONE, quotechar = "", escapechar = " ")


# Training the fastText classifier
model = fasttext.train_supervised('train.txt',autotuneValidationFile='valid.txt', autotuneDuration=600)




In [None]:
model.test('train.txt')

(23122, 0.8169708502724677, 0.8169708502724677)

In [None]:
model.test('valid.txt')

(4081, 0.7755452095074736, 0.7755452095074736)

In [None]:
#find hyperprameters for optimized model
args_obj = model.f.getArgs()
for hparam in dir(args_obj):
    if not hparam.startswith('__'):
        print(f"{hparam} -> {getattr(args_obj, hparam)}")

autotuneDuration -> 600
autotuneMetric -> f1
autotuneModelSize -> 
autotunePredictions -> 1
autotuneValidationFile -> valid.txt
bucket -> 3179202
cutoff -> 0
dim -> 109
dsub -> 2
epoch -> 12
input -> train.txt
label -> __label__
loss -> loss_name.softmax
lr -> 0.1147434290700611
lrUpdateRate -> 100
maxn -> 6
minCount -> 1
minCountLabel -> 0
minn -> 3
model -> model_name.supervised
neg -> 5
output -> 
pretrainedVectors -> 
qnorm -> False
qout -> False
retrain -> False
saveOutput -> False
seed -> 0
setManual -> <bound method PyCapsule.setManual of <fasttext_pybind.args object at 0x7fde46dab270>>
t -> 0.0001
thread -> 1
verbose -> 2
wordNgrams -> 1
ws -> 5


In [None]:
#create a list with sentence and labels for train and test set
with open("train.txt") as file_in:
    lines_tr = []
    for line in file_in:
        line = line.rstrip('\n')
        lines_tr.append(line)

with open("test.txt") as file_in:
    lines_te = []
    for line in file_in:
        line = line.rstrip('\n')
        lines_te.append(line)

In [None]:
#predictions for training set
train_pred=[]
for i in lines_tr:
  x=model.predict(i)
  train_pred.append(x)# append tuple to list

tr_pred=[]
for a,b in train_pred:#tuple unpacking
   a=''.join(a)
   tr_pred.append(a)

#predictions for test set
test_pred=[]
for i in lines_te:
  x=model.predict(i)
  test_pred.append(x)#append tuple to list

ts_pred=[]
for a,b in test_pred:#tuple unpacking
   a=''.join(a)
   ts_pred.append(a)


In [None]:

print(metrics.classification_report(tr_pred,df_train['label']))
print()
print(metrics.confusion_matrix(tr_pred,df_train['label']))

                   precision    recall  f1-score   support

   __label__CLAIM       0.34      0.71      0.46      1193
__label__EVIDENCE       0.61      0.76      0.67      3590
 __label__NEITHER       0.95      0.84      0.89     18339

         accuracy                           0.82     23122
        macro avg       0.63      0.77      0.67     23122
     weighted avg       0.86      0.82      0.83     23122


[[  846   108   239]
 [  265  2721   604]
 [ 1359  1657 15323]]


In [None]:

print(metrics.classification_report(ts_pred,df_test['label']))
print()
print(metrics.confusion_matrix(ts_pred,df_test['label']))

                   precision    recall  f1-score   support

   __label__CLAIM       0.27      0.54      0.36       253
__label__EVIDENCE       0.51      0.65      0.57       737
 __label__NEITHER       0.91      0.80      0.85      3811

         accuracy                           0.77      4801
        macro avg       0.56      0.67      0.60      4801
     weighted avg       0.82      0.77      0.79      4801


[[ 137   32   84]
 [  49  479  209]
 [ 327  421 3063]]
