### Model 3

https://huggingface.co/chkla/roberta-argument

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
import nltk
import math
from sklearn.feature_extraction.text import CountVectorizer
import torch

In [2]:
# Both domain files are read with identical options: tab-separated, UTF-8,
# and only the 'sentence' and 'annotation' columns are loaded.
read_opts = dict(sep='\t', encoding='utf-8', usecols=['sentence', 'annotation'])
domain_1 = pd.read_csv('domain_1_set_clean.csv', **read_opts)
domain_2 = pd.read_csv('domain_2_set_clean.csv', **read_opts)

In [3]:
# keep only the rows whose 'sentence' value is present (NaN sentences are removed)
domain_1 = domain_1.dropna(subset=['sentence'])

In [4]:
# drop two arguments and two non-arguments: the rows at positions 0, 1, 2 and 12095
# NOTE: the selection is positional, so re-running this cell removes further rows
rows_to_drop = domain_1.index[[0, 1, 2, 12095]]
domain_1 = domain_1.drop(index=rows_to_drop)

In [5]:
# class distribution of the 'annotation' column after the row drops
domain_1["annotation"].value_counts()

non_argument    6048
argument        6048
Name: annotation, dtype: int64

In [6]:
# total number of remaining rows in domain 1
len(domain_1)

12096

In [7]:
# keep only the rows whose 'sentence' value is present (NaN sentences are removed)
domain_2 = domain_2.dropna(subset=['sentence'])

In [8]:
# drop two arguments: the rows at positions 0 and 3
# NOTE: the selection is positional, so re-running this cell removes further rows
rows_to_drop = domain_2.index[[0, 3]]
domain_2 = domain_2.drop(index=rows_to_drop)

In [9]:
# class distribution of the 'annotation' column after the row drops
domain_2["annotation"].value_counts()

non_argument    6048
argument        6048
Name: annotation, dtype: int64

In [10]:
# total number of remaining rows in domain 2
len(domain_2)

12096

### Prepare model inputs

In [11]:
# drop nan sentences (repeats the earlier filter on domain_1; removes nothing if
# that filter has already been applied)
domain_1 = domain_1.dropna(subset=['sentence'])

In [12]:
# verify that the NaN filter worked: print every sentence that is still a float
# (nothing is printed when none is left)
leftover_floats = [entry for entry in domain_1["sentence"] if type(entry) is float]
for entry in leftover_floats:
    print(entry)

In [13]:
# row count of domain 1 after the repeated NaN filter
len(domain_1)

12096

In [14]:
# display domain 1 (pandas renders a truncated preview of the frame)
domain_1

Unnamed: 0,sentence,annotation
3,cantons abrahamic religions judaism christiani...,non_argument
4,historians sympathized millau viaduct worlds p...,non_argument
5,mechanics ensured virginia twelve celebrities ...,non_argument
6,planets intended meaning free adopted mexican–...,non_argument
7,fiji francophonie canada additionally one gdp,non_argument
...,...,...
12094,limit use diverse product one use found object...,argument
12096,connect cheap offwhite paper known rhône valle...,non_argument
12097,continent equal zero chemical formula often,non_argument
12098,meant view penrose discusses long career also ...,non_argument


In [15]:
# display domain 2 (pandas renders a truncated preview of the frame)
domain_2

Unnamed: 0,sentence,annotation
1,present systematic nursing hospitals primary p...,non_argument
2,distance fact form data transferred computer,non_argument
4,strategic bombing sailors sometimes also symbo...,non_argument
5,longer incubation alaska known fundamental int...,non_argument
6,confirm sponsored helped sardinian plotters or...,non_argument
...,...,...
12095,ask permission time allowed,argument
12096,children learn manage money young age,argument
12097,lowlevel employee desire enhance position fair...,argument
12098,among famous works protocubist les demoiselles...,non_argument


In [16]:
# plain Python lists of the sentences (X) and their annotations (y), per domain
X_df_1 = domain_1['sentence'].tolist()
y_df_1 = domain_1['annotation'].tolist()

X_df_2 = domain_2['sentence'].tolist()
y_df_2 = domain_2['annotation'].tolist()

### Predict for Domain 1

In [17]:
tokenizer = AutoTokenizer.from_pretrained("chkla/roberta-argument")

model = AutoModelForSequenceClassification.from_pretrained("chkla/roberta-argument")
# inference only: make sure dropout and similar layers are disabled
model.eval()

# Tokenizing every sentence at once and running a single forward pass pads the
# whole dataset to its longest sentence and keeps all activations in memory at
# the same time. Mini-batches bound the memory use; padding is applied per batch.
BATCH_SIZE = 32

# predicted label names for all sentences of domain 1, in input order
curr_label = []

with torch.no_grad():
    for start in range(0, len(X_df_1), BATCH_SIZE):
        batch_sentences = X_df_1[start:start + BATCH_SIZE]

        # apply tokenizer to the current batch of sentences
        inputs = tokenizer(batch_sentences, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")

        outputs = model(**inputs)

        # softmax is monotonic, so argmax over the raw logits selects the same
        # class as argmax over the softmax probabilities
        labels = torch.argmax(outputs.logits, dim=1)

        curr_label.extend(model.config.id2label[label_id] for label_id in labels.tolist())

# stored as a one-element list of lists because the label-mapping cell reads
# predicted_labels[0]
predicted_labels = [curr_label]

In [18]:
# map labels: the model's 'NON-ARGUMENT' becomes 'non_argument'; every other
# label becomes 'argument'
predicted_labels_interpreted = [
    'non_argument' if model_label == 'NON-ARGUMENT' else 'argument'
    for model_label in predicted_labels[0]
]

In [19]:
# store predictions in the existing dataframe as a new column; the list is
# assigned positionally, so it must have exactly one entry per row of domain_1
domain_1['predicted_labels'] = predicted_labels_interpreted

In [20]:
# display domain 1 including the new 'predicted_labels' column
domain_1

Unnamed: 0,sentence,annotation,predicted_labels
3,cantons abrahamic religions judaism christiani...,non_argument,non_argument
4,historians sympathized millau viaduct worlds p...,non_argument,non_argument
5,mechanics ensured virginia twelve celebrities ...,non_argument,argument
6,planets intended meaning free adopted mexican–...,non_argument,non_argument
7,fiji francophonie canada additionally one gdp,non_argument,non_argument
...,...,...,...
12094,limit use diverse product one use found object...,argument,non_argument
12096,connect cheap offwhite paper known rhône valle...,non_argument,non_argument
12097,continent equal zero chemical formula often,non_argument,non_argument
12098,meant view penrose discusses long career also ...,non_argument,non_argument


In [21]:
# gold annotations and model predictions used for the evaluation below
y_true_1 = domain_1['annotation']
y_pred_1 = domain_1['predicted_labels']
# keep a second copy of the predictions under a model/domain-specific column name
domain_1['predicted_labels_model_3_domain_1'] = y_pred_1

In [22]:
# rows = true class, columns = predicted class, both in sorted label order
# (argument, non_argument)
print(confusion_matrix(y_true_1, y_pred_1))

[[4275 1773]
 [ 172 5876]]


In [23]:
# per-class precision / recall / F1 and overall accuracy for domain 1
print(classification_report(y_true_1, y_pred_1))

              precision    recall  f1-score   support

    argument       0.96      0.71      0.81      6048
non_argument       0.77      0.97      0.86      6048

    accuracy                           0.84     12096
   macro avg       0.86      0.84      0.84     12096
weighted avg       0.86      0.84      0.84     12096



In [24]:
# write domain 1 with its predictions as a tab-separated UTF-8 file; the
# dataframe index is written as well (first, unnamed column)
domain_1.to_csv("M3_domain_1.csv", sep='\t', encoding='utf-8')

### Predict for Domain 2

In [25]:
# tokenizer and model are created again here, so this section does not rely on
# the objects built in the Domain 1 section
tokenizer = AutoTokenizer.from_pretrained("chkla/roberta-argument")

model = AutoModelForSequenceClassification.from_pretrained("chkla/roberta-argument")
# inference only: make sure dropout and similar layers are disabled
model.eval()

# Tokenizing every sentence at once and running a single forward pass pads the
# whole dataset to its longest sentence and keeps all activations in memory at
# the same time. Mini-batches bound the memory use; padding is applied per batch.
BATCH_SIZE = 32

# predicted label names for all sentences of domain 2, in input order
curr_label = []

with torch.no_grad():
    for start in range(0, len(X_df_2), BATCH_SIZE):
        batch_sentences = X_df_2[start:start + BATCH_SIZE]

        # apply tokenizer to the current batch of sentences
        inputs = tokenizer(batch_sentences, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")

        outputs = model(**inputs)

        # softmax is monotonic, so argmax over the raw logits selects the same
        # class as argmax over the softmax probabilities
        labels = torch.argmax(outputs.logits, dim=1)

        curr_label.extend(model.config.id2label[label_id] for label_id in labels.tolist())

# stored as a one-element list of lists because the label-mapping cell reads
# predicted_labels[0]
predicted_labels = [curr_label]

In [26]:
# map labels: the model's 'NON-ARGUMENT' becomes 'non_argument'; every other
# label becomes 'argument'
predicted_labels_interpreted = [
    'non_argument' if model_label == 'NON-ARGUMENT' else 'argument'
    for model_label in predicted_labels[0]
]

In [27]:
# store predictions in the existing dataframe as a new column; the list is
# assigned positionally, so it must have exactly one entry per row of domain_2
domain_2['predicted_labels'] = predicted_labels_interpreted

In [28]:
# display domain 2 including the new 'predicted_labels' column
domain_2

Unnamed: 0,sentence,annotation,predicted_labels
1,present systematic nursing hospitals primary p...,non_argument,non_argument
2,distance fact form data transferred computer,non_argument,non_argument
4,strategic bombing sailors sometimes also symbo...,non_argument,non_argument
5,longer incubation alaska known fundamental int...,non_argument,non_argument
6,confirm sponsored helped sardinian plotters or...,non_argument,non_argument
...,...,...,...
12095,ask permission time allowed,argument,non_argument
12096,children learn manage money young age,argument,non_argument
12097,lowlevel employee desire enhance position fair...,argument,non_argument
12098,among famous works protocubist les demoiselles...,non_argument,non_argument


In [29]:
# gold annotations and model predictions used for the evaluation below
y_true_2 = domain_2['annotation']
y_pred_2 = domain_2['predicted_labels']
# keep a second copy of the predictions under a model/domain-specific column name
domain_2['predicted_labels_model_3_domain_2'] = y_pred_2

In [30]:
# rows = true class, columns = predicted class, both in sorted label order
# (argument, non_argument)
print(confusion_matrix(y_true_2, y_pred_2))

[[1648 4400]
 [ 173 5875]]


In [31]:
# per-class precision / recall / F1 and overall accuracy for domain 2
print(classification_report(y_true_2, y_pred_2))

              precision    recall  f1-score   support

    argument       0.90      0.27      0.42      6048
non_argument       0.57      0.97      0.72      6048

    accuracy                           0.62     12096
   macro avg       0.74      0.62      0.57     12096
weighted avg       0.74      0.62      0.57     12096



In [32]:
# write domain 2 with its predictions as a tab-separated UTF-8 file; the
# dataframe index is written as well (first, unnamed column)
domain_2.to_csv("M3_domain_2.csv", sep='\t', encoding='utf-8')