In [17]:
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
)
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from transformers import pipeline

In [18]:
model_directory="roberta_final_top_no_zero_save"
model = RobertaForSequenceClassification.from_pretrained(model_directory)
tokenizer= RobertaTokenizerFast.from_pretrained(model_directory, truncation=True)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
tokenizer_kwargs = {'truncation':True,'max_length':512,}

In [19]:
def get_model_labels(df,classifier):
    res=classifier(df.text.to_list(),**tokenizer_kwargs)
    df["model_label"]=[res[i]["label"] for i in range(0,len(df))]
    df["score"]=[res[i]["score"] for i in range(0,len(df))]
    #df=df.replace({"model_label":{0: "0",1: "E1", 2: "E2", 3: "E3", 4:"E4", 5 : "E5", 6: "S1", 7:"S2",8: "S3", 9: "S4", 10:"G1"}})
    df=df.replace({"model_label":{0: "E1", 1: "E2", 2: "E3", 3:"E4", 4 : "E5", 5: "S1", 6:"S2",7: "S3", 8: "S4", 9:"G1"}})
    return df

In [20]:
def evaluate(df):
    df['agreement'] = (df['label']==df['model_label']).astype(int)
    print("Cohen kappa score = {}".format(cohen_kappa_score(df.label,df.model_label)))
    print("Accuracy = {}".format(df.agreement.mean()))
    return df


In [21]:
test_set_top=pd.read_csv("..\EXPERIMENTAL\ex_sample_unlabeled.csv",index_col=0).reset_index(drop=True)

In [22]:
test_set_top.groupby("label")["text"].count()

label
0      36
E1    116
E2      6
E3     22
E4     11
E5     29
G1     66
S1     74
S2      4
S3      2
S4      3
Name: text, dtype: int64

In [23]:
test_set_top=evaluate(get_model_labels(test_set_top,classifier))

Cohen kappa score = 0.7840506954160151
Accuracy = 0.8265582655826558


In [24]:
evaluate(test_set_top[test_set_top["label"]!="0"])

Cohen kappa score = 0.891901918729349
Accuracy = 0.9159159159159159


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['agreement'] = (df['label']==df['model_label']).astype(int)


Unnamed: 0,text,label,model_label,score,agreement
0,"With special training programs, we ensure that...",S1,G1,0.977822,0
1,An important objective of our strategy is fost...,E5,E5,0.999718,1
2,We took a close look at our companys remunerat...,G1,G1,0.999825,1
3,"The health, safety and professional and person...",S1,S1,0.999748,1
4,Our actions are determined by the principles o...,G1,G1,0.999809,1
...,...,...,...,...,...
363,Our managers bear great responsibility for the...,S1,S1,0.999733,1
364,"We not only want to be the Employer of Choice,...",S1,S1,0.999713,1
365,We aim to cut the specific primary energy usag...,E1,E1,0.999819,1
367,In 2020 our annual employee survey included qu...,S1,G1,0.985651,0


In [25]:
test_set_mid=pd.read_csv("..\mid_sample.csv").reset_index(drop=True)

In [26]:
test_set_mid.groupby("label")["text"].count()

label
0     167
E1     47
E2      7
E3      5
E4      6
E5     15
G1     18
S1     56
S2     18
S3     14
S4     16
Name: text, dtype: int64

In [27]:
test_set_mid=evaluate(get_model_labels(test_set_mid,classifier))

Cohen kappa score = 0.37042765570685976
Accuracy = 0.42005420054200543


In [28]:
evaluate(test_set_mid[test_set_mid["label"]!="0"])

Cohen kappa score = 0.7245321340490352
Accuracy = 0.7673267326732673


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['agreement'] = (df['label']==df['model_label']).astype(int)


Unnamed: 0,text,label,model_label,score,agreement
0,"Twice a year, we offer our employees in German...",S1,S1,0.999736,1
1,Strong economic performance is a key prerequis...,G1,S4,0.999733,0
3,"In 2017, we procured a total of 131,618 MWh of...",E1,E1,0.999808,1
4,We engage closely with employee representative...,S1,S1,0.999729,1
5,Fluctuations in future cash flows resulting fr...,G1,E1,0.778985,0
...,...,...,...,...,...
357,G4-HR1 Investment agreements that include huma...,S2,S1,0.999683,0
360,Income taxes comprise the taxes levied on taxa...,G1,G1,0.999761,1
363,Nathalie Hideborg sees diversity as a key to c...,S1,S1,0.999768,1
367,The new area will strengthen our cross-organiz...,S4,S4,0.999467,1


In [40]:
test_set_top_mid=pd.concat([test_set_top,test_set_mid]).reset_index(drop=True)

In [41]:
evaluate(test_set_top_mid)

Cohen kappa score = 0.5705089691871943
Accuracy = 0.6233062330623306


Unnamed: 0,text,label,model_label,score,agreement
0,"With special training programs, we ensure that...",S1,G1,0.977822,0
1,An important objective of our strategy is fost...,E5,E5,0.999718,1
2,We took a close look at our companys remunerat...,G1,G1,0.999825,1
3,"The health, safety and professional and person...",S1,S1,0.999748,1
4,Our actions are determined by the principles o...,G1,G1,0.999809,1
...,...,...,...,...,...
733,RWE is an international group which including ...,0,S2,0.724211,0
734,We are involved in developing a solution for a...,0,E4,0.831466,0
735,These supplemental financial measures should n...,0,S1,0.999750,0
736,The new area will strengthen our cross-organiz...,S4,S4,0.999467,1


In [42]:
evaluate(test_set_top_mid[test_set_top_mid["label"]!="0"])

Cohen kappa score = 0.8280597859156861
Accuracy = 0.8598130841121495


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['agreement'] = (df['label']==df['model_label']).astype(int)


Unnamed: 0,text,label,model_label,score,agreement
0,"With special training programs, we ensure that...",S1,G1,0.977822,0
1,An important objective of our strategy is fost...,E5,E5,0.999718,1
2,We took a close look at our companys remunerat...,G1,G1,0.999825,1
3,"The health, safety and professional and person...",S1,S1,0.999748,1
4,Our actions are determined by the principles o...,G1,G1,0.999809,1
...,...,...,...,...,...
726,G4-HR1 Investment agreements that include huma...,S2,S1,0.999683,0
729,Income taxes comprise the taxes levied on taxa...,G1,G1,0.999761,1
732,Nathalie Hideborg sees diversity as a key to c...,S1,S1,0.999768,1
736,The new area will strengthen our cross-organiz...,S4,S4,0.999467,1


In [43]:
test_set_top_mid.groupby("label")["agreement"].mean()

label
0     0.000000
E1    0.926380
E2    0.923077
E3    0.925926
E4    1.000000
E5    0.909091
G1    0.869048
S1    0.830769
S2    0.318182
S3    0.562500
S4    0.947368
Name: agreement, dtype: float64

## Errors

In [44]:
test_set_top_mid[(test_set_top_mid["agreement"]==0) & ((test_set_top_mid["label"]!="0"))].groupby("label")["agreement"].count()

label
E1    12
E2     1
E3     2
E5     4
G1    11
S1    22
S2    15
S3     7
S4     1
Name: agreement, dtype: int64

In [45]:
test_set_top_mid[(test_set_top_mid["agreement"]==0) & ((test_set_top_mid["label"]=="E1"))].groupby("model_label")["agreement"].count()

model_label
E2    2
E3    2
E4    1
E5    2
S2    1
S4    4
Name: agreement, dtype: int64

In [52]:
df_errors=test_set_top_mid[(test_set_top_mid["agreement"]==0) & ((test_set_top_mid["label"]!="0"))].to_csv("errors.csv")