In [None]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score, classification_report
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from matplotlib import pyplot
from collections import Counter

In [None]:
# Label names grouped into five higher-level categories
# (category name -> list of label names belonging to it).
categories = {
    'Promotingdiscussion': [
        "CounterArgument", "Clarification", "RequestClarification",
        "Extension", "Answer", "AttackValidity",
        "Moderation", "Personal", "ViableTransformation",
    ],
    'Low responsiveness': [
        "Convergence", "NegTransformation", "NoReasonDisagreement",
        "AgreeToDisagree", "Repetition", "BAD",
    ],
    'Tone & Style': [
        "Complaint", "Positive", "Aggressive",
        "Sarcasm", "WQualifiers", "Ridicule",
    ],
    'Easing tension': [
        "Sources", "Softening", "DoubleVoicing", "AgreeBut",
    ],
    'Intensifying Tension': [
        "Nitpicking", "CriticalQuestion", "DirectNo",
        "Irrelevance", "Alternative", "RephraseAttack",
    ],
}

In [None]:
# Label names in alphabetical order. The stacked feature/label columns further
# down are generated from this list, so its order fixes the column order.
t_name = [
    'Aggressive', 'AgreeBut', 'AgreeToDisagree', 'Alternative', 'Answer',
    'AttackValidity', 'BAD', 'Clarification', 'Complaint', 'Convergence',
    'CounterArgument', 'CriticalQuestion', 'DirectNo', 'DoubleVoicing',
    'Extension', 'Irrelevance', 'Moderation', 'NegTransformation',
    'Nitpicking', 'NoReasonDisagreement', 'Personal', 'Positive',
    'Repetition', 'RephraseAttack', 'RequestClarification', 'Ridicule',
    'Sarcasm', 'Softening', 'Sources', 'ViableTransformation', 'WQualifiers',
]

In [None]:
def func(x):
  """Parse a stringified numeric vector into a flat list of floats.

  Square brackets and commas are treated as separators and the remainder is
  split on any run of whitespace (spaces, tabs, newlines). This accepts both
  space-separated text such as '[0.1 0.2\\n 0.3]' and comma-separated text
  such as '[0.1, 0.2, 0.3]'. Nested brackets are flattened.

  Args:
    x: string holding the bracketed numbers.

  Returns:
    list of float, in the order the numbers appear; [] when there are none.

  Raises:
    ValueError: if a token cannot be converted by float().
  """
  # Turn every delimiter into whitespace so one split() tokenises everything;
  # this also keeps numbers separated only by a newline from being merged.
  for delimiter in ('[', ']', ','):
    x = x.replace(delimiter, ' ')
  return [float(token) for token in x.split()]

In [None]:
def convert(df):
  """Parse the 'prob-predict' and 'true-label' columns of `df` with func().

  The frame is modified in place (each of the two columns is overwritten with
  the value func() returns for every cell) and the same frame is returned.
  """
  for column in ('prob-predict', 'true-label'):
    df[column] = df[column].apply(func)
  return df

In [None]:
# Read the train/test prediction files of each base model and parse their
# vector columns with convert().
train_albert, test_albert = (
  convert(pd.read_csv(path)) for path in ('train_albert.csv', 'test_albert.csv')
)

train_roberta, test_roberta = (
  convert(pd.read_csv(path)) for path in ('train_roberta.csv', 'test_roberta.csv')
)

In [None]:
# Column layout of the stacked frames: one '<label>_albert' column per label,
# then one '<label>_roberta' column per label, then the labels themselves.
_c = [f'{x}_albert' for x in t_name] + [f'{x}_roberta' for x in t_name] + t_name


def _stack_rows(albert_df, roberta_df):
  """Build one flat row per example: albert scores + roberta scores + labels.

  Rows are paired by position. The 'true-label' values are taken from
  `albert_df`.

  Args:
    albert_df: frame with list-valued 'prob-predict' and 'true-label' columns.
    roberta_df: frame with a list-valued 'prob-predict' column.

  Returns:
    list of lists, one per row, each the concatenation of the three vectors.

  Raises:
    AssertionError: if the two frames do not have the same number of rows.
  """
  # Pairing is positional, so a length mismatch would silently misalign or
  # drop examples; fail loudly instead.
  assert len(albert_df) == len(roberta_df), (
      f'row count mismatch: {len(albert_df)} vs {len(roberta_df)}')
  # Iterating the Series (rather than indexing with [idx]) pairs rows by
  # position and does not depend on the frames having a 0..n-1 index.
  return [
    albert_scores + roberta_scores + labels
    for albert_scores, roberta_scores, labels in zip(
        albert_df['prob-predict'], roberta_df['prob-predict'], albert_df['true-label'])
  ]


train_rows = _stack_rows(train_albert, train_roberta)
test_rows = _stack_rows(test_albert, test_roberta)

train_df = pd.DataFrame(train_rows, columns = _c)
test_df = pd.DataFrame(test_rows, columns = _c)

In [None]:
# Inputs: every '<label>_albert' column followed by every '<label>_roberta'
# column. Targets: one column per label in t_name.
X = train_df[[f'{x}_{model}' for model in ('albert', 'roberta') for x in t_name]]
y = train_df[t_name]

# One XGBoost classifier per label column.
# Alternatives previously tried in this cell (kept for reference):
#   MultiOutputClassifier(LogisticRegression(solver='lbfgs', max_iter=1000))
#   MultiOutputClassifier(RandomForestClassifier(bootstrap=True, max_depth=70, max_features='auto',
#                                                min_samples_leaf=4, min_samples_split=10, n_estimators=400))
clf = MultiOutputClassifier(xgb.XGBClassifier())
clf.fit(X, y)

MultiOutputClassifier(estimator=XGBClassifier())

In [None]:
# Show the feature importances of the first fitted per-label estimator.
first_label_clf = clf.estimators_[0]
first_label_clf.feature_importances_

array([0.04571972, 0.02154673, 0.01086542, 0.01515489, 0.        ,
       0.01974204, 0.01811899, 0.01415925, 0.02772407, 0.00970944,
       0.01349302, 0.        , 0.01721682, 0.01113943, 0.01861747,
       0.01654915, 0.00437542, 0.0132073 , 0.01898925, 0.01858661,
       0.01681464, 0.01381984, 0.01056313, 0.01220473, 0.01815817,
       0.01571908, 0.01721885, 0.01442042, 0.01036456, 0.00185069,
       0.0151953 , 0.07898295, 0.01128139, 0.01330814, 0.01413496,
       0.01751835, 0.0149625 , 0.01585782, 0.01386907, 0.022943  ,
       0.01793547, 0.02504605, 0.01549398, 0.02115137, 0.02051779,
       0.01721355, 0.01518065, 0.0117052 , 0.01598461, 0.03311272,
       0.01231526, 0.01938961, 0.01217069, 0.01237457, 0.01465809,
       0.01245774, 0.00899449, 0.00785571, 0.00991169, 0.01304277,
       0.0121067 , 0.00727872], dtype=float32)

In [None]:
# For every per-label estimator, record its importance vector plus the names of
# its single most important feature and of its two most important features.
feat_impts = []
top_1_dict, top_2_dict = {}, {}
for label, estimator in zip(t_name, clf.estimators_):
  importances = estimator.feature_importances_
  feat_impts.append(importances)
  top_1_dict[label] = train_df.columns[np.argmax(importances)]
  # argpartition puts the two largest entries in the last two slots
  # (their relative order is not guaranteed).
  top_two = np.argpartition(importances, -2)[-2:]
  top_2_dict[label] = [train_df.columns[i] for i in top_two]

# Importance of each feature averaged over all per-label estimators.
a = np.mean(feat_impts, axis=0)

In [None]:
# One empty result list per category name.
res_dict = {category: [] for category in categories}

In [None]:
# For each label, take the part after '_' in its top-1 feature name (the model
# suffix), print it, and file it under the label's category.
for category, labels in categories.items():
  for label in labels:
    model_name = top_1_dict[label].split('_')[1]
    print(model_name)
    res_dict[category].append(model_name)

In [None]:
# Per category: how often each model suffix supplied the top-1 feature.
for category, model_names in res_dict.items():
  print(category)
  print(Counter(model_names))

Promotingdiscussion
Counter({'albert': 6, 'roberta': 3})
Low responsiveness
Counter({'albert': 4, 'roberta': 2})
Tone & Style
Counter({'albert': 4, 'roberta': 2})
Easing tension
Counter({'roberta': 2, 'albert': 2})
Intensifying Tension
Counter({'roberta': 4, 'albert': 2})


In [None]:
# List the labels whose two most important features include at least one
# feature derived from a different label (the part before '_' is the label name).
# any() makes sure each label is printed once, even when both features mismatch.
for label, top_features in top_2_dict.items():
  if any(feature.split('_')[0] != label for feature in top_features):
    print(label, top_features)

Alternative ['RequestClarification_roberta', 'Alternative_albert']
DoubleVoicing ['NegTransformation_albert', 'DoubleVoicing_albert']
Nitpicking ['Irrelevance_roberta', 'Nitpicking_roberta']
Repetition ['Complaint_roberta', 'Repetition_albert']
ViableTransformation ['Sarcasm_roberta', 'ViableTransformation_albert']


In [None]:
# Predict on the held-out test frame: these predictions are later compared with
# test_roberta['true-label'], so they must come from test rows, not train rows.
preds = clf.predict(test_df[[f'{x}_albert' for x in t_name] + [f'{x}_roberta' for x in t_name]])

In [None]:
# Empty frame; the prediction and gold-label columns are attached in the next cell.
s_df = pd.DataFrame()

In [None]:
# Store one prediction vector per row, next to the gold label vectors.
# NOTE(review): `preds` comes from clf.predict, so despite the column name these
# look like hard label predictions rather than probabilities - confirm.
s_df['prob-predict'] = list(map(list, preds))
s_df['true-label'] = test_roberta['true-label']

In [None]:
# Binarise the stored predictions at each threshold and report per-label metrics.
for threshold in [0.6]:
  print(f'Threshold: {threshold}')
  # label index -> list of gold values / binarised predictions, one per row.
  label_dict = {}
  pred_dict = {}
  # Loop variables are deliberately NOT called `preds`: rebinding that name here
  # would clobber the notebook-level prediction array used to build s_df.
  for row_preds, row_labels in zip(s_df['prob-predict'], s_df['true-label']):
    for idx, score in enumerate(row_preds):
      pred_dict.setdefault(idx, []).append(1 if score > threshold else 0)
      label_dict.setdefault(idx, []).append(row_labels[idx])
  # Transpose from (label, row) to the (row, label) layout expected by sklearn.
  y_true = np.array([label_dict[key] for key in label_dict]).T
  y_pred = np.array([pred_dict[key] for key in pred_dict]).T
  print(classification_report(y_true, y_pred, target_names=t_name))
  # Same report as a dict, kept for programmatic access.
  d = classification_report(y_true, y_pred, target_names=t_name, output_dict=True)