In [1]:
import pandas as pd
import numpy as np
import torch
import re
import os
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from tqdm.autonotebook import tqdm

In [2]:
# Report the total memory (in GB) of CUDA device 0. Guarded so the notebook
# still runs top-to-bottom on a CPU-only runtime instead of raising here.
if torch.cuda.is_available():
    total_gpu_memory = torch.cuda.get_device_properties(0).total_memory
    print('Total Memory: {}'.format(total_gpu_memory / 1e9))
else:
    print('No CUDA device available')

Total Memory: 15.812263936


In [3]:
!pip install simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/77/c1/e34ae3bc246fb35b92c7f052705ffa88ec70bb998e9774a2dcff00465dad/simpletransformers-0.40.2-py3-none-any.whl (190kB)
[K     |████████████████████████████████| 194kB 9.3MB/s 
[?25hCollecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/6b/15/1c026f3aeafd26db30cb633d9915aae666a415179afa5943263e5dbd55a6/tokenizers-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 9.3MB/s 
[?25hCollecting transformers>=2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 52.5MB/s 
Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Collecting tensorboardx
[?25l  D

In [4]:
# !pip install transformers==2.10.0

In [5]:
# !pip install simpletransformers==0.26.0

In [6]:
from simpletransformers.ner import NERModel
from simpletransformers.classification import ClassificationModel

In [7]:
class PipelineModel:
  """Evaluate a two-stage pipeline: span identification, then span classification.

  The constructor prepares the three input frames and loads both models.
  `process` evaluates the span identification model on the test frame, cuts
  the sentences into the predicted spans, classifies those spans and writes
  the per-token results to 'pipeline_results.csv'.

  Every input frame is read through its `sentence_id`, `words` and `codes`
  columns; the test frame is additionally read through `labels` and `country`.
  """

  # Domains (code // 100) that are always present as keys in `domain_label`'s result.
  _KNOWN_DOMAINS = (0, 1, 2, 3, 4, 5, 6, 7, 9)
  # Domain assigned to a predicted label whose code is unknown.
  _UNKNOWN_DOMAIN = -1
  # Batch size shared by both models for training and evaluation.
  _BATCH_SIZE = 32

  def __init__(self, train, test, val, ner_type, ner_dir, class_type, class_dir, num_labels=46):
    """Prepare the data frames and load both models.

    Args:
      train, test, val: DataFrames with `sentence_id`, `words` and `codes` columns.
      ner_type, ner_dir: model type and directory passed to `NERModel`.
      class_type, class_dir: model type and directory passed to `ClassificationModel`.
      num_labels: number of classes of the classification model (default 46).
    """
    print("Treating Input DataFrames")
    self.train_df, self.train_codes = self._treat_dataframe(train)
    self.test_df, self.test_codes = self._treat_dataframe(test)
    self.eval_df, self.eval_codes = self._treat_dataframe(val)

    self.id_type, self.class_type = ner_type, class_type
    self.id_dir, self.class_dir = ner_dir, class_dir
    self.num_labels = num_labels

    print("Building Span Identification Model")
    self.id_model = self._build_span_model()

    # id_model outputs
    self.id_results, self.id_preds = None, None
    self.id_df = None

    # identified spans
    self.perfect_spans, self.all_spans = None, None

    print("Building Span Classification Model")
    self.class_model = self._build_class_model()

    # class_model outputs
    self.pclass_results, self.aclass_results = None, None

    self.pmodel_outputs, self.amodel_outputs = None, None

    self.final_df = None

  def process(self):
    """Run both evaluation stages on the test frame and write 'pipeline_results.csv'."""
    # Create necessary file in local directory
    os.makedirs(os.path.dirname("outputs/eval_results.txt"), exist_ok=True)
    with open("outputs/eval_results.txt", "w") as f:
      f.write("")

    # SPAN IDENTIFICATION
    print("SPAN IDENTIFICATION")
    self.id_results, _, self.id_preds = self.id_model.eval_model(self.test_df, verbose=True)
    # Older code stored the result under `id_result`; keep that name readable too.
    self.id_result = self.id_results
    self.id_df = self._build_id_df()
    self.perfect_spans, self.all_spans = self._process_span_id()

    print('\nSpan Identification macro-averaged F-Score: {}\n'.format(self.f_macro(self.id_df['true'], self.id_df['preds'])))

    # SPAN CLASSIFICATION
    print("SPAN CLASSIFICATION\n")
    le = LabelEncoder().fit(self.test_codes)
    self.perfect_spans['labels'] = le.transform(self.perfect_spans['code'])
    self.all_spans['labels'] = le.transform(self.all_spans['code'])
    # Full label -> code table, so domain metrics on a subset can still resolve
    # a predicted label that does not occur in that subset.
    label_to_code = dict(enumerate(le.classes_))

    # General Span Classification
    print("General Span Classification")
    self.aclass_results, self.amodel_outputs = self._evaluate_spans("All", self.all_spans, label_to_code)

    # NOTE(review): country code 'S' is reported as "Italian"; confirm the name matches the data.
    for name, country in (("Portuguese", 'P'), ("Brazilian", 'B'), ("Italian", 'S')):
      subset = self.all_spans[self.all_spans.country == country]
      if subset.empty:
        print("No {} spans identified; skipping.".format(name))
        continue
      self._evaluate_spans(name, subset, label_to_code)

    self.all_spans['preds'] = le.inverse_transform(self._predicted_labels(self.amodel_outputs))

    self.final_df = self._build_final()
    self.final_df.to_csv('pipeline_results.csv', index=False)

  def _evaluate_spans(self, name, spans, label_to_code):
    """Evaluate the classifier on `spans`, print category and domain metrics.

    Returns the `(results, model_outputs)` pair produced by `eval_model`.
    """
    # Only the text and its label are handed to the classifier.
    eval_df = spans[['text', 'labels']]
    results, outputs, _ = self.class_model.eval_model(eval_df, acc=metrics.accuracy_score, f1M=self.f_macro)
    print("Classification of {} Identified Spans:\n[Categories]\n{}\n[Domains]\n{}".format(
        name, results, self._process_span_class(spans, outputs, label_to_code)))
    return results, outputs

  def _build_span_model(self):
    """Load the span identification (`NERModel`) with B/I/O labels."""
    return NERModel(
        self.id_type,
        self.id_dir,
        labels=['B', 'I', 'O'],
        args={'train_batch_size': self._BATCH_SIZE,
              'eval_batch_size': self._BATCH_SIZE,
              'num_train_epochs': 2,
              'max_seq_length': 200,
              'save_steps': 0,
              'evaluate_during_training': True,
              # At least 1 so a tiny training frame cannot yield a 0-step interval.
              'evaluate_during_training_steps': max(1, self.train_df.shape[0] // self._BATCH_SIZE),
              'evaluate_during_training_verbose': True,
              'fp16': False,
              'overwrite_output_dir': True,
              'reprocess_input_data': True,
              'learning_rate': 2e-5,
              'manual_seed': 42
              }
    )

  def _build_class_model(self):
    """Load the span classification model (`ClassificationModel`)."""
    return ClassificationModel(
        self.class_type,
        self.class_dir,
        num_labels=self.num_labels,
        args={'train_batch_size': self._BATCH_SIZE,
              'eval_batch_size': self._BATCH_SIZE,
              'num_train_epochs': 4,
              'max_seq_length': 200,
              'save_steps': 0,
              'evaluate_during_training': True,
              'evaluate_during_training_steps': 4422,
              'evaluate_during_training_verbose': True,
              'fp16': False,
              'overwrite_output_dir': True,
              'reprocess_input_data': True,
              'learning_rate': 2e-5,
              'manual_seed': 42
              })

  def _build_id_df(self):
    """Build a per-token frame joining the test data with the predicted tags.

    Returns a DataFrame with columns sid, words, true, preds, codes, country,
    with rows ordered sentence by sentence in groupby order.
    """
    columns = {"sid": [], "words": [], "true": [], "preds": [], "codes": [], "country": []}

    for i, sid in self.test_df.groupby(by='sentence_id'):
      n_tokens = sid.shape[0]
      preds = list(self.id_preds[i])
      if len(preds) < n_tokens:
        # Fewer predictions than tokens: repeat the last predicted tag
        # ('O' when nothing was predicted) so every token has one.
        filler = preds[-1] if preds else 'O'
        preds.extend([filler] * (n_tokens - len(preds)))
        self.id_preds[i] = preds
      elif len(preds) > n_tokens:
        preds = preds[:n_tokens]

      columns["sid"].extend(sid.sentence_id.tolist())
      columns["words"].extend(sid.words.tolist())
      columns["true"].extend(sid.labels.tolist())
      columns["preds"].extend(preds)
      # Take codes/country from the rows of this very group, so they stay
      # attached to their tokens even when the file order differs from the
      # groupby order.
      columns["codes"].extend(self.test_codes.loc[sid.index].tolist())
      columns["country"].extend(sid.country.tolist())

    return pd.DataFrame(columns)

  def _build_final(self):
    """Attach the predicted code of each predicted span to its tokens.

    Returns a DataFrame with one row per token that falls inside a predicted span.
    """
    pred_codes = self.all_spans.preds.to_list()

    tot_spans = []
    span_id = 0

    # Spans are visited in the same order `_process_span_id` used to build
    # `all_spans`, so `span_id` indexes `pred_codes` directly.
    for _, sent_df in self.id_df.groupby(by='sid'):
      spans = self.get_spans(sent_df.preds, sent_df.codes)
      for start, end in zip(spans, spans[1:]):
        rows = sent_df.iloc[start:end].values.tolist()
        for row in rows:
          row.append(pred_codes[span_id])
        tot_spans.extend(rows)
        span_id += 1
    return pd.DataFrame(tot_spans, columns=['sid', 'words', 'true_id', 'pred_id', 'true_code', 'country', 'pred_code'])

  def _process_span_id(self):
    """Cut every sentence into spans and collect them.

    Returns `(perfect, all_spans)`: two DataFrames with columns sid, text,
    code, country. `all_spans` holds every predicted span; `perfect` holds the
    true spans whose boundaries and tags were predicted exactly.
    """
    perfect = []
    all_spans = []
    for sentence_id, sent_df in tqdm(self.id_df.groupby(by='sid')):
      sent_dict = sent_df.to_dict(orient='list')

      true_spans, pred_spans = [self.get_spans(sent_dict[bio], sent_dict["codes"]) for bio in ["true", "preds"]]
      pred_bounds = list(zip(pred_spans, pred_spans[1:]))
      pred_lookup = set(pred_bounds)

      for start, end in pred_bounds:
        all_spans.append(self._span_record(sentence_id, sent_dict, sent_df, start, end))

      for start, end in zip(true_spans, true_spans[1:]):
        if (start, end) in pred_lookup and self.perfect_span(sent_dict, start, end):
          perfect.append(self._span_record(sentence_id, sent_dict, sent_df, start, end))

    span_columns = ['sid', 'text', 'code', 'country']
    return pd.DataFrame(perfect, columns=span_columns), pd.DataFrame(all_spans, columns=span_columns)

  @staticmethod
  def _span_record(sentence_id, sent_dict, sent_df, start, end):
    """Return `[sid, text, code, country]` for tokens `start:end` of one sentence.

    `code` and `country` are the most frequent values among the span's tokens.
    """
    span_rows = sent_df.iloc[start:end]
    country_counts = span_rows.country.value_counts()
    return [sentence_id,
            " ".join(sent_dict["words"][start:end]),
            span_rows.codes.value_counts().keys()[0],
            # No usable country (all missing) -> None rather than an IndexError.
            country_counts.index[0] if len(country_counts) else None]

  def _process_span_class(self, test, model_outputs, label_to_code=None):
    """Compute domain-level metrics, where a domain is `code // 100`.

    Args:
      test: DataFrame with `labels` and `code` columns (the true values).
      model_outputs: one row of logits per row of `test`.
      label_to_code: optional full label -> code mapping, used to resolve
        predicted labels that do not occur in `test`.

    Returns the dict produced by `classification_metrics`.
    """
    mapping = dict(label_to_code) if label_to_code else {}
    mapping.update(zip(test.labels, test.code))
    label_to_domain = {label: code // 100 for label, code in mapping.items()}

    true_domains = [label_to_domain[label] for label in test.labels]
    # A prediction with no known code still counts (as a miss) so both lists
    # always have the same length.
    preds_domain = [label_to_domain.get(label, self._UNKNOWN_DOMAIN)
                    for label in self._predicted_labels(model_outputs)]

    return self.classification_metrics(true_domains, preds_domain)

  def _treat_dataframe(self, df):
    """Return `(frame, codes)`: a prepared copy of `df` and its `codes` column.

    The copy has `sentence_id` re-encoded by `enc`, `codes` removed and `words`
    cast to str. The caller's frame is left untouched, so constructing the
    pipeline twice from the same frames yields the same encoding.
    """
    df = df.assign(sentence_id=self.enc(df.sentence_id))
    df, df_codes = self.drop_codes(df)
    df = df.assign(words=df.words.apply(str))
    return df, df_codes

  def classification_metrics(self, true, preds):
    """Return accuracy, macro F-score and Matthews correlation as a dict."""
    return {
        "acc": metrics.accuracy_score(true, preds),
        "f_score": self.f_macro(true, preds),
        "mcc": metrics.matthews_corrcoef(true, preds)
    }

  @staticmethod
  def _predicted_labels(model_outputs):
    """Return the index of the largest logit of each row of `model_outputs`."""
    # softmax is monotonic, so the argmax of the raw logits is the same label.
    return [np.argmax(logits) for logits in model_outputs]

  @staticmethod
  def enc(sid):
    """Encode sentence ids (compared as strings) with a `LabelEncoder`."""
    le = LabelEncoder()
    sid = [str(id_) for id_ in sid]
    return le.fit_transform(sid)

  @staticmethod
  def drop_codes(df):
    """Return `(df without the 'codes' column, the 'codes' column)`."""
    return df.drop(['codes'], axis=1), df.codes

  @staticmethod
  def get_spans(labels, codes):
    """Return the span boundaries of one sentence as a list of indices.

    A boundary is placed at every 'B' tag and at every 'O' tag whose code
    differs from the previous token's code; `len(labels)` closes the list.
    Consecutive boundaries delimit one span.
    """
    limits = []
    prev = ""
    for idx, (lbl, code) in enumerate(zip(labels, codes)):
      if lbl == 'B' or (prev != code and lbl == 'O'):
        limits.append(idx)
      prev = code
    limits.append(len(labels))
    return limits

  @staticmethod
  def perfect_span(sent_dict, start, end):
    """Return True when true and predicted tags agree on tokens `start:end`."""
    return sent_dict["true"][start:end] == sent_dict["preds"][start:end]

  @staticmethod
  def rebuild_span(words):
    """Join `words` with spaces, removing the space after '(' and before punctuation."""
    text = " ".join([str(word) for word in words])
    text = re.sub("([(]) ", r"\1", text)
    return re.sub(" ([.,:;!?')])", r"\1", text)

  @staticmethod
  def f_macro(true, preds):
    """Return the macro-averaged F1 score."""
    return metrics.f1_score(true, preds, average='macro')

  @staticmethod
  def softmax(logits):
    """Return the softmax of `logits`, normalised along axis 0."""
    logits = np.asarray(logits, dtype=float)
    if logits.size == 0:
      return logits
    # Subtracting the maximum leaves the result unchanged but keeps np.exp
    # from overflowing to inf (which would turn the output into NaN).
    exps = np.exp(logits - np.max(logits, axis=0))
    return exps / np.sum(exps, axis=0)

  @staticmethod
  def domain_label(label_to_code):
    """Group labels by domain (`code // 100`).

    Returns a dict domain -> list of labels. The domains in `_KNOWN_DOMAINS`
    are always present; any other domain is added when a code falls in it.
    """
    domain_to_label = {domain: [] for domain in PipelineModel._KNOWN_DOMAINS}
    for label, code in label_to_code.items():
      domain_to_label.setdefault(code // 100, []).append(label)
    return domain_to_label

In [8]:
# Load the three NER splits (test, train, validation) from Drive, in that order.
test, train, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/My Drive/data-ner/test_ner.csv',
        'drive/My Drive/data-ner/train_ner.csv',
        'drive/My Drive/data-ner/val_ner.csv',
    )
)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Load the classification test split, encode its codes as integer labels,
# keep only the text and its label, and preview the first rows.
le = LabelEncoder()
t = pd.read_csv('drive/My Drive/data-class/test.csv', names=['text', 'Code'], header=0)
t = t.assign(labels=le.fit_transform(t.Code)).loc[:, ["text", "labels"]]
t.head(5)

Unnamed: 0,text,labels
0,La ministra colombiana de Exteriores considera...,37
1,DEPORTES,31
2,É enorme a importância dessas atividades no de...,31
3,"Com o fim da guerra fria, iniciou-se um novo p...",5
4,Revelaram-se com inapagável nitidez novas teia...,13


In [10]:
# test.shape

In [11]:
# test = pd.concat([test, val]).reset_index(drop=True)

In [12]:
# test.shape

In [13]:
# Build the pipeline from the NER splits and the two fine-tuned BERT checkpoints.
model = PipelineModel(
    train=train,
    test=test,
    val=val,
    ner_type='bert',
    ner_dir='drive/My Drive/bert-ner/',
    class_type='bert',
    class_dir='drive/My Drive/bert-class/',
)

Treating Input DataFrames
Building Span Identification Model
Building Span Classification Model


In [None]:
# Run the evaluation: span identification on the test set, then classification
# of the identified spans; the combined results are written to 'pipeline_results.csv'.
model.process()

SPAN IDENTIFICATION


In [None]:
# pred_codes = model.all_spans.preds.to_list()
# len(pred_codes)

In [None]:
# tot_spans = []
# span_id = 0
# for _, sent_df in model.id_df.groupby(by='sid'):
#   spans = model.get_spans(sent_df.preds, sent_df.codes)
#   for start, end in zip(spans, spans[1:]):
#     df_list = sent_df[start:end].values.tolist()
#     for sublist in df_list:
#       sublist.append(pred_codes[span_id])
#     tot_spans.extend(df_list)
#     span_id += 1
# len(tot_spans)

In [None]:
# df = pd.DataFrame(tot_spans, columns=['sid', 'words', 'true_id', 'pred_id', 'true_code', 'country', 'pred_code'])

In [None]:
# df.to_csv('pipeline.csv', index=False)