In [None]:
import pandas as pd
import numpy as np
import torch
import nltk
import re
import os
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from tqdm.autonotebook import tqdm
from nltk import word_tokenize, sent_tokenize

In [None]:
# Total memory of CUDA device 0, reported in units of 1e9 bytes.
# Requires a visible CUDA device; index 0 is hard-coded.
t = torch.cuda.get_device_properties(0).total_memory
print('Total Memory: {}'.format(t / 1e9))

Total Memory: 15.812263936


In [None]:
torch.cuda.get_device_properties(0)

_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15079MB, multi_processor_count=40)

In [None]:
%%capture
!pip install simpletransformers==0.49.3

In [None]:
# !pip install transformers==2.10.0

In [None]:
# !pip install simpletransformers==0.26.0

In [None]:
from google.colab import output

In [None]:
from transformers import BertTokenizerFast

In [None]:
from simpletransformers.ner import NERModel
from simpletransformers.classification import ClassificationModel

In [None]:
%%capture
!pip install -U kaleido
!pip install plotly==4.11.0

In [None]:
import plotly.graph_objects as go
import plotly.express as px

In [None]:
import plotly
plotly.__version__

'4.11.0'

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class Report:
  """Persist span predictions as a CSV dump plus a plain-text summary.

  Instantiating the class immediately writes ``<dirname><name>.csv`` and
  ``<dirname><name>.txt`` (``dirname`` and ``name`` are joined by plain string
  concatenation, so ``dirname`` must already end with a path separator).
  """

  # Code values counted on the 'Non Relevant Spans' line of the summary.
  _NON_RELEVANT = ('0', '000', '999', 0, 999)

  def __init__(self, sentences, pred_data, name, dirname):
    """
    :param sentences: Value printed on the 'Sentences' line of the summary.
    :param pred_data: DataFrame with a ``code`` column, one row per span.
    :param name: Base name (no extension) shared by both output files.
    :param dirname: Prefix prepended verbatim to ``name``.
    """
    self.num_sentences = sentences
    self.pred_data = pred_data
    self.name = name
    self.dirname = dirname

    # _build_report has no return statement, so this attribute is always None.
    self.report = self._build_report()

  def _summary_lines(self):
    """Yield the pieces of the text summary in the order they are written."""
    codes = self.pred_data['code']
    yield 'File name: {}\n'.format(self.name)
    yield 'Sentences: {}\n'.format(self.num_sentences)
    yield 'Spans: {}\n'.format(len(codes))
    yield 'Non Relevant Spans: {}\n'.format(
        sum(1 for code in self.pred_data['code'] if code in self._NON_RELEVANT))
    yield '\n=== Response ===\n\n'
    # Per-code frequency table, as rendered by pandas.
    yield self.pred_data.code.value_counts().to_string()

  def _build_report(self):
    """Write the CSV dump first, then the text summary."""
    base_path = self.dirname + self.name

    self.pred_data.to_csv(base_path + '.csv', index=False)

    with open(base_path + '.txt', encoding='utf-8', mode='w') as file:
      # The generator is consumed lazily, so pieces are written one by one.
      file.writelines(self._summary_lines())

In [None]:
class Visual:
  """Horizontal bar chart of predicted categories, one trace per domain.

  The constructor writes ``<dirname>results.csv`` and joins the predictions with
  the human-readable labels; ``draw`` renders and saves ``<dirname><title>.png``.
  """

  def __init__(self, data, man_labels, title, dirname):
    """
    :param data: DataFrame of predictions with a ``code`` column.
    :param man_labels: DataFrame with ``Topic`` and ``code`` columns.
    :param title: Figure title, also used as the PNG file name.
    :param dirname: Output prefix, concatenated verbatim with file names.
    """
    self.title = title
    self.dirname = dirname

    # Remap code 0 to 999 on a derived frame so the caller's DataFrame is
    # not modified as a side effect of building a chart.
    data = data.assign(code=data.code.replace(0, 999))
    data.to_csv(dirname + 'results.csv', index=False)
    # Inner join on the stringified code: predictions whose code has no entry
    # in man_labels are dropped from the chart.
    self.verbose_preds = data.astype(str).merge(man_labels.astype(str), on='code')
    # The domain is the first character of the code string.
    self.verbose_preds['domain'] = self.verbose_preds.code.apply(lambda c: str(c)[0])

    self.domain_verbose = {
        0: 'General',
        1: 'External Relations',
        2: 'Freedom and Democracy',
        3: 'Political System',
        4: 'Economy',
        5: 'Welfare and Quality of Life',
        6: 'Fabric of Society',
        7: 'Social Groups',
        9: 'Non-Relevant Span'
    }

  def get_domains(self, topics):
    """Return the integer domain of each topic name in ``topics``."""
    return [int(self.verbose_preds[self.verbose_preds.Topic == t].domain.unique()[0]) for t in topics]

  def get_codes(self, topics):
    """Return the integer code of each topic name in ``topics``."""
    # The merged frame has a 'code' column; there is no 'topic' column.
    return [int(self.verbose_preds[self.verbose_preds.Topic == t].code.unique()[0]) for t in topics]

  def get_colors(self, topics):
    """Return one colour per topic, picked by domain from the Plotly palette."""
    return [px.colors.qualitative.Plotly[dom] for dom in self.get_domains(topics)]

  def draw(self, show=False):
    """Build the bar chart, save it as a PNG and optionally display it.

    :param show: When true, also call ``fig.show()`` after saving.
    """
    y, x = np.unique(self.verbose_preds.Topic, return_counts=True)

    # Resolve every topic's domain once instead of once per domain.
    topic_domains = self.get_domains(y)

    domain_indices = {}

    for domain in self.verbose_preds.domain.unique():
      domain_indices[int(domain)] = [i for i, dom in enumerate(topic_domains) if dom == int(domain)]

    fig = go.Figure()

    for domain, indices in domain_indices.items():

      x_d = [x[idx] for idx in indices]
      y_d = [y[idx] for idx in indices]

      fig.add_trace(
          go.Bar(
              x=x_d,
              y=y_d,
              marker_color=px.colors.qualitative.Vivid[domain],
              name=self.domain_verbose[domain],
              orientation='h'
          )
      )

    fig.update_layout(
        title=self.title,
        title_font_size=20,
        xaxis_title='Occurrences',
        xaxis_title_font_size=16,
        yaxis_title_font_size=16,
        yaxis_title='MAN Category',
        yaxis_tickmode='linear',
        legend=dict(
            orientation="v",
            y=0.5,
            x=1
        ),
        autosize=False,
        width=1600,
        height=800,
    )

    fig.write_image(self.dirname + self.title + '.png')

    if show:
      fig.show()
  

In [None]:
class PipelineModel:
  def __init__(self, test, ner_type, ner_dir, class_type, class_dir):
    """Prepare the test frame and load both models.

    :param test: DataFrame handed to ``_treat_dataframe``.
    :param ner_type: First positional argument of the span model (``NERModel``).
    :param ner_dir: Second positional argument of the span model.
    :param class_type: First positional argument of the ``ClassificationModel``.
    :param class_dir: Second positional argument of the ``ClassificationModel``.
    """
    print("Treating Input DataFrames")
    treated = self._treat_dataframe(test)
    self.test_df = treated[0]
    self.test_codes = treated[1]

    self.id_type = ner_type
    self.class_type = class_type
    self.id_dir = ner_dir
    self.class_dir = class_dir

    # Classifier label index -> category code; the position in this list is
    # the label index.
    codes_by_label = [
        0,
        101, 103, 104, 106, 107, 108,
        201, 202, 203,
        301, 302, 303, 304, 305,
        401, 402, 403, 404, 405, 406, 408, 409, 410, 411, 412, 413, 414, 415, 416,
        501, 502, 503, 504, 506,
        601, 603, 605, 606, 607,
        701, 703, 704, 705, 706,
        999,
    ]
    self.class_map = dict(enumerate(codes_by_label))

    print("Building Span Identification Model")
    self.id_model = self._build_span_model()

    # Span-identification outputs, filled in later.
    self.id_results = None
    self.id_preds = None
    self.id_df = None

    # Identified spans.
    self.all_spans = None

    print("Building Span Classification Model")
    self.class_model = self._build_class_model()

    # Span-classification outputs, filled in later.
    self.pclass_results = None
    self.aclass_results = None
    self.pmodel_outputs = None
    self.amodel_outputs = None

    self.final_df = None

    # Prediction placeholders and the tokenizer used to measure sentence length.
    self.pred_id = None
    self.pred_class = None
    self.bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')


  def process(self):
    """Evaluate both pipeline stages on the test frame and save the result.

    Runs the span-identification model on ``self.test_df``, rebuilds spans
    from its predictions, classifies those spans, and writes the per-token
    outcome to ``pipeline_results.csv`` in the working directory.
    """
    # Create necessary file in local directory
    os.makedirs(os.path.dirname("outputs/eval_results.txt"), exist_ok=True)
    with open("outputs/eval_results.txt", "w") as f:
      f.write("")

    # SPAN IDENTIFICATION
    print("SPAN IDENTIFICATION")
    id_result, _, self.id_preds = self.id_model.eval_model(self.test_df, verbose=True)
    # __init__ declares `id_results`; the singular `id_result` used to be the
    # only attribute set here, leaving `id_results` at None. Keep both names.
    self.id_results = id_result
    self.id_result = id_result
    self.id_df = self._build_id_df()
    self.all_spans = self._process_span_id()

    print('\nSpan Identification macro-averaged F-Score: {}\n'.format(self.f_macro(self.id_df['true'], self.id_df['preds'])))

    # SPAN CLASSIFICATION
    print("SPAN CLASSIFICATION\n")
    # NOTE(review): the encoder is fitted on the codes present in the test
    # set; this presumably matches `self.class_map` only when every code
    # occurs in the test data - confirm.
    le = LabelEncoder().fit(self.test_codes)
    self.all_spans['labels'] = le.transform(self.all_spans['code'])

    df = self.all_spans.drop(columns=['sid', 'code'])
    self.aclass_results, self.amodel_outputs, _ = self.class_model.eval_model(df, acc=metrics.accuracy_score, f1M=self.f_macro)

    # Highest-probability label per span, mapped back to a category code.
    lbls = [np.argmax(self.softmax(logits)) for logits in self.amodel_outputs]
    self.all_spans['preds'] = [self.class_map[lbl] for lbl in lbls]

    self.final_df = self._build_final()
    self.final_df.to_csv('pipeline_results.csv', index=False)

  def predict(self, _dir):
    """Predict spans and codes for every ``.txt`` article in ``_dir`` and chart them.

    Results are written under ``prediction_results/<author>/`` by ``Visual``.

    :param _dir: Directory prefix ending with ``/``; the author name is the
                 last path component before that trailing slash.
    :raises ValueError: If no ``.txt`` file is found or none contains text.
    """
    # Sort so the article index assigned to each file is reproducible.
    article_list = sorted(glob.glob(_dir + '*.txt'))

    if not article_list:
      raise ValueError('No .txt articles found in {}'.format(_dir))

    author_name = _dir.split('/')[-2]

    directory = 'prediction_results/' + author_name + '/'
    os.makedirs(directory, exist_ok=True)

    man_labels = pd.read_csv('drive/My Drive/data-codes/final_man.csv', dtype='str', names=['Topic', 'code'], header=0).dropna()

    dataframes = []

    # Offset that keeps sentence ids unique across articles.
    last_index = 0

    for idx, filepath in enumerate(article_list):

      with open(filepath, 'r', encoding='utf-8') as file:
        data = [sent for text in map(str.strip, file.readlines()) for sent in sent_tokenize(text) if text]

      df = self._tokenize_file(data)

      if df.empty:
        # An article without tokens has no max sentence id (NaN); using it
        # as the next offset would turn every later sentence id into NaN.
        continue

      df.sentence_id += last_index
      last_index = df.sentence_id.max() + 1
      df['article'] = idx
      dataframes.append(df)

    if not dataframes:
      raise ValueError('No text found in the .txt articles of {}'.format(_dir))

    Visual(self._predict(pd.concat(dataframes).reset_index(drop=True)),
           man_labels,
           author_name,
           directory).draw()

  def _predict(self, pred_df):
    sentences = pred_df.groupby(by='sentence_id').words.apply(list).tolist()

    # article = pred_df.groupby(by='sentence_id').article.apply(lambda x: list(x)[0])

    pred_id, _ = self.id_model.predict(sentences, split_on_space=False)

    pred_spans, articles = self._process_pred_id(pred_id, pred_df.groupby(by='sentence_id').article.apply(lambda x: list(x)[0]))

    pred_class, _ = self.class_model.predict(pred_spans, multi_label=False)

    pred_class = [self.class_map[lbl] for lbl in pred_class]

    return pd.DataFrame({'span': pred_spans, 'code': pred_class, 'article': articles})


  def _tokenize_file(self, data):
    """Turn sentences into a one-row-per-word DataFrame.

    Sentences longer than 200 BERT tokens are first split by
    ``_chunk_sentence``; each resulting piece gets its own ``sentence_id``.

    :param data: Iterable of sentence strings.
    :return: DataFrame with ``sentence_id`` and ``words`` columns.
    """
    tokenizer = self.bert_tokenizer

    sentences = []
    for sentence in data:
      if len(tokenizer.encode(sentence)) > 200:
        sentences.extend(self._chunk_sentence(tokenizer, sentence))
      else:
        sentences.append(sentence)

    rows = []
    for sentence_id, sentence in enumerate(sentences):
      rows.extend([sentence_id, token] for token in word_tokenize(sentence))

    # Sanity check: per-sentence tokenisation yields as many tokens as
    # tokenising all the text joined together.
    assert len(rows) == len(word_tokenize(" ".join(sentences)))

    return pd.DataFrame(rows, columns=['sentence_id', 'words'])

  @staticmethod
  def _chunk_sentence(bert, sentence):
    """
    Perform a looping action where we will iteratively obtain appropriately sized spans from text that would otherwise
    exceed the 200 BERT tokens limit. We empirically set the initial maximum amount of words at 100 as it reduces the
    need to call the BERT tokenizer.
    :param bert: BERT tokenizer
    :param sentence: Text to be chunked (already found to be >200 BERT tokens long)
    :return: List[String] containing the original sentence chunked into smaller spans of text.
    """
    # @ symbol used to ensure sentence gets chunked at least once
    sentence_chunks = ["@"]

    split_sentence = sentence.split()
    max_length = 100

    # Loop until we used all of the input text
    while split_sentence:
        # Split according to max_length of words (default = 100 words)
        tmp_chunk = split_sentence[:max_length + 1]

        # Find commas in text
        commas = [idx for idx, split in enumerate(tmp_chunk) if split[-1] == ',']

        # Check if sentence must be chunked or should we try to process it whole
        if len(split_sentence) > 100 or len(sentence_chunks) == 1:
            # Find last comma index or get 75% of the text if no comma can found
            last_comma = commas[-1] if commas else int((len(tmp_chunk) - 1) * 0.75)
        else:
            last_comma = len(tmp_chunk) - 1

        # Chunk sentence according to last comma found
        tmp_chunk = " ".join(split_sentence[:last_comma + 1])

        # Chunk is of appropriate size
        if len(bert.encode(tmp_chunk)) < 200:
            # Add chunked text to the result
            sentence_chunks.append(tmp_chunk)
            # Remove processed text
            split_sentence = split_sentence[last_comma + 1:]
            # Reset max_length
            max_length = 100
        else:
            # Shorten max_length if chunk is still too big
            max_length = last_comma - 1

    # Do not send the @ symbol (first element in list)
    return sentence_chunks[1:]

  def _build_span_model(self):
    """Instantiate the span-identification ``NERModel`` with B/I/O labels."""
    span_args = {
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'num_train_epochs': 2,
        'max_seq_length': 200,
        'save_steps': 0,
        'evaluate_during_training': True,
        'evaluate_during_training_steps': 0,
        'evaluate_during_training_verbose': True,
        'fp16': False,
        'overwrite_output_dir': True,
        'reprocess_input_data': True,
        'learning_rate': 2e-5,
        'manual_seed': 42,
    }
    return NERModel(self.id_type, self.id_dir, labels=['B', 'I', 'O'], args=span_args)
    
  def _build_class_model(self):
    """Instantiate the span-classification ``ClassificationModel`` (46 labels)."""
    class_args = {
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'num_train_epochs': 4,
        'max_seq_length': 200,
        'save_steps': 0,
        'evaluate_during_training': True,
        'evaluate_during_training_steps': 4422,
        'evaluate_during_training_verbose': True,
        'fp16': False,
        'overwrite_output_dir': True,
        'reprocess_input_data': True,
        'learning_rate': 2e-5,
        'manual_seed': 42,
    }
    return ClassificationModel(self.class_type, self.class_dir, num_labels=46, args=class_args)
    
      
  def _build_id_df(self):
    test_to_preds = {"sid": [], "words": [], "true": [], "preds": []}

    for i, sid in self.test_df.groupby(by='sentence_id'):

      if sid.shape[0] != len(self.id_preds[i]):
        self.id_preds[i] += self.id_preds[i][-1] * (sid.shape[0] - len(self.id_preds[i]))

      test_to_preds["sid"].extend(list(sid.sentence_id.values))
      test_to_preds["words"].extend(list(sid.words.values))
      test_to_preds["true"].extend(list(sid.labels.values))
      test_to_preds["preds"].extend(self.id_preds[i])

    id_df = pd.DataFrame(test_to_preds)
    id_df['codes'] = self.test_codes
    id_df['country'] = self.test_df.country

    return id_df

  
  def _build_final(self):

    pred_codes = self.all_spans.preds.to_list()

    tot_spans = []
    span_id = 0

    for _, sent_df in self.id_df.groupby(by='sid'):

      spans = self.get_spans(sent_df.preds, sent_df.codes)

      for start, end in zip(spans, spans[1:]):

        df_list = sent_df[start:end].values.tolist()

        for sublist in df_list:

          sublist.append(pred_codes[span_id])

        tot_spans.extend(df_list)
        span_id += 1

    return pd.DataFrame(tot_spans, columns=['sid', 'words', 'true_id', 'pred_id', 'true_code', 'country', 'pred_code'])


  def _process_pred_id(self, pred_id, article_nums):
    spans, articles = [], []

    for idx, preds in enumerate(pred_id):

      tokens, labels = [], []

      for sent_lst in preds:
        for token, label in sent_lst.items():

          tokens.append(token)
          labels.append(label)

      span_limits = self.find_spans(labels)

      for start, end in zip(span_limits, span_limits[1:]):

        spans.append(" ".join(tokens[start:end]))
        articles.append(article_nums[idx])

    return spans, articles


  def _process_span_id(self):
    """Collect the predicted spans of every test sentence.

    Each span gets the most frequent code among its tokens and the first
    country value of its sentence.

    :return: DataFrame with columns ``sid``, ``text``, ``code``, ``country``.
    """
    all_spans = []
    for sentence_id, sent_df in tqdm(self.id_df.groupby(by='sid')):
      sent_dict = sent_df.to_dict(orient='list')

      # Only the predicted boundaries are used below; the gold ('true')
      # boundaries used to be computed here as well and then discarded.
      pred_spans = self.get_spans(sent_dict["preds"], sent_dict["codes"])

      for start, end in zip(pred_spans, pred_spans[1:]):
        all_spans.append([sentence_id,
                          " ".join(sent_dict["words"][start:end]),
                          sent_df[start:end].codes.value_counts().keys()[0],
                          sent_df.country.unique()[0]
                          ])

    return pd.DataFrame(all_spans, columns=['sid', 'text', 'code', 'country'])


  def _process_span_class(self, test, model_outputs):
    """Score span classification at domain level (code // 100).

    :param test: Frame with ``labels`` and ``code`` columns.
    :param model_outputs: One vector of logits per row of ``test``.
    :return: Dict produced by ``classification_metrics`` for the domains.
    """
    label_to_code = dict(zip(test.labels, test.code))

    # Labels grouped by the domain of their code.
    domain_to_label = {domain: [] for domain in (0, 1, 2, 3, 4, 5, 6, 7, 9)}
    for label, code in label_to_code.items():
      domain_to_label[code // 100].append(label)

    def as_domains(label_seq):
      # A label that belongs to no domain list contributes nothing.
      return [domain
              for label in label_seq
              for domain, members in domain_to_label.items()
              if label in members]

    true_domains = as_domains(test.labels)

    preds_bert = [np.argmax(self.softmax(logits)) for logits in model_outputs]
    preds_domain_bert = as_domains(preds_bert)

    return self.classification_metrics(true_domains, preds_domain_bert)

  
  def _treat_dataframe(self, df):
    """Encode sentence ids as integers and split off the ``codes`` column.

    The ``sentence_id`` column of the frame passed in is overwritten in place.

    :param df: DataFrame with ``sentence_id``, ``words`` and ``codes`` columns.
    :return: ``(frame_without_codes, codes)``; ``words`` is cast to ``str``.
    """
    encoder = LabelEncoder()
    # Ids are compared as strings, so the encoding follows string order.
    df.sentence_id = encoder.fit_transform([str(id_) for id_ in df.sentence_id])

    features, codes = self.drop_codes(df)
    features.words = features.words.apply(str)
    return features, codes


  def classification_metrics(self, true, preds):
    """Return accuracy, macro F-score and MCC for the given labels.

    :return: Dict with keys ``acc``, ``f_score`` and ``mcc``.
    """
    accuracy = metrics.accuracy_score(true, preds)
    macro_f = self.f_macro(true, preds)
    mcc = metrics.matthews_corrcoef(true, preds)
    return {"acc": accuracy, "f_score": macro_f, "mcc": mcc}


  @staticmethod
  def drop_codes(df):
    return df.drop(['codes'], axis=1), df.codes

  
  @staticmethod
  def get_spans(labels, codes):
    limits = []
    prev = ""
    for idx, (lbl, code) in enumerate(zip(labels, codes)):
        if lbl == 'B' or (prev != code and lbl == 'O'):
            limits.append(idx)
        prev = code

    limits.append(len(labels))

    if len(limits) <= 1:
      if limits[0] == 0:
        raise Exception
      else:
        limits = [0] + limits
    elif limits[0] != 0:
      limits = [0] + limits
    return limits


  @staticmethod
  def find_spans(labels):
    idx = []
    prev = ''
    for i, label in enumerate(labels):
      if label == 'B':
        idx.append(i)
      elif label == 'O' and prev != 'O':
        idx.append(i)
      prev = label
    idx.append(len(labels))
    return idx


  @staticmethod
  def f_macro(true, preds):
    """Macro-averaged F1 score of ``preds`` against ``true``."""
    macro_f1 = metrics.f1_score(true, preds, average='macro')
    return macro_f1


  @staticmethod
  def softmax(logits):
    return np.exp(logits) / np.sum(np.exp(logits), axis=0)

In [None]:
# Labelled frame handed to PipelineModel below; the pipeline reads its
# sentence_id, words, labels, codes and country columns.
test = pd.read_csv('drive/My Drive/data-ner/test_ner.csv')

In [None]:
# Both stages use the 'xlmroberta' model type, loaded from separate directories.
model = PipelineModel(
    test=test,
    ner_type='xlmroberta',
    ner_dir='drive/My Drive/roberta-ner/',
    class_type='xlmroberta',
    class_dir='drive/My Drive/roberta-smooth-0.9/',
)

Treating Input DataFrames
Building Span Identification Model
Building Span Classification Model



use_multiprocessing automatically disabled as xlmroberta fails when using multiprocessing for feature conversion.



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




In [None]:
# model.process()

In [None]:
# One entry per item in the articles folder.
# NOTE(review): os.listdir returns entries in arbitrary order, and the cells
# below drop "the last two" with [:-2] - which entries get excluded therefore
# depends on the filesystem. Consider naming the excluded entries explicitly.
author_directory = os.listdir('drive/My Drive/data-articles/')

In [None]:
author_directory[:-2]

['Rui Ramos',
 'Alexandre Homem Cristo',
 'Rui Tavares',
 'Mariana Mortágua',
 'Fernanda Câncio']

In [None]:

# Run the prediction pipeline once per author folder. Each call writes
# results.csv and a PNG chart under prediction_results/<author>/.
# The trailing '/' is required: predict() takes the author name from the path
# component before it.
for author in tqdm(author_directory[:-2]):
  model.predict('drive/My Drive/data-articles/' + author + '/')

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9589.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=300.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=9643.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=302.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5601.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=176.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, max=5647.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=177.0), HTML(value='')))





In [None]:
!zip -r ./prediction_results.zip ./prediction_results/

  adding: prediction_results/ (stored 0%)
  adding: prediction_results/Alexandre Homem Cristo/ (stored 0%)
  adding: prediction_results/Alexandre Homem Cristo/results.csv (deflated 65%)
  adding: prediction_results/Alexandre Homem Cristo/Alexandre Homem Cristo.png (deflated 12%)
  adding: prediction_results/Rui Tavares/ (stored 0%)
  adding: prediction_results/Rui Tavares/Rui Tavares.png (deflated 11%)
  adding: prediction_results/Rui Tavares/results.csv (deflated 64%)
  adding: prediction_results/Fernanda Câncio/ (stored 0%)
  adding: prediction_results/Fernanda Câncio/Fernanda Câncio.png (deflated 12%)
  adding: prediction_results/Fernanda Câncio/results.csv (deflated 61%)
  adding: prediction_results/Rui Ramos/ (stored 0%)
  adding: prediction_results/Rui Ramos/results.csv (deflated 65%)
  adding: prediction_results/Rui Ramos/Rui Ramos.png (deflated 13%)
  adding: prediction_results/test2/ (stored 0%)
  adding: prediction_results/test2/test2.png (deflated 27%)
  adding: predicti