In [1]:
from typing import List, Union, Any, Dict
from abc import ABC, abstractmethod

from tqdm import tqdm
from pprint import pprint

import numpy as np
import pandas as pd

import re
import string
import unicodedata

import torch
from datasets import Dataset
from transformers import pipeline

# Number of rows handed to `batched_prediction` per `Dataset.map` call.
BATCH_SIZE = 128

# Hugging Face Hub identifier of the checkpoint used for zero-shot
# classification.
MODEL_CARD = 'valhalla/distilbart-mnli-12-1'

# Shared zero-shot classifier, built once at import time. It runs on the first
# CUDA device when one is available and on the CPU (device index -1) otherwise.
PIPELINE = pipeline(
    task='zero-shot-classification',
    model=MODEL_CARD,
    device=0 if torch.cuda.is_available() else -1,
)


def batched_prediction(batch: Dict[str, List[Any]],
                       candidate_labels: List[str],
                       col_text: str = 'text') -> Dict[str, List[Dict[str, Any]]]:
    """Run the module-level zero-shot ``PIPELINE`` on one batch of texts.

    Intended as the callback of ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping from column name to the list of values of that column
            for the rows in the current batch.
        candidate_labels: Labels the classifier scores every text against.
        col_text: Name of the column in ``batch`` that holds the input texts.

    Returns:
        ``{'predictions': [...]}`` with exactly one pipeline result per input
        row, so the mapped dataset gains a ``predictions`` column.
    """
    out = PIPELINE(batch[col_text], candidate_labels=candidate_labels)

    # The zero-shot pipeline is documented to return "a dict or a list of
    # dict": some releases unwrap the result when only one sequence was
    # passed. A trailing batch with a single row would then yield a bare dict
    # instead of a one-element list and break the column length contract of
    # `Dataset.map`, so normalise to a list here.
    if isinstance(out, dict):
        out = [out]

    return {'predictions': out}


# Characters that survive cleaning: ASCII punctuation, letters, digits, space.
_KEEP_CHARS = string.punctuation + string.ascii_letters + string.digits + ' '

# `re.escape` is required because `string.punctuation` contains characters
# that are special inside a character class (backslash, `]`, `^`, `-`).
# Without it the sequence `\]` is read as an escaped bracket, which silently
# removes the backslash from the set of kept characters.
_RE_NOT_KEPT = re.compile('[^' + re.escape(_KEEP_CHARS) + ']+')

_RE_EMAIL = re.compile(r'\S*@\S*\s?')
_RE_URL_HTTP = re.compile(r'\S*https?:\S*')
_RE_URL_WWW = re.compile(r'\S*www\.\S*')

# Any run of whitespace characters other than the plain space.
_RE_NON_SPACE_WS = re.compile(r'[^\S ]+')


def clean_str(s: str) -> str:
    """Pre-process a string to reduce noise before classification.

    Steps, in order:
        1. Cast the input to ``str`` and fold it to ASCII (NFKD decomposition,
           then characters without an ASCII form are dropped).
        2. Remove tokens that contain an ``@`` (email addresses) and tokens
           that contain ``http:``/``https:`` or ``www.`` (URLs).
        3. Replace newlines, tabs and other non-space whitespace with a single
           space so that the words around them are not fused together.
        4. Drop every character that is not ASCII punctuation, an ASCII
           letter, a digit or a space.

    Args:
        s: Text to clean. Non-string values are converted with ``str`` first.

    Returns:
        The cleaned ASCII-only string (possibly empty).
    """
    # Normalize special chars
    text = unicodedata.normalize('NFKD', str(s))
    text = text.encode('ascii', 'ignore').decode()

    # Remove irrelevant info
    for noise in (_RE_EMAIL, _RE_URL_HTTP, _RE_URL_WWW):
        text = noise.sub('', text)

    # Turn line breaks / tabs into a separator instead of deleting them
    text = _RE_NON_SPACE_WS.sub(' ', text)

    # Keep punctuation and words only
    return _RE_NOT_KEPT.sub('', text)

In [2]:
# Load occupation categories; labels are lower-cased before being used as
# candidate labels for the classifier.
df_occ = (
    pd.read_csv('../data/categories.csv')
    .assign(occupation=lambda df: df.loc[:, 'occupation'].str.lower())
)
candidate_labels = df_occ.occupation.unique().tolist()

# Load textual descriptions of the entities of interest. The raw frame keeps
# its own name so that re-running this cell always starts from the same input.
df_ent_raw = pd.read_csv('../data/Search_Region_NI.csv')

# Keep the rows whose `org_flag` is False and add the cleaned text column.
# `.assign` builds a new frame, which avoids writing a column onto a filtered
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
df_ent_text = (
    df_ent_raw
    .loc[~df_ent_raw.loc[:, 'org_flag']]
    .assign(text=lambda df: df.loc[:, 'description1'].map(clean_str))
)

# Convert from Pandas to Huggingface dataset and predict occupations.
# After filtering, the frame no longer has a default RangeIndex, which
# `Dataset.from_pandas` would store as an extra `__index_level_0__` column;
# `preserve_index=False` keeps that artefact out of the result.
df_ent = (
    Dataset
    .from_pandas(df_ent_text, preserve_index=False)
    .map(
        batched_prediction,
        batched=True,
        batch_size=BATCH_SIZE,
        fn_kwargs={'candidate_labels': candidate_labels},
    )
    .to_pandas()
)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [3]:
# Occupations whose entities are flagged as kept in the exported file.
occ_keep = {
    'politician',
    'businessperson',
    'journalist',
    'social activist',
    'extremist',
    'judge',
    'lawyer',
    'economist',
    'critic',
    'military personnel'
}

# Each prediction holds parallel `labels` / `scores` sequences; position 0 is
# taken as the top-ranked occupation and its score.
top1_label = df_ent.predictions.map(lambda pred: pred['labels'][0])
top1_score = df_ent.predictions.map(lambda pred: pred['scores'][0])

df_ent.loc[:, 'top1_label'] = top1_label
df_ent.loc[:, 'top1_score'] = top1_score

# 1 when the top-ranked occupation is one of `occ_keep`, 0 otherwise.
df_ent.loc[:, 'is_kept'] = top1_label.map(lambda occ: int(occ in occ_keep))

# Persist the annotated entities without the DataFrame index.
df_ent.to_csv('../out/Search_Region_NI_pred_occ_bart.csv', index=False)

In [4]:
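# Analysis of extremist predictions (currently disabled).
# NOTE: the snippet below reads a different CSV than the one written by the
# previous cell and filters on an `occ_pred` column, whereas this notebook
# stores the predicted label in `top1_label`; adapt both before re-enabling.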
# df_ext = pd.read_csv('../out/SF_all_tone_2k_entities_pred_occ_bart.csv')
# df_ext = df_ext.loc[df_ext.occ_pred == 'extremist', ['entity', 'description1', 'description2']].reset_index(drop=True)