In [8]:
import re
import string
import unicodedata
from abc import ABC, abstractmethod
from pathlib import Path
from pprint import pprint
from typing import List, Union, Any, Dict

import numpy as np
import pandas as pd
import torch
from transformers import pipeline


def clean_str(s: str) -> str:
    """String pre-processing function, used to reduce noise.

    Steps, applied in order:
        1. Coerce the input to ``str``.
        2. Remove irrelevant tokens: e-mail addresses and external URLs
           (anything containing ``@``, ``http:``/``https:`` or ``www.``).
        3. Drop every character that is not ASCII punctuation, an ASCII
           letter, a digit or a space. This removes control characters such
           as the newline ``\\n`` and also deletes non-ASCII characters
           (they are removed, not transliterated; NFKD-to-ASCII
           normalisation is currently disabled).

    Args:
        s: Text to clean. Any non-string value is converted with ``str()``
            first, so e.g. a missing value ``nan`` becomes the text ``'nan'``.

    Returns:
        The cleaned string. Removed tokens leave their surrounding spaces
        behind, so runs of several spaces may remain.
    """
    text = str(s)

    # Remove irrelevant info
    text = re.sub(r'\S*@\S*\s?', '', text)     # Email
    text = re.sub(r'\S*https?:\S*', '', text)  # URL (http)
    text = re.sub(r'\S*www\.\S*', '', text)    # URL (www)

    # Keep punctuation and words only.
    # The allowed characters must be escaped before being placed in a
    # character class: string.punctuation contains `\`, `]`, `^` and `-`,
    # which are otherwise interpreted as regex syntax (unescaped, the `\`
    # fuses with the following `]` and backslashes get stripped).
    allowed = (string.punctuation +
               string.ascii_letters +
               string.digits +
               ' ')
    return re.sub(r'[^' + re.escape(allowed) + r']+', '', text)

In [2]:
# Load textual descriptions of interested entities
# Rows whose `org_flag` is true are dropped (presumably organisations --
# TODO confirm), then `description1` is cleaned. `assign` builds a new frame,
# which avoids pandas' SettingWithCopyWarning when writing to a filtered slice.
df_ent = (
    pd.read_csv('../data/SF_all_tone_2k_entities.csv')
    .loc[lambda d: ~d['org_flag']]
    .assign(description1=lambda d: d['description1'].map(clean_str))
)

# Load occupation categories (lower-cased so they match the labels used later)
df_occ = (
    pd.read_csv('../data/categories.csv')
    .assign(occupation=lambda d: d['occupation'].str.lower())
)

In [3]:
# Predict occupation
# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (`device=-1`) so the notebook still runs on machines without a GPU.
model = pipeline(
    'zero-shot-classification',
    'valhalla/distilbart-mnli-12-1',
    device=0 if torch.cuda.is_available() else -1,
)
# Candidate labels offered to the classifier: the distinct occupation names
categories = df_occ.occupation.unique().tolist()
def predict_occupation(row: pd.Series) -> str:
    """Return the predicted occupation label for one entity row.

    The row's description is classified against the notebook-level
    ``categories`` list using the notebook-level zero-shot ``model``.

    Args:
        row: One row of the entity table. Only ``row['description1']`` is
            read.

    Returns:
        The first entry of the ``'labels'`` list in the classifier output
        (presumably the highest-scoring label -- confirm against the
        pipeline documentation).
    """
    result = model(row['description1'], categories)
    return result['labels'][0]

In [4]:
# Occupations of interest; entities predicted as any other label get is_kept = 0
occ_keep = {
    'politician',
    'businessperson',
    'journalist',
    'social activist',
    'extremist',
    'judge',
    'lawyer',
    'economist',
    'critic',
    'military person'
}

# Classify every entity (one model call per row), then flag the rows whose
# predicted occupation is in `occ_keep` with 1, all others with 0.
df_ent = df_ent.assign(
    occ_pred=lambda d: d.apply(predict_occupation, axis=1),
    is_kept=lambda d: d['occ_pred'].isin(occ_keep).astype('int64'),
)

# Make sure the output directory exists before writing, so the (slow)
# predictions are not lost to a missing-directory error.
pred_csv_path = Path('../out/SF_all_tone_2k_entities_pred_occ_bart.csv')
pred_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_ent.to_csv(pred_csv_path, index=False)



In [9]:
# Analysis of extremist predictions
# Reload the saved predictions and keep only the rows labelled 'extremist',
# restricted to the entity name and its two description columns.
df_ext = (
    pd.read_csv('../out/SF_all_tone_2k_entities_pred_occ_bart.csv')
    .loc[lambda d: d.occ_pred == 'extremist',
         ['entity', 'description1', 'description2']]
    .reset_index(drop=True)
)

In [6]:
# Inspect the complete classifier result for the first extremist-labelled
# entity, this time classifying its `description2` text.
# NOTE(review): the stored `occ_pred` was computed from `description1`, and
# `description2` is not passed through `clean_str` -- confirm both are intended.
row = df_ext.iloc[0]
ret = model(row['description2'], categories)
pprint(ret)

In [15]:
# Display the entity name and cleaned description of every row predicted as 'extremist'
df_ext.loc[:, ['entity', 'description1']]

Unnamed: 0,entity,description1
0,Van Breda,The Van Breda murders were the killing of thre...
1,Steve Biko,Bantu Stephen Biko (18 December 1946 12 Septe...
2,Nino Mbatha,"Dec 14, 2018 ... CNNNino Mbatha (wearing camou..."
