In [95]:
import os
import sys
import csv
import re
import numpy as np

# HurtLex release to load; lexicon files are read from lexica/<language>/<HL_VERSION>/.
HL_VERSION = "1.2"

# HurtLex category codes. A code's position in this list is its slot inside each
# half of the feature vector.
categories = [
    "ps", "pa", "ddf", "ddp",
    "asf", "pr", "om", "qas",
]

# Number of categories; feature vectors hold 2 * LEN entries (one half per level).
LEN = len(categories)

In [96]:
class HurtLexFeaturizer:
    """Count HurtLex lexicon hits in a text, per category and per level.

    Every feature vector has ``2 * LEN`` entries. The first ``LEN`` slots count
    hits on 'conservative' entries, the last ``LEN`` slots count hits on
    'inclusive' entries; within each half the slot order follows ``categories``.
    """

    def __init__(self, language):
        """Load both lexicon levels for ``language`` (e.g. ``'EN'``).

        Raises whatever ``open`` raises if the lexicon file is missing.
        """
        self.language = language
        self.conservative_lexicon = self.read_lexicon('conservative', language)
        self.inclusive_lexicon = self.read_lexicon('inclusive', language)

    def read_lexicon(self, level, language):
        """Read the entries of one ``level`` from the lexicon of ``language``.

        Args:
            level: value of the TSV ``level`` column to keep
                ('conservative' or 'inclusive').
            language: language code used for both the folder and the file name,
                i.e. ``lexica/<language>/<HL_VERSION>/hurtlex_<language>.tsv``.

        Returns:
            dict mapping each lemma to a ``2 * LEN`` numpy vector holding how
            often the lemma is listed under each category at this level.
            Lemmas whose category is not in ``categories`` are kept with an
            all-zero vector.
        """
        lexicon = dict()
        # Use the argument (not self.language) so the file name always agrees
        # with the directory built from the same argument.
        lexicon_filename = "hurtlex_{0}.tsv".format(language)
        lexicon_path = os.path.join("lexica", language, HL_VERSION, lexicon_filename)
        # 'inclusive' counts live in the second half of the vector.
        offset = LEN if level == "inclusive" else 0
        # Explicit UTF-8 so non-ASCII lemmas do not depend on the platform's
        # default encoding; newline="" is what the csv module expects.
        with open(lexicon_path, encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                if row["level"] != level:
                    continue
                lemma = row["lemma"]
                if lemma not in lexicon:
                    lexicon[lemma] = np.zeros(2 * LEN)
                if row["category"] in categories:
                    lexicon[lemma][offset + categories.index(row["category"])] += 1
        return lexicon

    def _lemma_pattern(self, lemma):
        """Return the compiled whole-word pattern for ``lemma``, compiling it once."""
        # Created lazily so the cache also exists on instances built without __init__.
        cache = vars(self).setdefault("_pattern_cache", {})
        pattern = cache.get(lemma)
        if pattern is None:
            # re.escape: lemmas are literal text, not regex syntax ("c++", "a.b").
            # The lookarounds behave like \b for lemmas that start and end with a
            # word character, and still work when a lemma starts or ends with
            # punctuation (where \b would never match).
            pattern = re.compile(r"(?<!\w)" + re.escape(lemma) + r"(?!\w)")
            cache[lemma] = pattern
        return pattern

    def check_presence(self, lexicon, text):
        """Sum the vectors of all lexicon lemmas found in ``text``.

        Each whole-word occurrence of a lemma adds that lemma's vector once.
        Matching is case-sensitive; callers are expected to normalise ``text``.

        Returns:
            numpy vector of length ``2 * LEN`` (all zeros when nothing matches).
        """
        final_features = np.zeros(2 * LEN)
        for lemma, vector in lexicon.items():
            if not lemma:
                # An empty lemma would match at arbitrary positions; ignore it.
                continue
            hits = len(self._lemma_pattern(lemma).findall(text))
            if hits:
                final_features += hits * vector
        return final_features

    def process(self, text):
        """Return the combined conservative + inclusive feature vector for ``text``."""
        conservative = self.check_presence(self.conservative_lexicon, text)
        inclusive = self.check_presence(self.inclusive_lexicon, text)
        return np.add(conservative, inclusive)

In [97]:
# One featurizer per supported language; each constructor call reads that
# language's lexicon file from disk, so this cell must run before any features
# are computed.
enHurtlex = HurtLexFeaturizer(language='EN')
esHurtlex = HurtLexFeaturizer(language='ES')

In [103]:
def hurtlex_features(text, language):
    """Return length-normalised HurtLex features for ``text``.

    Args:
        text: the raw text; it is lower-cased before matching.
        language: ``'en'`` or ``'es'`` (selects ``enHurtlex`` / ``esHurtlex``).

    Returns:
        The featurizer's count vector divided by the number of characters in
        ``text``. An empty ``text`` yields an all-zero vector instead of NaNs.

    Raises:
        ValueError: if ``language`` is not one of the supported codes
            (previously this case silently returned ``None``).
    """
    if language == 'en':
        featurizer = enHurtlex
    elif language == 'es':
        featurizer = esHurtlex
    else:
        raise ValueError(
            "Unsupported language {!r}; expected 'en' or 'es'".format(language)
        )
    text = text.lower()
    # max(..., 1) keeps the division defined for an empty string: the counts are
    # all zero there, so the result is a zero vector rather than 0/0 = NaN.
    return featurizer.process(text) / max(len(text), 1)

In [111]:
# English sample tweet.
# NOTE: the value holds no quotation marks around anti-feminazi. Inside a
# double-quoted Python literal a doubled "" is not an escaped quote; it ends one
# literal and starts the next, and adjacent literals are concatenated. The
# pieces are written out explicitly here; use \" if literal quotes are wanted.
sample_tweet_en = (
    "She calls herself "
    "anti-feminazi"
    " how about shut the fucking up on your vile commentary on an elderly"
    " responsible citizen tu sach muuch ghani baawri-bewdi hai bey"
    " https://t.co/ZMxTDwsY5D"
)

In [112]:
# Feature vector for the English sample; counts are divided by the tweet's
# character length, so values are small fractions.
hurtlex_features(text=sample_tweet_en, language='en')

array([0.        , 0.        , 0.00543478, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [113]:
# Spanish sample tweet, kept verbatim (including the trailing space) because the
# features are normalised by the text's character length.
sample_tweet_es = "@wakanda____ Interesante, la vieja mugre pirulina evita confrontar a otra lagartona mujer, pero con hombres se pone agresiva pirulina, será porque la ley de feminicidio que se siente huevuda con hombres y sabe nos vamos al bote fácilmente por tronarle la trompa por abusiva "

In [114]:
# Feature vector for the Spanish sample; counts are divided by the tweet's
# character length, so values are small fractions.
hurtlex_features(text=sample_tweet_es, language='es')

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00364964, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])