# Identify occurrences of *hen* in the dataset

In [1]:
# Imports
import nltk
import pandas as pd
from tqdm import tqdm
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# preprocessing without removing stop words
import re
import string
def preprocess(text:str, to_string:bool=True):
    """Normalize text by lowercasing and stripping punctuation and digits.

    The text is split with ``nltk.word_tokenize``. Each token is lowercased,
    has every character from ``string.punctuation`` and every digit removed,
    and is dropped if nothing is left afterwards. Stop words are kept.

    Args:
        text (str): the text to be preprocessed
        to_string (bool, optional): whether to return the preprocessed text as a
            single space-joined string instead of a list of tokens. Defaults to True.

    Returns:
        str or list: the preprocessed text as a string (to_string=True) or as a
            list of cleaned tokens (to_string=False)
    """
    # One character class for both removals: ASCII punctuation plus digits.
    # Raw string so that "\d" reaches the regex engine instead of being an
    # invalid Python string escape.
    re_strip = re.compile(r"[%s\d]" % re.escape(string.punctuation))

    tokens = []
    for token in nltk.word_tokenize(text):
        word = re_strip.sub("", token.lower())
        if word:  # skip tokens that consisted only of punctuation/digits
            tokens.append(word)

    if not to_string:
        return tokens

    # Collapse any whitespace run left inside the joined tokens to one space.
    return re.sub(r"\s\s+", " ", " ".join(tokens))

In [5]:
# Load the DKHate split; the last two return values are not needed here.
X_train_orig, X_test_orig, _, _ = utils.load_dkhate(test_size=0.2)

# Wrap each split in a DataFrame and add a "text" column holding the
# preprocessed version of the raw "tweet" column.
X_train_orig = pd.DataFrame(X_train_orig).assign(
    text=lambda frame: frame["tweet"].apply(preprocess)
)
X_test_orig = pd.DataFrame(X_test_orig).assign(
    text=lambda frame: frame["tweet"].apply(preprocess)
)

# Preview the first training rows (raw tweet next to preprocessed text).
X_train_orig.head()

Unnamed: 0_level_0,tweet,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3176,Hahaha,hahaha
1440,@USER hvis du føler du har det svært så prøv a...,user hvis du føler du har det svært så prøv at...
3501,Det er endnu en barriere for bønder uden for E...,det er endnu en barriere for bønder uden for eu
3016,Mit eneste møde ved ham var på min snuskede st...,mit eneste møde ved ham var på min snuskede st...
2399,Forøvrigt taget fra et godt dokumentarprogram ...,forøvrigt taget fra et godt dokumentarprogram ...


In [6]:
# identify and save occurrences
def save_hen_occurrences(texts, path, target="hen"):
    """Write every text containing `target` as a token to `path` and count the hits.

    Each text is tokenized with ``nltk.word_tokenize``. For every token equal to
    `target` the counter is incremented and the full text, followed by a blank
    line, is written to the file; a text with several matching tokens is
    therefore written once per match.

    Args:
        texts (iterable of str): the texts to search
        path (str): file to write the matching texts to (overwritten if it exists)
        target (str, optional): the token to look for. Defaults to "hen".

    Returns:
        int: total number of tokens equal to `target` across all texts
    """
    count = 0
    # Explicit UTF-8 so non-ASCII characters (e.g. æ, ø, å, emoji) are written
    # identically on every platform instead of depending on the locale default.
    with open(path, "w", encoding="utf-8") as f:
        for text in texts:
            for token in nltk.word_tokenize(text):
                if token == target:
                    count += 1
                    f.write(text + "\n\n")
    return count


c = save_hen_occurrences(X_train_orig["text"], "hen_occurrences_train.txt")
print(c, "OCCURRENCES OF HEN IN TRAINING DATA")

c = save_hen_occurrences(X_test_orig["text"], "hen_occurrences_test.txt")
print(c, "OCCURRENCES OF HEN IN TEST DATA")

8 OCCURRENCES OF HEN IN TRAINING DATA
2 OCCURRENCES OF HEN IN TEST DATA
