In [2]:
import pandas as pd
import numpy as np
import os
import torch 
import torch.nn.functional as F

from config import DATA_FOLDER, DATA_PCL_NAME, DATA_CATEGORIES_NAME
from utils import Utils
from transformers import BertTokenizer

df = pd.read_csv(os.path.join(
            os.path.dirname(os.getcwd()),
            DATA_FOLDER,
            DATA_PCL_NAME
        ))

df = df.dropna()
df

Unnamed: 0,par_id,art_id,keyword,country_code,text,label,binary_label
0,1,24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0.0,0.0
1,2,21968160,migrant,gh,"In Libya today , there are countless number of...",0.0,0.0
2,3,16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0.0,0.0
3,4,7811231,disabled,nz,Council customers only signs would be displaye...,0.0,0.0
4,5,1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0.0,0.0
...,...,...,...,...,...,...,...
10463,10464,19612634,disabled,ie,"""When Marie O'Donoghue went looking for a spec...",0.0,0.0
10464,10465,14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",1.0,0.0
10465,10466,70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0.0,0.0
10466,10467,20282330,in-need,ng,""""""" She has one huge platform , and informatio...",3.0,1.0


In [2]:
print("hell\tohe")

hell	ohe


# Preprocessing 

### Cleaning "keyword" and "country_code" columns

In [4]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

# Stem keyword
df["keyword"] = df["keyword"].apply(lambda word: porter.stem(word))
df["keyword"].unique()

array(['hopeless', 'migrant', 'immigr', 'disabl', 'refuge', 'in-ne',
       'homeless', 'vulner', 'women', 'poor-famili'], dtype=object)

In [3]:
# Convert countries to categtorical label value 
df["country_code"] = pd.Categorical(df["country_code"], categories=df["country_code"].unique()).codes
df["country_code"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int8)

### Preprocessing

1. White-space normalisation
2. Replace common words (a, the, and, or, and, ...) with a token
3. White-space normalisation + Normalisation of punctuation (Replacing the repetitions of punctations) + Replace common words (a, the, and, or, and, ...) with a token
4. White-space normalisation + Normalisation of punctuation + Removing Numbers 
5. White-space normalisation + Normalisation of punctuation + Removing Numbers + Replace common words (a, the, and, or, and, ...) with a token
6. White-space normalisation + Normalisation of punctuation +  Removing Contractions
7. White-space normalisation + Normalisation of punctuation +  Removing Contractions + Replace common words (a, the, and, or, and, ...) with a token

### Cleaning text

- normalisation of punctuation
- white-space normalisation
- Removing Numbers
- Removing Contractions
- Replacing the repetitions of punctations

In [5]:
# pip install autocorrect
from autocorrect import Speller
# pip install contractions
import contractions
import re 

spell = Speller(lang='en')

def autospell(text):
        spells = [spell(w) for w in text.split()]
        return " ".join(spells) 

def clean_text(text):
    text = text.strip("\"")                                                     # removing " at start of sentences
    # removing links
    text = re.sub(r'https? : \S+', '[WEBSITE]', text)
    text = re.sub(r'@\S+', '', text)                                            # removing referencing on usernames with @
    text = re.sub(r':\S+', '', text)                                            # removing smileys with : (like :),:D,:( etc) 
    text = re.sub(r'\"+', '', text)                                             # replacing repetitions of punctations
    text = re.sub(r'(\W)(?=\1)', '', text)                                      # replacing repetitions of punctations  
    text = re.sub('(?<![\w])20[0-5][0-9]-?[0-9]*', '[YEAR]', text)              # Year token
    text = re.sub('(?<![\w])1[0-9]{3}-?[0-9]*', '[YEAR]', text)                 # Year token
    text = re.sub('ca n\'t', 'can not', text)                                   # Many ocurrences of this (87)
    # replacing numbers with [NUM] tag  eg 1,000, 1.32, 5-7. Assert these numbers are not inside words (i.e. H1, )
    text = re.sub('(?<![\w])[0-9]+[.,]?[0-9]*(?![\w])', '[NUM]', text)   
    text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
    # Again to delete account numbers lol 12-5223-231
    text = re.sub('\[NUM\]-\[NUM\]', '[NUM]', text)
    text = re.sub('(?<=\[NUM\])-(?=[a-zA-Z])', ' ', text)
    text = re.sub('<h>', '', text)

    text_split = text.strip().split()
    text_new = []
    for i, word in enumerate(text_split):
        if i < len(text_split)-1:
            if text_split[i+1][0] == "'":
                new_word = word + text_split[i+1]
            else:
                new_word = word
            text_new.append(new_word) 
        else:
            text_new.append(word) 
    
    text = contractions.fix(" ".join([word for word in text_new if word[0] != "'"]))
    text = " ".join([word.strip("'") for word in text.split()]).strip()
    #text = autospell(text)

    # Remove possesive 's from end of words (not fixed by contractions). 
    # e.g. Oprah Winfrey's
    text = re.sub(r'\'s', '', text)
    # Many nots remain contracted
    text = re.sub(r'n\'t', 'not', text)
    
    return text

     

# extending the dataset with the column clean_text
df["clean_text"] = df['text'].apply(lambda x: clean_text(x))            

In [6]:
df[["text","clean_text"]].to_csv("check_clean.csv")

In [7]:
contractions.fix("'he told me to come' to the party' to the Mary's getting maried you're")

"'he told me to come' to the party' to the Mary's getting maried you are"

### Tokenization with pretrained model

In [8]:
a = [12,3,2,4]
a[:34]

[12, 3, 2, 4]

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Model parameters
MAX_SEQ_LEN = 512
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

def data_process(clean_text):
  # Enocode the text
  tokens = tokenizer.tokenize(clean_text)[:MAX_SEQ_LEN]
  #encode the tokens 
  encoded_sentence = torch.tensor(tokenizer.encode(tokens))
  #Pad the text output 
  encoded_sentence = F.pad(encoded_sentence, (0, MAX_SEQ_LEN - len(encoded_sentence)), value=PAD_INDEX)
  return encoded_sentence

# extending the dataset with the column clean_text
df["training_text"] = df['clean_text'].apply(lambda x: data_process(x))     

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


In [10]:
print( tokenizer.convert_ids_to_tokens(df[df["par_id"]==1]["training_text"][0]))

['[CLS]', 'we', 'are', 'living', 'in', 'times', 'of', 'absolute', 'insanity', ',', 'as', 'i', 'am', 'pretty', 'sure', 'most', 'people', 'are', 'aware', '.', 'for', 'a', 'while', ',', 'waking', 'up', 'every', 'day', 'to', 'check', 'the', 'news', 'seemed', 'to', 'carry', 'with', 'it', 'the', 'same', 'feeling', 'of', 'panic', 'and', 'dread', 'that', 'action', 'heroes', 'probably', 'face', 'when', 'they', 'are', 'trying', 'to', 'decide', 'whether', 'to', 'cut', 'the', 'blue', 'or', 'green', 'wire', 'on', 'a', 'ticking', 'bomb', '-', 'except', 'the', 'bomb', 'instructions', 'long', 'ago', 'burned', 'in', 'a', 'fire', 'and', 'imminent', 'catastrophe', 'seems', 'the', 'like', '##liest', 'outcome', '.', 'it', 'is', 'hard', 'to', 'stay', 'that', 'on', '-', 'edge', 'for', 'that', 'long', ',', 'though', ',', 'so', 'it', 'is', 'natural', 'for', 'people', 'to', 'become', 'in', '##ured', 'to', 'this', 'constant', 'chaos', ',', 'to', 'slump', 'into', 'a', 'mala', '##ise', 'of', 'hopeless', '##ness', 

In [33]:
test = tokenizer(df['clean_text'].to_list(), truncation=True, max_length=512, padding=True)

In [34]:
test.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [35]:
tokenizer.convert_ids_to_tokens(test['input_ids'][0])

['[CLS]',
 'we',
 'are',
 'living',
 'in',
 'times',
 'of',
 'absolute',
 'insanity',
 ',',
 'as',
 'i',
 'am',
 'pretty',
 'sure',
 'most',
 'people',
 'are',
 'aware',
 '.',
 'for',
 'a',
 'while',
 ',',
 'waking',
 'up',
 'every',
 'day',
 'to',
 'check',
 'the',
 'news',
 'seemed',
 'to',
 'carry',
 'with',
 'it',
 'the',
 'same',
 'feeling',
 'of',
 'panic',
 'and',
 'dread',
 'that',
 'action',
 'heroes',
 'probably',
 'face',
 'when',
 'they',
 'are',
 'trying',
 'to',
 'decide',
 'whether',
 'to',
 'cut',
 'the',
 'blue',
 'or',
 'green',
 'wire',
 'on',
 'a',
 'ticking',
 'bomb',
 '-',
 'except',
 'the',
 'bomb',
 'instructions',
 'long',
 'ago',
 'burned',
 'in',
 'a',
 'fire',
 'and',
 'imminent',
 'catastrophe',
 'seems',
 'the',
 'like',
 '##liest',
 'outcome',
 '.',
 'it',
 'is',
 'hard',
 'to',
 'stay',
 'that',
 'on',
 '-',
 'edge',
 'for',
 'that',
 'long',
 ',',
 'though',
 ',',
 'so',
 'it',
 'is',
 'natural',
 'for',
 'people',
 'to',
 'become',
 'in',
 '##ured',
 '

In [36]:
len(test['input_ids'][0])

512

In [37]:
len(test['attention_mask'][0])

512