In [4]:
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from spacy.lang.en import English
from spacy.tokens import Doc, Token, DocBin
from spacy.vocab import Vocab
from spellchecker import SpellChecker

# Shared spaCy pipeline; the preprocessing loop further down streams comments
# through it via nlp.pipe() to tokenise them and read each token's lemma.
nlp = spacy.load("en_core_web_sm")

In [2]:
def reduce_elongated(text):
    """Shorten every run of three or more repeated characters to two.

    Repeats are matched case-insensitively and the first character of the
    run is the one that is kept (so "AAAaaa" becomes "AA"). Runs of one or
    two characters are left untouched; newlines are never part of a run
    because "." does not match them.
    """
    elongated_run = re.compile(r'(?i)(.)\1{2,}')
    return elongated_run.sub(lambda run: run.group(1) * 2, text)

# Shared spell checker: the preprocessing loop uses spell.unknown() to find
# tokens it does not recognise and spell.correction() to propose a fix.
spell = SpellChecker()

In [3]:
fields = ['comment_text', 'toxicity','target']
# Share of each split that is kept (rows are taken from the top of the file).
# The output directory used when saving is named "25_percent"; keep both in sync.
subset_fraction = 1/4


def load_subset(csv_path, fraction = subset_fraction):
    """Load the columns listed in `fields` and keep the leading `fraction` of rows.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.
    fraction : float
        Share of rows to keep, counted from the first row.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows. Copying matters because the
        frame is later modified in place (dropna / column assignment); doing
        that on a bare slice is what triggers pandas' SettingWithCopyWarning.
    """
    full = pd.read_csv(csv_path, usecols = fields)
    return full.iloc[:int(len(full) * fraction)].copy()


df_train = load_subset("data/train_custom.csv")
df_valid = load_subset("data/valid_custom.csv")
#df_test = load_subset("data/test_custom.csv")

In [4]:
#dfs = [df_train, df_valid, df_test]
dfs = [df_train, df_valid]
#df_names = ["train", "valid", "test"]
df_names = ["train", "valid"]
suffix = "lemma_nopunct_spellcorrect"
path = "data/pre_processed/subsets/25_percent/"
dict_path = "dicts/"
# Only tokens seen at most this many times are spell-corrected; more frequent
# unknown tokens are assumed to be intentional (names, slang, ...).
freq_threshold = 10

# Compiled once instead of once per comment.
strip_chars = re.compile(r'[-()\"#/@;:<>{}=~|,]')
combine_whitespaces = re.compile(r"\s+")


def clean_comment(text):
    """Return `text` with selected symbols blanked, whitespace collapsed and
    character runs shortened (see reduce_elongated)."""
    text = strip_chars.sub(" ", text)
    text = re.sub(r"\n\r","", text)
    return reduce_elongated(combine_whitespaces.sub(" ", text).strip())


def replace_token(comment, wrong, right):
    """Replace every space-delimited token equal to `wrong` with `right`.

    Works on whole tokens only, so a misspelling that happens to be a
    substring of a longer word leaves that word untouched.
    """
    return " ".join(right if tok == wrong else tok for tok in comment.split(" "))


# Fail before the long-running pipeline, not after it, if a directory is missing.
os.makedirs(path, exist_ok = True)
os.makedirs(dict_path, exist_ok = True)

for i, df in enumerate(dfs):
    dict_misspell = {}   # unknown token -> indices of the comments containing it
    fdist = FreqDist()   # token frequencies over the whole split

    t1 = time.time()
    print("\nprocessing", df_names[i])
    # Deliberately in place: the frame referenced by `dfs` is the one updated.
    df.dropna(subset = ['comment_text'], inplace = True)

    # Input and output of the spaCy pipe are kept in separate lists. nlp.pipe
    # consumes its input lazily, so appending results to the list it is reading
    # from would make the input grow while it is being iterated and would break
    # the one-result-per-row alignment needed further down.
    cleaned_comments = [clean_comment(text) for text in df['comment_text']]
    processed_comments = []

    # NOTE(review): lemmas are read with the tagger disabled; what that yields
    # depends on the installed spaCy version — confirm the lemmas look as expected.
    pipe = nlp.pipe(cleaned_comments, batch_size = 512 ,disable = ["tagger", "parser"])
    for j, doc in enumerate(pipe):
        if j % 10000 == 0:
            print("{0:.0%}...".format(j/len(cleaned_comments)), end='')

        tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_digit\
                                  and not token.like_url and not token.like_email and not token.like_num]
        processed_comments.append(" ".join(tokens))
        # Same counts as `fdist += FreqDist(tokens)` without building a
        # throwaway FreqDist for every document.
        fdist.update(tokens)

        # Remember which comments contain misspellings to avoid having to look
        # at every comment later; j indexes processed_comments.
        for misspelled_word in spell.unknown(tokens):
            dict_misspell.setdefault(misspelled_word, []).append(j)

    t2 = time.time()
    print(t2- t1)
    t1 = time.time()

    # Spell correction is done after the rest because only then is the
    # frequency dict fully built.
    with open(dict_path + "freq_dict_lower_" + df_names[i] + ".p", "wb") as fh:
        pickle.dump(fdist, fh)
    with open(dict_path + "misspell_dict_lower_" + df_names[i] + ".p", "wb") as fh:
        pickle.dump(dict_misspell, fh)

    print("\nspellfixing", df_names[i])
    n_misspelled = len(dict_misspell)
    for k, (mis, missspelling_idxs) in enumerate(dict_misspell.items()):
        if k % 100 == 0:
            print("{0:.0%}...".format(k/n_misspelled), end='')
        if fdist[mis] > freq_threshold:
            # Frequent unknown words are left alone to avoid false positives.
            continue
        correct_spelling = spell.correction(mis)
        # Depending on the pyspellchecker version, "no suggestion" comes back
        # either as None or as the word itself; neither needs a rewrite.
        if not correct_spelling or correct_spelling == mis:
            continue
        for idx in missspelling_idxs:
            processed_comments[idx] = replace_token(processed_comments[idx], mis, correct_spelling)

    # One processed comment per remaining row, in row order.
    assert len(processed_comments) == len(df), "processed comments do not line up with the frame"
    df['comment_text'] = processed_comments
    df.to_csv(path + df_names[i]  + "_" + suffix + "_thresh_" + str(freq_threshold) + ".csv", index = False)

    t2 = time.time()
    print(t2- t1)


processing train
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%...64%...66%...69%...72%...75%...78%...80%...83%...86%...89%...91%...94%...97%...100%...4289.135826826096

spellfixing train
0%...0%...0%...0%...0%...0%...1%...1%...1%...1%...1%...1%...1%...1%...1%...1%...1%...1%...2%...2%...2%...2%...2%...2%...2%...2%...2%...2%...2%...2%...3%...3%...3%...3%...3%...3%...3%...3%...3%...3%...3%...3%...4%...4%...4%...4%...4%...4%...4%...4%...4%...4%...4%...4%...5%...5%...5%...5%...5%...5%...5%...5%...5%...5%...5%...5%...6%...6%...6%...6%...6%...6%...6%...6%...6%...6%...6%...6%...7%...7%...7%...7%...7%...7%...7%...7%...7%...7%...7%...7%...8%...8%...8%...8%...8%...8%...8%...8%...8%...8%...8%...8%...9%...9%...9%...9%...9%...9%...9%...9%...9%...9%...9%...9%...10%...10%...10%...10%...10%...10%...10%...10%...10%...10%...10%...10%...11%...11%...11%...11%...11%...11%...11%...11%...11%...11%...11%...11%...12%...12%...