In [1]:
import pandas as pd 
import spacy
from spacy.tokens import Doc, Token, DocBin
from spacy.vocab import Vocab
import numpy as np
import re
import time
from spacy.lang.en import English
from spellchecker import SpellChecker
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
import pickle

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [2]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'<LINK>', text)

def reduce_elongated(text): 
    return re.sub(r'(?i)(.)\1{2,}', r'\1\1', text)

spell = SpellChecker()

In [6]:
fields = ['comment_text', 'toxicity','target']
df_train = pd.read_csv("data/train_custom.csv", usecols = fields)
df_train = df_train[:int(len(df_train) * 1/4)]
#df_train = df_train[:10]
df_valid = pd.read_csv("data/valid_custom.csv", usecols = fields)
df_valid = df_valid[:int(len(df_valid) * 1/4)]
#df_valid = df_valid[:10]
#df_test = pd.read_csv("data/test_custom.csv", usecols = fields)

In [7]:
#dfs = [df_train, df_valid, df_test]
dfs = [df_train, df_valid]
#df_names = ["train", "valid", "test"]
df_names = ["train", "valid"]
suffix = "lemma_nopunct_cleaned_sentencized"
path = "data/pre_processed/subsets/25_percent/"

for i, df in enumerate(dfs):
    print("\nprocessing", df_names[i])
    t1 = time.time()
    processed_comments = []
    df.dropna(subset = ['comment_text'], inplace = True)
    for text in df['comment_text']:
        text = re.sub(r'[-()\"#/@;:<>{}=~|,]'," ", text)
        text = re.sub(r"\n\r","", text)
        combine_whitespaces = re.compile(r"\s+")
        processed_comments.append(reduce_elongated(combine_whitespaces.sub(" ", text).strip()))

    pipe = nlp.pipe(processed_comments, batch_size = 512 ,disable = ["tagger", "parser"])
    results = []
    for j, doc in enumerate(pipe):
        s = []
        if j % 10000 == 0:
            print("{0:.0%}...".format(j/len(dfs[i])), end='')
        for sent in list(doc.sents):
            s.extend([token.lemma_.lower() for token in sent if not token.is_punct and not token.is_digit\
                                  and not token.like_url and not token.like_email and not token.like_num])
            s.extend(['.'])            
        results.append(" ".join(s))
    
   
    t2 = time.time()
    print(t2- t1)

    df['comment_text'] = results
    print(df_names[i])
    df.to_csv(path + df_names[i]  + "_" + suffix + ".csv", index = False)


processing train
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%...64%...66%...69%...72%...75%...78%...80%...83%...86%...89%...91%...94%...97%...100%...782.5173802375793
train

processing valid
0%...11%...22%...33%...44%...55%...66%...78%...89%...100%...212.74158692359924
valid


['Yana stinkt sehr stark ..']
