In [1]:
import pandas as pd 
import spacy
from spacy.tokens import Doc, Token, DocBin
from spacy.vocab import Vocab
import numpy as np
import re
import time
from spacy.lang.en import English
from spellchecker import SpellChecker
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
import pickle

# Load the small English pipeline and append a sentencizer component so that
# doc.sents is still available when the parser is disabled in nlp.pipe(...).
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Minimal custom stop-word list. A missing comma previously fused "it" and
# "its" into the single entry "itits", so neither word was actually in the set.
# NOTE(review): replacing Defaults.stop_words after the pipeline is loaded
# presumably does not change token.is_stop; membership in this set has to be
# tested explicitly -- confirm against the spaCy version in use.
nlp.Defaults.stop_words = {"a", "this", "that", "it", "its", "and", "be", "to", "just", "the", "on", "but", "in", "for"}

In [2]:
def reduce_elongated(text):
    """Shorten runs of a repeated character to exactly two occurrences.

    Any character (other than a newline) that appears three or more times in
    a row, compared case-insensitively, is replaced by two copies of the
    first character of the run, e.g. "sooooo" -> "soo" and "Aaa" -> "AA".

    Args:
        text: the string to normalise.

    Returns:
        A new string with every such run reduced to length two.
    """
    elongated_run = re.compile(r'(?i)(.)\1{2,}')
    return elongated_run.sub(r'\1\1', text)

# Module-level SpellChecker instance. None of the cells visible in this
# section reference it -- TODO confirm whether it is still needed.
spell = SpellChecker()

In [3]:
fields = ['comment_text', 'toxicity','target', 'id']
RAW_SPLIT_DIR = "data/raw_split/"


def load_split(split_name, columns=fields, sample_size=None):
    """Read one raw split from RAW_SPLIT_DIR and drop incomplete rows.

    Args:
        split_name: "train", "valid" or "test"; the file read is
            RAW_SPLIT_DIR + split_name + "_custom.csv".
        columns: columns to load from the CSV (defaults to `fields`).
        sample_size: if given, keep only the first `sample_size` rows after
            dropping incomplete ones (handy for quick experiments).

    Returns:
        A DataFrame restricted to `columns`, without any row that has a
        missing value in one of them.
    """
    frame = pd.read_csv(RAW_SPLIT_DIR + split_name + "_custom.csv", usecols = columns)
    # dropna() returns a new frame, so the helper never mutates shared state.
    frame = frame.dropna()
    if sample_size is not None:
        frame = frame[:sample_size]
    return frame


# Pass sample_size=100 to any call below for a fast trial run.
df_train = load_split("train")
df_valid = load_split("valid")
df_test = load_split("test")

In [6]:
# Splits handled by this run. Swap in the commented pair to process all three.
#dfs = [df_train, df_valid, df_test]
#df_names = ["train", "valid", "test"]
dfs = [df_valid]
df_names = ["valid"]
suffix = "lemma_nopunct_cleaned_sentencized_with_id"
path = "data/pre_processed/"

# Patterns are compiled once here instead of once per comment.
punct_to_space = re.compile(r'!?[-()\"#/@;:<>{}=~|,]')
asterisks = re.compile(r"\*")
combine_whitespaces = re.compile(r"\s+")


def _clean_comment(text):
    """Normalise one raw comment before it is handed to spaCy.

    Steps, in order: replace the listed punctuation characters (optionally
    preceded by "!") with a space, delete asterisks, collapse every
    whitespace run to a single space, strip the ends, and shorten elongated
    character runs via reduce_elongated.

    The asterisk step used to be re.sub(r"\\*\\n\\r", "", text), which only
    matched the literal three-character sequence "*", LF, CR and was
    therefore effectively a no-op. Line breaks need no separate handling
    because the whitespace collapse already turns them into a single space.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned comment string (possibly empty).
    """
    text = punct_to_space.sub(" ", text)
    text = asterisks.sub("", text)
    return reduce_elongated(combine_whitespaces.sub(" ", text).strip())


def _lemmatize_doc(doc):
    """Turn a spaCy Doc into a space-joined string of lemmas.

    For every sentence in doc.sents the lemmas of the kept tokens are
    emitted, followed by a single "." as sentence delimiter. Tokens are
    dropped when they are digits, a literal ".", URL-like, e-mail-like,
    number-like or punctuation.

    Args:
        doc: a Doc produced by the pipeline (sentence boundaries must be set).

    Returns:
        The lemmatised text; an empty string if the Doc has no sentences.
    """
    lemmas = []
    for sent in doc.sents:
        lemmas.extend(token.lemma_ for token in sent
                      if not token.is_digit
                      and not token.text == "."
                      and not token.like_url
                      and not token.like_email
                      and not token.like_num
                      and not token.is_punct)
        #---For stopword removal and person name masking, add to the filter / value above:
        #   and not token.lemma_ in nlp.Defaults.stop_words
        #   ("-PERSON-" if token.ent_type_ == "PERSON" else token.lemma_)
        lemmas.append('.')
    return " ".join(lemmas)


for df_name, df in zip(df_names, dfs):
    print("\nprocessing", df_name)
    t1 = time.time()
    # Guarantees every value passed to the regexes below is a string.
    df.dropna(subset = ['comment_text'], inplace = True)
    processed_comments = [_clean_comment(text) for text in df['comment_text']]

    # ner, tagger and parser are disabled for this pass; sentence boundaries
    # come from the sentencizer component added to the pipeline.
    pipe = nlp.pipe(processed_comments, batch_size = 512 ,disable = ["ner", "tagger", "parser"])
    results = []
    for j, doc in enumerate(pipe):
        if j % 10000 == 0:
            print("{0:.0%}...".format(j/len(df)), end='')
        results.append(_lemmatize_doc(doc))

    t2 = time.time()
    print(t2- t1)

    # The frame is overwritten in place (as before), so re-running this cell
    # without reloading the data would preprocess already-processed text.
    df['comment_text'] = results
    print(df_name)
    df.to_csv(path + df_name  + "_" + suffix + ".csv", index = False)


processing valid
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%...64%...66%...69%...72%...75%...78%...80%...83%...86%...89%...91%...94%...97%...100%...740.7140009403229
valid


In [3]:
#cell for testing things

import editdistance
from nltk.sentiment import SentimentIntensityAnalyzer

# Only a cell's last expression is echoed, so the bare call that used to sit
# here computed the distance and threw it away; print it explicitly instead.
print(editdistance.eval('idiotism', 'idiots'))

# NER stays enabled here (only tagger and parser are disabled) so that
# PERSON entities can be masked in the printed token list.
pipe = nlp.pipe(["Sara Fredericks was rather mean"], batch_size = 512 ,disable = ["tagger", "parser"])
for doc in pipe:
    print("NEs: " + ", ".join([("-PERSON-" if token.ent_type_ == "PERSON" else token.lemma_) for token in doc]))

# Sentiment scores for a bare name.
sa = SentimentIntensityAnalyzer()
sentiments = sa.polarity_scores("Donald Trump")
print(sentiments)

# Shows how the NLTK tokenizer treats an asterisk-obfuscated word.
print(word_tokenize("b*tch"))

NEs: -PERSON-, -PERSON-, be, rather, mean
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
['you', 'b', '*', 'tch']
