In [1]:
#Import needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gc
import os
import pickle 
import mglearn
import warnings
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import enchant
from collections import Counter
from nltk import pos_tag
d = enchant.Dict("en_US")
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import multiprocessing
from gensim.models import Word2Vec
from collections import defaultdict  # For word frequency
import spacy  # For preprocessing
import logging  # Setting up the loggings to monitor gensim
from gensim.models.phrases import Phrases, Phraser
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
from time import time  # To time our operations

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
SEED=13
DIMS=300
FEAT=3000
#python -m spacy download en
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
#File locations

root_dir = os.path.abspath(os.curdir)
base_dir = os.path.dirname(root_dir)+"/Data/"

#Create directory to save features
feat_dir = os.path.dirname(root_dir)+"/Features/"
os.makedirs(feat_dir, exist_ok=True)
bad = base_dir+'en'
training_dataset = base_dir+'trainingDataset-toxicComments.csv'
#For discourse we will use this lexicon
#https://github.com/discourse-lab/en_dimlex
discourse = base_dir+'en_dimlex-parsed.txt'
#Positive words are taken from the following resource 
#https://github.com/zengyan-97/Sentiment-Lexicon
positive = base_dir+'positive.txt'
negative = base_dir+'negative.txt'
disList = [x.strip() for x in open(discourse)]
posList = [x.strip() for x in open(positive)]
negList = [x.strip() for x in open(negative)]
proList = [x.strip() for x in open(bad)]
modals = ['can', 'could', 'may', 'might', 'must', 'will', 'would', 'should']

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hind\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hind\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [2]:
#Function to clean the dataset
#credit goes to :https://github.com/LoLei/redditcleaner
def clean(text, newline=True, quote=True, bullet_point=True, 
          link=True, strikethrough=True, spoiler=True,
          code=True, superscript=True, table=True, heading=True):
    """
    Cleans text (string).
    Removes common Reddit special characters/symbols:
      * \n (newlines)
      * &gt; (> quotes)
      * * or &amp;#x200B; (bullet points)
      * []() (links)
      * etc (see below)
    Specific removals can be turned off, but everything is on by default.
    Standard punctuation etc is deliberately not removed, can be done in a
    second round manually, or may be preserved in any case.
    """
    # Newlines (replaced with space to preserve cases like word1\nword2)
    if newline:
        text = re.sub(r'\n+', ' ', text)

        # Remove resulting ' '
        text = text.strip()
        text = re.sub(r'\s\s+', ' ', text)

    # > Quotes
    if quote:
        text = re.sub(r'\"?\\?&?gt;?', '', text)

    # Bullet points/asterisk (bold/italic)
    if bullet_point:
        text = re.sub(r'\*', '', text)
        text = re.sub('&amp;#x200B;', '', text)

    # []() Link (Also removes the hyperlink)
    if link:
        text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    # Strikethrough
    if strikethrough:
        text = re.sub('~', '', text)

    # Spoiler, which is used with < less-than (Preserves the text)
    if spoiler:
        text = re.sub('&lt;', '', text)
        text = re.sub(r'!(.*?)!', r'\1', text)

    # Code, inline and block
    if code:
        text = re.sub('`', '', text)

    # Superscript (Preserves the text)
    if superscript:
        text = re.sub(r'\^\((.*?)\)', r'\1', text)

    # Table
    if table:
        text = re.sub(r'\|', ' ', text)
        text = re.sub(':-', '', text)

    # Heading
    if heading:
        text = re.sub('#', '', text)

    return text

def cleanTokens(text): 
    stemmer = WordNetLemmatizer()
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(text))
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    return document


def cleanVecs(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [3]:
#Read, clean, and fix the dataset, then split it to training and testing
def generateData(datasetFlag,subFlag):
    train = pd.read_csv(training_dataset)
    train['comment_raw'] = train['comment_text']
    train['comment_text'] = train['comment_text'].apply(lambda x: clean(x))
    s = train[['highly_toxic','slightly_toxic','non_toxic']]
    train['category'] = pd.get_dummies(s).idxmax(1)
    train['label'] =train['category'].map({'non_toxic':0, 'slightly_toxic':1,'highly_toxic':2})
    train = train.sample(frac=1,random_state=SEED)
    print("Original shape before preprocessing:",train.shape)

    train['brief_cleaning'] = train['comment_text'].map(lambda x: re.sub("[^A-Za-z']+", ' ', str(x)).lower()) 
    t = time()
    txt = [cleanVecs(doc) for doc in nlp.pipe(train.brief_cleaning.tolist(), batch_size=5000, n_process=-1)]

    print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
    train['clean'] = txt
    # df_clean = pd.DataFrame({'clean': txt})
    train = train.dropna()
    train = train.drop_duplicates(subset='clean', keep='first')
    train = train.reset_index(drop=True)
    print(train['comment_text'].head())
    print("Shape after preprocessing:",train.shape)
    
    if datasetFlag == 1:
        print("Yielding data for BoW")
        train['comment_text'] = train['comment_text'].map(lambda x: cleanTokens(x))
        x = train['comment_text']
                
    elif datasetFlag == 2:
        x = train
        if subFlag==1:
            print("Yielding data for W2V")  
        elif subFlag==2:
            print("Yielding data for D2V")
    
    elif datasetFlag == 3:
        print("Yielding data for Hateful features")
        x = train['comment_raw']
    
    y = train['label']
    printNumbers(y)
    y = train['label'].to_numpy()
    y = y.reshape(-1,1)
    with open(feat_dir+'target.pkl','wb') as f: pickle.dump(y, f)
    return x, y
      
    
def printNumbers(y):
    nontox, slightly, highly = np.bincount(y)
    total = nontox + slightly + highly
    print('Total: {}\n    Non toxic: {} ({:.2f}% of total)\n'.format(
        total, nontox, 100 * nontox / total))
    print('Total: {}\n    Slightly toxic: {} ({:.2f}% of total)\n'.format(
        total, slightly, 100 * slightly / total))
    print('Total: {}\n    Highly toxic: {} ({:.2f}% of total)\n'.format(
        total, highly, 100 * highly / total))

In [4]:
#Create word vectroizer instance, then fit it on training and testing splits
#Unigram
def generateBOWFeats(datasetFlag):
    subFlag = 0
    x, y = generateData(datasetFlag,subFlag)
    tf_vectorizer = CountVectorizer(
        stop_words=None,
        strip_accents='unicode',
        token_pattern=r'\w{2,}', #accept tokens that have 1 or more characters
        analyzer='word',
        ngram_range=(1, 1),
        min_df=1,
        max_features=FEAT)

    # fit data
    tf = tf_vectorizer.fit_transform(x).toarray()
    with open(feat_dir+'tf.pkl','wb') as f: pickle.dump(tf, f)
    with open(feat_dir+'tf.pkl','rb') as f: arrayname1 = pickle.load(f)
    print(np.array_equal(tf,arrayname1)) #sanity check

    #Bigram
    tfbig_vectorizer = CountVectorizer(
        stop_words=None,
        strip_accents='unicode',
        token_pattern=r'\w{2,}', #accept tokens that have 1 or more characters
        analyzer='word',
        ngram_range=(2, 2),
        min_df=1,
        max_features=FEAT)

    # fit data
    tfbig = tfbig_vectorizer.fit_transform(x).toarray()
    with open(feat_dir+'tfbig.pkl','wb') as f: pickle.dump(tfbig, f)
    with open(feat_dir+'tfbig.pkl','rb') as f: arrayname1 = pickle.load(f)
    print(np.array_equal(tfbig,arrayname1)) #sanity check

    #Ngram
    tfn_vectorizer = CountVectorizer(
        stop_words=None,
        strip_accents='unicode',
        token_pattern=r'\w{2,}', #accept tokens that have 1 or more characters
        analyzer='word',
        ngram_range=(3, 5),
        min_df=1,
        max_features=FEAT)

    # fit data
    tfn = tfn_vectorizer.fit_transform(x).toarray()  
    with open(feat_dir+'tfn.pkl','wb') as f: pickle.dump(tfn, f)
    with open(feat_dir+'tfn.pkl','rb') as f: arrayname1 = pickle.load(f)
    np.array_equal(tfn,arrayname1) #sanity check

    #TFIDF
    tfidf_vectorizer = TfidfVectorizer(
        stop_words=None,
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{2,}',  #vectorize 2-character words or more
        ngram_range=(1, 1),
        max_features=FEAT)

    # fit data
    tfidf = tfidf_vectorizer.fit_transform(x).toarray()
    with open(feat_dir+'tfidf.pkl','wb') as f: pickle.dump(tfidf, f)
    with open(feat_dir+'tfidf.pkl','rb') as f: arrayname1 = pickle.load(f)
    print(np.array_equal(tfidf,arrayname1)) #sanity check
    

In [5]:
#Compute w2v and d2v features
def generateGensimFeats(datasetFlag, subFlag):
    if subFlag == 1:
        x, y = generateData(datasetFlag,subFlag)
        train = pd.DataFrame()
        train = x
        sent = [row.split() for row in train['clean']]
        print(len(sent))
        phrases = Phrases(sent, min_count=2, progress_per=100)
        bigram = Phraser(phrases)
        sentences = bigram[sent]
        com_tokens = train['clean'].apply(lambda x: x.split()) # tokenizing 
        w2v_model = Word2Vec(min_count=2,
                             window=10,
                             vector_size=DIMS,
                             hs=1,
                             sg=1,
                             negative=10,
                             seed = SEED,
                             workers=cores-1)
        print("Building w2v vocabulary")
        w2v_model.build_vocab(sentences, progress_per=1000)
        #print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
        name = feat_dir+"word2vec-model.bin"
        if os.path.isfile(name):
            print("Model exists, no need to train")
            print("The model will be leaded")
            w2v_model = Word2Vec.load(name)
            print(w2v_model.wv.most_similar(positive=["fuck"]))
            print(len(w2v_model.wv['fuck']))
            print("Vocab size:",len(w2v_model.wv))
        else:
            print("There is no model. The model will be trained now:")    
            t = time()
            w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=15, report_delay=1)
            print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
            #saving and loading the model
            w2v_model.save(name)
            print(w2v_model.wv.most_similar(positive=["fuck"]))
            print(len(w2v_model.wv['fuck']))
            print("Vocab size:",len(w2v_model.wv))


        def word_vector(tokens, size):
            vec = np.zeros(size).reshape((1, size))
            count = 0
            for word in tokens:
                try:
                    vec +=w2v_model.wv[word].reshape((1, size))
                    count += 1.
                except KeyError:  # handling the case where the token is not in vocabulary
                    continue
            if count != 0:
                vec /= count
            return vec

        wordvec_arrays = np.zeros((len(y), DIMS)) 
        
        for i in range(len(y)):  
            wordvec_arrays[i,:] = word_vector(com_tokens[i], DIMS)    
        wordvec_df = pd.DataFrame(wordvec_arrays)
        print("Trained vectors shape:",wordvec_df.shape)
        name = feat_dir+"word2vec-features.pkl"
        wordvec_df.to_pickle(name)
        del wordvec_df
        del wordvec_arrays
        gc.collect()
        
    elif subFlag == 2:
        x, y = generateData(datasetFlag,subFlag)
        # #Prepare input for doc2vec model
        train = pd.DataFrame()
        train = x
        com_tokens = train['clean'].apply(lambda x: x.split()) # tokenizing 
        doc_df = pd.DataFrame(com_tokens,columns=['clean'])
        doc_df['id'] = train['id']
        doc_df['label'] = train['label']

        #Prepare doc2vector data
        labeled_coms = doc_df.apply(lambda x: TaggedDocument(words=x.clean, tags=[x.id]), axis=1)

        #Building doc2vector model
        d2v_model=Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                  dm_mean=1, # dm_mean = 1 for using mean of the context word vectors
                  vector_size=DIMS, # no. of desired features
                  window=15, # width of the context window                                  
                  negative=7, # if > 0 then negative sampling will be used
                  min_count=2, # Ignores all words with total frequency lower than 5.                                  
                  workers=cores-1, # no. of cores                                  
                  alpha=0.03, # learning rate   
        #           alpha=0.0025, 
        #           min_alpha=0.000001, 
                  seed = SEED # for reproducibility
                 ) 
        print("Building D2V vocabulary")
        d2v_model.build_vocab([i for i in tqdm(labeled_coms)])
        #print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
        
        name = feat_dir+"doc2vec-model.bin"
        if os.path.isfile(name):
            print("Model exists, no need to train")
            print("The model will be leaded")
            d2v_model = Doc2Vec.load(name)
            print(d2v_model.dv['delfqkc'])
            print(len(d2v_model.dv['delfqkc']))
            print("Vocab size:",len(d2v_model.wv))
        else:
            print("There is no model. The model will be trained now:")    
            t = time()
            d2v_model.train(labeled_coms, total_examples=d2v_model.corpus_count, epochs=15, report_delay=1)
            print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
            #saving and loading the model
            d2v_model.save(name)
            print(d2v_model.dv['delfqkc'])
            print(len(d2v_model.dv['delfqkc']))
            print("Vocab size:",len(d2v_model.wv))

        docvec_arrays = np.zeros((len(com_tokens), DIMS)) 
        for i in range(len(train)):
            docvec_arrays[i,:] = d2v_model.dv[i].reshape((1,DIMS))    

        docvec_df = pd.DataFrame(docvec_arrays) 
        docvec_df.shape
        print("Trained vectors shape:",docvec_df.shape)     
        name = feat_dir+"doc2vec-features.pkl"
        docvec_df.to_pickle(name)
        
        del docvec_df
        del docvec_arrays
        gc.collect()
    gc.collect()

In [6]:
#Compute hateful features
def count_repeated(text):
    repeated_threshold = 1
    text_splitted = text.split()
    word_counts = Counter([x for x in text if x in string.punctuation])
    countPunkts = [item for item in list(word_counts.values()) if item > repeated_threshold]
    return len(countPunkts)

def count_unknown(text):
    spellCounter = 0
    text = text.encode("ascii", "ignore").decode()
    for ele in text:
        if ele in string.punctuation:
            text = text.replace(ele, "")
    #Number of unknown words
    for word in text.strip().split():
        if d.check(word) is False:
            #print(word)
            spellCounter += 1
    return spellCounter

def tag_part_of_speech(text):
    text_splited = text.split(' ')
    text_splited = [''.join(c for c in s if c not in string.punctuation) for s in text_splited]
    text_splited = [s for s in text_splited if s]
    pos_list = pos_tag(text_splited)
    noun_count = len([w for w in pos_list if w[1] in ('NN','NNP','NNPS','NNS')])
    adjective_count = len([w for w in pos_list if w[1] in ('JJ','JJR','JJS')])
    verb_count = len([w for w in pos_list if w[1] in ('VB','VBD','VBG','VBN','VBP','VBZ')])
    return[noun_count, adjective_count, verb_count]

def generateHateFeats(datasetFlag):
    subFlag = 0
    x, y = generateData(datasetFlag,subFlag)
    train = pd.DataFrame()
    train['comment_text'] = x
    train['total_length'] = train['comment_text'].apply(len)
    train['averageWords'] = train['comment_text'].apply(lambda comment: sum(len(word) for word in comment.split())/len(comment.split()))
    train['words'] = train['comment_text'].apply(lambda comment: len(comment.split()))
    train['words_vs_length'] = train['words'] / train['total_length']

    train['capitals'] = train['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    train['capitals_vs_length'] = train['capitals'] / train['total_length']
    train['capitals_vs_words'] = train['capitals'] / train['words']

    eng_stopwords = set(stopwords.words("english"))
    train['stopwords'] = train['comment_text'].apply(lambda comment: sum(comment.count(w) for w in eng_stopwords))
    train['stopwords_vs_length'] = train['stopwords'] / train['total_length']
    train['stopwords_vs_words'] = train['stopwords'] / train['words']

    train['punctuation'] = train['comment_text'].apply(
        lambda comment: sum(comment.count(w) for w in string.punctuation))
    train['punctuation_vs_length'] = train['punctuation'] / train['total_length']
    train['punctuation_vs_words'] = train['punctuation'] / train['words']
    train['period'] = train['comment_text'].apply(lambda comment: comment.count('.'))
   
    train['quote'] = train['comment_text'].apply(lambda comment: len([w for w in comment if w=='"' or w=="'"]))
    train['unique_words'] = train['comment_text'].apply(
        lambda comment: len(set(w for w in comment.split())))
    train['unique_words_vs_length'] = train['unique_words'] / train['total_length']
    train['unique_words_vs_words'] = train['unique_words'] / train['words']

    train['nonAlpha'] = train['comment_text'].str.count(r'[^a-zA-Z0-9 ]')

    train['repeatedPunct'] = train['comment_text'].apply(lambda comment: count_repeated(comment))

    train['discourse'] = train['comment_text'].apply(lambda comment:len([w.lower() for w in comment.split() if w in disList]))

    train['politeness'] = train['comment_text'].apply(lambda comment:len([w.lower() for w in comment.split() if w in posList]))

    train['rudeness'] = train['comment_text'].apply(lambda comment:len([w.lower() for w in comment.split() if w in negList]))
    train['singleTokens'] = train['comment_text'].apply(lambda comment:len([w for w in comment.split() if len(w)==1]))
    train['modCount'] = train['comment_text'].apply(lambda comment:len([w.lower() for w in comment.split() if w in modals]))    
    train['unknownWords'] = train['comment_text'].apply(lambda comment: count_unknown(comment))

    train['profaneCount'] = train['comment_text'].apply(lambda comment:len([w.lower() for w in comment.split() if w in bad]))
    train['profaneRatio'] = train['profaneCount'] / train['words']  

    train['nouns'], train['adjectives'], train['verbs'] = zip(*train['comment_text'].apply(
        lambda comment: tag_part_of_speech(comment)))

    train['nouns_vs_length'] = train['nouns'] / train['total_length']
    train['adjectives_vs_length'] = train['adjectives'] / train['total_length']
    train['verbs_vs_length'] = train['verbs'] / train['total_length']
    train['nouns_vs_words'] = train['nouns'] / train['words']
    train['adjectives_vs_words'] = train['adjectives'] / train['words']
    train['verbs_vs_words'] = train['verbs'] / train['words']
    train = train.drop('comment_text', 1)
    print(train.head())
    name = feat_dir+"hatefulFeats.pkl"
    train.to_pickle(name)


In [7]:
def combineAllFeatures():
    featureFiles = {'tf.pkl':'Unigram','tfbig.pkl':'Bigram','tfn.pkl':'Ngram','tfidf.pkl':'TFIDF',\
        'hatefulFeats.pkl':'Hateful','word2vec-features.pkl':'Word2Vec','doc2vec-features.pkl':'Doc2Vec'}
    lst = []
    for filename, title in featureFiles.items():   
        with open(feat_dir+filename,'rb') as f: train = pickle.load(f)
        print(train.shape)
        lst.append(train)
  
    result = np.hstack(lst)
    print(result.shape)
    name = feat_dir+"allFeatures.pkl"
    with open(name,'wb') as f: pickle.dump(result, f)

In [8]:
def main():
    generateBOWFeats(1)
    generateGensimFeats(2, 1)
    generateGensimFeats(2, 2)
    generateHateFeats(3)
    combineAllFeatures()
main()
del nlp
gc.collect()

Original shape before preprocessing: (10083, 8)


INFO - 02:52:38: NumExpr defaulting to 8 threads.


Time to clean up everything: 0.59 mins
0    With my SO (30 years together), this kind of q...
1    I feel like this is a sign of how young Americ...
2    I was once interviewed as a character witness ...
3    RJ Reynolds tobacco (Winston's, and Camel), Kr...
4    Lol, so you didn't take the transfer that's li...
Name: comment_text, dtype: object
Shape after preprocessing: (10065, 10)
Yielding data for BoW
Total: 10065
    Non toxic: 8210 (81.57% of total)

Total: 10065
    Slightly toxic: 1189 (11.81% of total)

Total: 10065
    Highly toxic: 666 (6.62% of total)

True
True
True
Original shape before preprocessing: (10083, 8)


INFO - 02:53:30: collecting all words and their counts
INFO - 02:53:30: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 02:53:30: PROGRESS: at sentence #100, processed 2519 words and 3628 word types
INFO - 02:53:30: PROGRESS: at sentence #200, processed 5961 words and 7935 word types
INFO - 02:53:30: PROGRESS: at sentence #300, processed 8874 words and 11292 word types
INFO - 02:53:30: PROGRESS: at sentence #400, processed 11689 words and 14457 word types
INFO - 02:53:30: PROGRESS: at sentence #500, processed 14742 words and 17738 word types
INFO - 02:53:30: PROGRESS: at sentence #600, processed 17588 words and 20653 word types
INFO - 02:53:30: PROGRESS: at sentence #700, processed 20656 words and 23942 word types
INFO - 02:53:30: PROGRESS: at sentence #800, processed 24018 words and 27375 word types
INFO - 02:53:30: PROGRESS: at sentence #900, processed 26737 words and 30122 word types
INFO - 02:53:30: PROGRESS: at sentence #1000, processed 29445 words and 32759 wo

Time to clean up everything: 0.57 mins
0    With my SO (30 years together), this kind of q...
1    I feel like this is a sign of how young Americ...
2    I was once interviewed as a character witness ...
3    RJ Reynolds tobacco (Winston's, and Camel), Kr...
4    Lol, so you didn't take the transfer that's li...
Name: comment_text, dtype: object
Shape after preprocessing: (10065, 10)
Yielding data for W2V
Total: 10065
    Non toxic: 8210 (81.57% of total)

Total: 10065
    Slightly toxic: 1189 (11.81% of total)

Total: 10065
    Highly toxic: 666 (6.62% of total)

10065


INFO - 02:53:30: PROGRESS: at sentence #2800, processed 84492 words and 82765 word types
INFO - 02:53:30: PROGRESS: at sentence #2900, processed 87344 words and 85253 word types
INFO - 02:53:30: PROGRESS: at sentence #3000, processed 90063 words and 87601 word types
INFO - 02:53:30: PROGRESS: at sentence #3100, processed 92890 words and 90033 word types
INFO - 02:53:30: PROGRESS: at sentence #3200, processed 95738 words and 92511 word types
INFO - 02:53:30: PROGRESS: at sentence #3300, processed 98632 words and 94942 word types
INFO - 02:53:30: PROGRESS: at sentence #3400, processed 101014 words and 96923 word types
INFO - 02:53:30: PROGRESS: at sentence #3500, processed 104033 words and 99437 word types
INFO - 02:53:30: PROGRESS: at sentence #3600, processed 107006 words and 101892 word types
INFO - 02:53:30: PROGRESS: at sentence #3700, processed 109945 words and 104481 word types
INFO - 02:53:30: PROGRESS: at sentence #3800, processed 113258 words and 107223 word types
INFO - 02:53:

INFO - 02:53:31: PROGRESS: at sentence #2000, processed 57225 words, keeping 10652 word types
INFO - 02:53:31: PROGRESS: at sentence #3000, processed 85397 words, keeping 13256 word types
INFO - 02:53:31: PROGRESS: at sentence #4000, processed 112979 words, keeping 15379 word types
INFO - 02:53:31: PROGRESS: at sentence #5000, processed 140987 words, keeping 17160 word types
INFO - 02:53:31: PROGRESS: at sentence #6000, processed 168517 words, keeping 18782 word types
INFO - 02:53:31: PROGRESS: at sentence #7000, processed 195622 words, keeping 20075 word types


Building w2v vocabulary


INFO - 02:53:31: PROGRESS: at sentence #8000, processed 223597 words, keeping 21337 word types
INFO - 02:53:31: PROGRESS: at sentence #9000, processed 248880 words, keeping 22374 word types
INFO - 02:53:31: PROGRESS: at sentence #10000, processed 277142 words, keeping 23474 word types
INFO - 02:53:31: collected 23540 word types from a corpus of 278894 raw words and 10065 sentences
INFO - 02:53:31: Creating a fresh vocabulary
INFO - 02:53:31: Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 13913 unique words (59.103653355989806%% of original 23540, drops 9627)', 'datetime': '2022-04-30T02:53:31.783172', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 02:53:31: Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 269267 word corpus (96.54815091038172%% of original 278894, drops 9627)', 'datetime': '2022-04-30T02:53:31.784169', 'gensim': '4

There is no model. The model will be trained now:


INFO - 02:53:33: EPOCH 1 - PROGRESS: at 31.43% examples, 74048 words/s, in_qsize 13, out_qsize 0
INFO - 02:53:34: worker thread finished; awaiting finish of 6 more threads
INFO - 02:53:34: worker thread finished; awaiting finish of 5 more threads
INFO - 02:53:34: worker thread finished; awaiting finish of 4 more threads
INFO - 02:53:34: EPOCH 1 - PROGRESS: at 89.62% examples, 107524 words/s, in_qsize 3, out_qsize 1
INFO - 02:53:34: worker thread finished; awaiting finish of 3 more threads
INFO - 02:53:34: worker thread finished; awaiting finish of 2 more threads
INFO - 02:53:34: worker thread finished; awaiting finish of 1 more threads
INFO - 02:53:34: worker thread finished; awaiting finish of 0 more threads
INFO - 02:53:34: EPOCH - 1 : training on 278894 raw words (254133 effective words) took 2.2s, 118161 effective words/s
INFO - 02:53:35: EPOCH 2 - PROGRESS: at 31.43% examples, 79865 words/s, in_qsize 13, out_qsize 0
INFO - 02:53:36: worker thread finished; awaiting finish of 6 mor

INFO - 02:53:56: EPOCH 11 - PROGRESS: at 31.43% examples, 65459 words/s, in_qsize 13, out_qsize 0
INFO - 02:53:57: worker thread finished; awaiting finish of 6 more threads
INFO - 02:53:57: EPOCH 11 - PROGRESS: at 81.81% examples, 91016 words/s, in_qsize 5, out_qsize 1
INFO - 02:53:57: worker thread finished; awaiting finish of 5 more threads
INFO - 02:53:57: worker thread finished; awaiting finish of 4 more threads
INFO - 02:53:58: worker thread finished; awaiting finish of 3 more threads
INFO - 02:53:58: worker thread finished; awaiting finish of 2 more threads
INFO - 02:53:58: worker thread finished; awaiting finish of 1 more threads
INFO - 02:53:58: worker thread finished; awaiting finish of 0 more threads
INFO - 02:53:58: EPOCH - 11 : training on 278894 raw words (254384 effective words) took 2.5s, 102841 effective words/s
INFO - 02:53:59: EPOCH 12 - PROGRESS: at 31.43% examples, 70179 words/s, in_qsize 13, out_qsize 0
INFO - 02:54:00: worker thread finished; awaiting finish of 6 

Time to train the model: 0.59 mins
[('srsly', 0.4781668484210968), ('negan', 0.40163785219192505), ('rephrase', 0.3981514871120453), ('academy', 0.3854159116744995), ('them_pregnant', 0.38458365201950073), ('scientology', 0.3814558982849121), ('pull_pant', 0.38130810856819153), ('rockstar', 0.3808191418647766), ('precedent', 0.3745284378528595), ('barbie', 0.37427619099617004)]
300
Vocab size: 13913
Trained vectors shape: (10065, 300)
Original shape before preprocessing: (10083, 8)
Time to clean up everything: 0.57 mins
0    With my SO (30 years together), this kind of q...
1    I feel like this is a sign of how young Americ...
2    I was once interviewed as a character witness ...
3    RJ Reynolds tobacco (Winston's, and Camel), Kr...
4    Lol, so you didn't take the transfer that's li...
Name: comment_text, dtype: object
Shape after preprocessing: (10065, 10)
Yielding data for D2V
Total: 10065
    Non toxic: 8210 (81.57% of total)

Total: 10065
    Slightly toxic: 1189 (11.81% of tot

INFO - 02:54:44: Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d300,n7,w15,mc2,s0.001,t7)', 'datetime': '2022-04-30T02:54:44.022947', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
100%|███████████████████████████████████████████████████████████████████████| 10065/10065 [00:00<00:00, 2018729.43it/s]
INFO - 02:54:44: collecting all words and their counts
INFO - 02:54:44: PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO - 02:54:44: PROGRESS: at example #10000, processed 292213 words (3989087/s), 21101 word types, 10000 tags
INFO - 02:54:44: collected 21166 word types and 10065 unique tags from a corpus of 10065 examples and 294060 words
INFO - 02:54:44: Creating a fresh vocabulary
INFO - 02:54:44: Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 11630 unique words (54.946612491732026%% of original 21166, drops 9536)', 'datetime': '20

Building D2V vocabulary


INFO - 02:54:44: deleting the raw counts dictionary of 21166 items
INFO - 02:54:44: sample=0.001 downsamples 33 most-common words
INFO - 02:54:44: Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 267843.69758218 word corpus (94.1%% of prior 284524)', 'datetime': '2022-04-30T02:54:44.243357', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 02:54:44: estimated required memory for 11630 words and 300 dimensions: 47818000 bytes
INFO - 02:54:44: resetting layer weights
INFO - 02:54:44: Doc2Vec lifecycle event {'msg': 'training model with 7 workers on 11630 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=7 window=15 shrink_windows=True', 'datetime': '2022-04-30T02:54:44.353087', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}


There is no model. The model will be trained now:


INFO - 02:54:45: worker thread finished; awaiting finish of 6 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 5 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 4 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 3 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 2 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 1 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 0 more threads
INFO - 02:54:45: EPOCH - 1 : training on 294060 raw words (277822 effective words) took 0.8s, 341008 effective words/s
INFO - 02:54:45: worker thread finished; awaiting finish of 6 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 5 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 4 more threads
INFO - 02:54:45: worker thread finished; awaiting finish of 3 more threads
INFO - 02:54:45: worker thread finished; awaiting finish

INFO - 02:54:55: EPOCH - 13 : training on 294060 raw words (277760 effective words) took 0.8s, 340738 effective words/s
INFO - 02:54:55: worker thread finished; awaiting finish of 6 more threads
INFO - 02:54:55: worker thread finished; awaiting finish of 5 more threads
INFO - 02:54:55: worker thread finished; awaiting finish of 4 more threads
INFO - 02:54:55: worker thread finished; awaiting finish of 3 more threads
INFO - 02:54:55: worker thread finished; awaiting finish of 2 more threads
INFO - 02:54:56: worker thread finished; awaiting finish of 1 more threads
INFO - 02:54:56: worker thread finished; awaiting finish of 0 more threads
INFO - 02:54:56: EPOCH - 14 : training on 294060 raw words (277857 effective words) took 0.8s, 341586 effective words/s
INFO - 02:54:56: worker thread finished; awaiting finish of 6 more threads
INFO - 02:54:56: worker thread finished; awaiting finish of 5 more threads
INFO - 02:54:56: worker thread finished; awaiting finish of 4 more threads
INFO - 02:

Time to train the model: 0.21 mins
[-0.04283398  0.00079077  0.0775322  -0.02957144  0.05562497 -0.00698017
  0.0186095  -0.01249805 -0.03274334  0.06288268  0.023407   -0.09177446
  0.04897289 -0.06626798 -0.10562522  0.08817029 -0.00785297  0.03037127
 -0.0091265   0.00599494  0.03655962 -0.0601322   0.1036763  -0.05080881
 -0.01161645 -0.00674384  0.06353684 -0.01104383  0.00789816  0.03166428
 -0.03410955 -0.02782872  0.03747762  0.12158677  0.04323016 -0.02921532
  0.04462312  0.02790437 -0.03779616 -0.03516443 -0.07907736  0.10882051
 -0.00171213  0.06894293  0.08435101 -0.04271301 -0.04076056 -0.12134328
  0.01811928 -0.02891283  0.02218465 -0.03312704 -0.0093756  -0.04591612
  0.02166417 -0.02674035  0.02390974 -0.00349059  0.02063171 -0.04118214
  0.01249478  0.02039344 -0.10467897  0.06346711 -0.06833743  0.04164757
  0.10436503 -0.08953945 -0.00787543 -0.01388591 -0.0160646  -0.08648466
  0.01331451  0.14527611  0.14969955 -0.05445182  0.00193428  0.04981213
 -0.02860853  0.

6408