# Preprocessing data for the bag of words approach

This notebook performs preprocessing on the Quora datasets for the bag of words approach. This means that we do not take into account the text or sequence of words itself, but information such as punctuation, length, etc.

In [None]:
import sys
sys.path.append("../..")
import numpy as np
import pandas as pd

from common.nlp.feature_adder import FeatureAdder

In [None]:
# Read the competition CSVs with explicit column dtypes, so that the question
# IDs ("qid") are kept as strings instead of being type-inferred by pandas.
dtypes = dict(qid=str, question_text=str, target=int)
train, test = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

## Add features to the datasets
The `common.nlp.feature_adder` module is used to extract features from the text. This takes around 20 minutes. Therefore, it could be beneficial to evaluate with experiments which features are useful and which are not.

Since we cannot use external data, we define a list of bad words ourselves. The section 'Create a list of bad words' explains how we arrive at this list.

In [None]:
# Hand-maintained bad-word list, kept in alphabetical order. It is passed to
# FeatureAdder through the "badwords" parameter; how the list was derived is
# described in the section 'Create a list of bad words'.
badwords = [
    'ahole', 'asshole',
    'bareback', 'bastard', 'beastial', 'bestial', 'big black', 'bitch',
    'black cock',
    'chink', 'cocks', 'creampie', 'cunt',
    'dick',
    'feck', 'fondle', 'fuc',
    'gays', 'golden shower',
    'incest',
    'jackass',
    'lesbians', 'lusty',
    'moron',
    'pedophilia', 'pricks', 'puss',
    'raped', 'raping',
    'scum', 'shit', 'sissy', 'sluts', 'sodom',
    'tits', 'tranny', 'transsexual',
    'whore',
]

In [None]:
# Configuration for FeatureAdder: a boolean switch per dense feature, plus the
# bad-word list and the name of the column that holds the question text.
fa_params = dict(
    data_dir='Data/',
    upper_case=True,
    word_count=True,
    unique_words_count=True,
    letter_count=True,
    punctuation_count=True,
    little_case=True,
    # NOTE(review): this uses the stop-word list shipped with nltk, which is
    # external data, so it is probably not allowed.
    stopwords=True,
    question_or_exclamation=True,
    number_bad_words=True,
    sentiment_analysis=True,
    badwords=badwords,
    text_column="question_text",
)
fa = FeatureAdder(**fa_params)

# Compute the features for both datasets. load/save are passed as False;
# presumably they control reading/writing cached features under data_dir —
# confirm in FeatureAdder.
train_extended, test_extended = fa.get_features(train, test, load=False, save=False)

## Create a list of bad words
Since we cannot use external data, it is not possible to use a list of bad words from the internet. What we can do is create one ourselves that contains only the most important bad words. The main question here is: how do we define the most important bad words for this competition?

We start with a list of bad words that is used by google. Then we perform the following steps:
- For each google bad word, we count how many times it occurs and how many times the text in which the word occurs is classified as insincere.  
- Select only bad words that occur in the train set and are classified as insincere in more than X% of the cases (the code below uses X = 55)
- Drop duplicate bad words 
- Select only words that are not already captured by another selected bad word. 
- Keep only bad words that occur in the test dataset

The selected words make up the `badwords` list that is hard-coded in the section 'Add features to the datasets'.

NOTE! Running this code takes around 30 minutes, so I recommend testing it on a part of the dataset or with only a subset of the Google bad words. The resulting badwords_quora.pkl file can be found in the shared surfdrive.

In [None]:
# Candidate bad words: the file is semicolon-separated, has no header row and
# is ISO-8859-1 encoded. Only the first column is kept, as a one-column
# DataFrame whose column label is 0.
google_badwords = pd.read_csv(
    '../data/google_badwords.csv',
    sep=';',
    header=None,
    encoding='ISO-8859-1',
)[[0]]

# Reload train/test with explicit dtypes, so that the question IDs ("qid") are
# kept as strings instead of being type-inferred by pandas.
dtypes = dict(qid=str, question_text=str, target=int)
train, test = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [None]:
# For every Google bad word, count the training questions that contain it
# (case-sensitive substring match) and compute the share of those questions
# that are labelled insincere (target == 1).
occurance = []  # per bad word: number of training questions containing it
classification = []  # per bad word: fraction of those questions with target == 1

question_texts = train['question_text']
is_insincere = train['target'] == 1

for word in google_badwords[0]:
    # Vectorised literal substring test. regex=False keeps characters such as
    # '.' or '*' inside a bad word from being treated as a pattern, and
    # na=False counts a missing question text as "does not contain the word".
    contains_word = question_texts.str.contains(word, regex=False, na=False)
    n_matches = int(contains_word.sum())
    occurance.append(n_matches)

    if n_matches:
        # Select the matching rows with the boolean mask. This stays correct
        # for any index on `train`, unlike passing index labels to the
        # positional .iloc indexer.
        classification.append(float(is_insincere[contains_word].mean()))
    else:
        # The word never occurs, so there is no insincere share to compute.
        classification.append(0)

In [None]:
# Attach the per-word statistics as new columns and persist the table, so the
# slow counting step does not have to be repeated.
stat_columns = {'occurance': occurance, '%insincere': classification}
for column_name, column_values in stat_columns.items():
    google_badwords[column_name] = column_values
google_badwords.to_pickle('data/badwords_quora.pkl')

In [None]:
# Load the bad-word statistics (columns 0, 'occurance' and '%insincere') from
# the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a
# badwords_quora.pkl that comes from a trusted source.
google_badwords = pd.read_pickle('data/badwords_quora.pkl')

In [None]:
# Keep the bad words that actually occur in the training set and whose
# matching questions are labelled insincere in more than `min_insincere_share`
# of the cases. The occurrence test is strict (> 0): a ">= 0" test is always
# true and would not filter anything.
min_insincere_share = 0.55
occurs_in_train = google_badwords['occurance'] > 0
mostly_insincere = google_badwords['%insincere'] > min_insincere_share
selected_words_train = google_badwords.loc[occurs_in_train & mostly_insincere, 0].tolist()
print(len(selected_words_train))

In [None]:
# Collapse repeated entries. Going through a set does not preserve the
# original order of the words.
unique_words = set(selected_words_train)
selected_words_train = list(unique_words)
print(len(selected_words_train))

In [None]:
# Drop every word that contains another selected word as a substring: the
# shorter word already matches every text the longer one would match.
# Example: with both 'motherfuck' and 'motherfucking' selected, the latter is
# redundant and is removed.
remove_words = [
    longer
    for shorter in selected_words_train
    for longer in selected_words_train
    if shorter != longer and shorter in longer
]
selected_words_train = list(set(selected_words_train) - set(remove_words))
print(len(selected_words_train))

In [None]:
# Retain a bad word only if at least one test question contains it
# (case-sensitive substring match).
test_questions = test['question_text']
selected_words = [
    word
    for word in selected_words_train
    if test_questions.apply(lambda text: word in text).any()
]
print(len(selected_words))

In [None]:
# Put the final selection in alphabetical order.
selected_words = sorted(selected_words)