# Extract Allowed Words
To make the other dataset more similar to train.csv + trial.csv, we extract the most common words from these two files and build a whitelist. Only whitelisted words are allowed; all other words are filtered out.

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
import nltk
# Fetch the NLTK resources this notebook relies on:
#   - 'wordnet' for WordNetLemmatizer
#   - 'averaged_perceptron_tagger' for nltk.pos_tag
#   - 'punkt' for nltk.word_tokenize (previously missing, so tokenizing raised
#     LookupError on a machine where it had never been downloaded)
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

# Load only the second column (index 1) of each preprocessed CSV.
train = pd.read_csv('../datasets/train_pre.csv', usecols=[1])
trial = pd.read_csv('../datasets/trial_pre.csv', usecols=[1])

# Stack both frames, then flatten the 'content' column into one lowercase
# string in which consecutive entries are separated by ' . '.
# NOTE(review): `all` shadows the builtin of the same name; it is kept because
# later cells read this variable.
combined = pd.concat([train, trial], ignore_index=True)
entries = combined['content'].tolist()
all = ' . '.join(str(entry) for entry in entries).lower()

from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for `word`.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    # pos_tag returns a list of (token, tag) pairs; we passed a single token.
    tagged = nltk.pos_tag([word])
    tag = tagged[0][1]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

# Lemmatize words: tokenize the corpus, reduce every token to its lemma using
# the POS guessed for that token, and re-join the lemmas with single spaces.
lemmatizer = WordNetLemmatizer()
lemmas = []
for token in nltk.word_tokenize(all):
    pos = get_wordnet_pos(token)
    lemmas.append(lemmatizer.lemmatize(token, pos))
all = ' '.join(lemmas)

In [None]:
import nltk
from nltk.probability import FreqDist

# Count how often each token occurs in the lemmatized corpus.
tokens = nltk.tokenize.word_tokenize(all)
fdist = FreqDist(tokens)

# Report the number of distinct tokens and the 20 most frequent ones.
distinct_count = len(fdist)
top_twenty = fdist.most_common(20)
print(f'Total number of words: {distinct_count}')
print(f'20 Most common words: {top_twenty}')

In [None]:
# Frequency cut-off between the two lists. Both filters below use this single
# value so that every word in `fdist` lands in exactly one of the two files.
RARE_WORD_MAX_COUNT = 4

# Words that occur more than RARE_WORD_MAX_COUNT times (i.e. at least 5 times)
more = [w for w, count in fdist.items() if count > RARE_WORD_MAX_COUNT]
more = pd.Series(more)
more.to_csv('../datasets/whitelist/whitelist1.csv', index=False, header=False)

# Words that occur at most RARE_WORD_MAX_COUNT times (the rarer remainder)
once = [w for w, count in fdist.items() if count <= RARE_WORD_MAX_COUNT]
once = pd.Series(once)
once.to_csv('../datasets/whitelist/whitelist2.csv', index=False, header=False)

In [None]:
# Create whitelist: merge the two word lists into one duplicate-free file.
#
# The tokens must be read back strictly as text. With pandas' defaults,
# words such as "nan", "null" or "n/a" are interpreted as missing values and
# would be lost or written out as empty lines, so NA detection is switched
# off and the column is forced to str.
read_options = {'names': ['content'], 'dtype': str, 'keep_default_na': False}
white1 = pd.read_csv('../datasets/whitelist/whitelist1.csv', **read_options)
white2 = pd.read_csv('../datasets/whitelist/whitelist2.csv', **read_options)
white = pd.concat([white1, white2], ignore_index=True)

# Deduplicate, then sort so the output file is identical from run to run
# (plain set iteration order is not stable between Python processes).
white = sorted(set(white['content'].tolist()))
white = pd.DataFrame(white)
white.to_csv('../datasets/whitelist/whitelist.csv', index=False, header=False)