In [2]:
import pandas as pd
import numpy as np

# Prepocessing libraries
import emoji
import re
from langdetect import detect
from spellchecker import SpellChecker
import string
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.parsing.preprocessing import preprocess_string
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import itertools

# Visualization libraries
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm

In [3]:
def detect_lang(text):
    """
    Detects the language of a story.

    Parameters
    ----------
    text : str
        The story to be processed.
    Returns
    -------
    lang : str
        The language of the story.
    """
    try:
        return detect(text)
    except:
        return 'unknown'
    
stop_words = set(stopwords.words('english'))
punc = set(string.punctuation)
lemma = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
spellcheck = SpellChecker()

def preprocess(text):
    """
    Preprocesses a story by removing emojis, punctuations, stopwords, spellchecking and lemmatizing the words.

    Parameters
    ----------
    text : str
        The story to be preprocessed.
    Returns
    -------
    processed_text : str
        The preprocessed story.
    """

    # regex to replace all consecutive occurences of punctuations with a single punctuation
    pattern = r'([' + re.escape(''.join(punc)) + r'])\1+'
    text = re.sub(pattern, r'\1', ''.join(text))

    # regex to remove all numbers
    text = re.sub(r'\d+', '', text)

    # tokenize the text
    tokens = tokenizer.tokenize(text.lower())
    # remove stopwords, punctuations, emojis, correct and lemmatize the words
    tokens = [spellcheck.correction(token) for token in tokens]
    tokens = [token for token in tokens if emoji.is_emoji(token) == False]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if token not in punc]
    tokens = [lemma.lemmatize(token) for token in tokens if token]

    processed_text = ' '.join(tokens)
    return processed_text

def preprocess_without_lemmatizing(text):
    """
    Preprocesses a story by removing emojis, punctuations, stopwords, spellchecking the words.

    Parameters
    ----------
    text : str
        The story to be preprocessed.
    Returns
    -------
    processed_text : str
        The preprocessed story.
    """

    # regex to replace all consecutive occurences of punctuations with a single punctuation
    pattern = r'([' + re.escape(''.join(punc)) + r'])\1+'
    text = re.sub(pattern, r'\1', ''.join(text))

    # regex to remove all numbers
    text = re.sub(r'\d+', '', text)

    # tokenize the text
    tokens = tokenizer.tokenize(text.lower())
    # remove stopwords, punctuations, emojis, correct and lemmatize the words
    tokens = [spellcheck.correction(token) for token in tokens]
    tokens = [token for token in tokens if emoji.is_emoji(token) == False]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if token not in punc]
    tokens = [token for token in tokens if token]

    processed_text = ' '.join(tokens)
    return processed_text

In [2]:
# read the data
file = open("data/stories.csv", "r")
stories_array = []

for line in file:
    stories_array.append(line)

file.close()

data = pd.DataFrame(stories_array, columns=['story'])

Demo: `preprocess()` on a story

In [3]:
print(data['story'][3494])
print(preprocess(data['story'][3494]))

I am an investigative journalist and did a research on the Sugar Mummy scam circus in Singapore. They all operate the same way. No one is what they say they are. I contacted 6 of the agents on Locanto and other sites via WhatsApp and they were all scammers. They might change names but one thing is for 100% sure. You will be scammed! Basically they have a pre-paid phone card with a generic profile photo. They all asure you they are not scammers. After giving them you name, age and civil status they will ask for 300-500 SGDs for a fee. They only accept bank transfer. Then when you have payed this they ask for 1400-1900 SGD for further fees and insurance. They promise you a BMW and a monthly salary of at least 10500 SGD and so on. My conclusion is "DON´T PAY ANYTHING" They are all scammers/fraudsters/liers. Don´t fall for any sweet talk or promises, you will be fooled and no sugar mummy is at the end of the rainbow. No matter who they say they are or that they have lots of clients that re

In [4]:
preprocess("I was scammed by a fake website")

'scammed fake website'

#### Data preprocessing

In [4]:
# add language column
data["language"] = data["story"].apply(detect_lang)

# filter out non-english stories
data = data[data["language"] == "en"]

# drop language column
data = data.drop(columns=["language"])

Test processing on dataframe of 5 stories

In [5]:
tqdm.pandas(desc="Preprocessing stories", colour='#ffaaff')
data["story"][0:5].progress_apply(preprocess)

  from .autonotebook import tqdm as notebook_tqdm
Preprocessing stories: 100%|[38;2;255;170;255m██████████[0m| 5/5 [00:01<00:00,  4.06it/s]


0    accepted friend request facebook common friend...
1    whatsapp message good morning baron ya receive...
2    met alan bumble claimed project manager synerg...
3    connected person named ano cab app june normal...
4    person online name june lee initially contacte...
Name: story, dtype: object

Preprocessing the data and saving it

In [6]:
preprocessed_data = data["story"].progress_apply(preprocess)
preprocessed_data.to_csv("data/preprocessed_stories_lemmatized.csv", index=False, header=False)

Preprocessing stories: 100%|[38;2;255;170;255m██████████[0m| 3489/3489 [08:01<00:00,  7.25it/s]  


Preprocessing the data without lemmatizing and saving it

In [7]:
preprocessed_data = data["story"].progress_apply(preprocess_without_lemmatizing)
preprocessed_data.to_csv("data/preprocessed_stories.csv", index=False, header=False)

Preprocessing stories: 100%|[38;2;255;170;255m██████████[0m| 3489/3489 [08:10<00:00,  7.12it/s]  


# Topic Modeling

## LDA


In [5]:


def calculate_perplexities(data, no_topics, min_df):
    """
    DEPRECATED: Not used in the project.

    Calculates the perplexities of LDA models with different number of topics.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    no_topics : int
        The maximum number of topics to be considered.
    min_df : int
        The minimum number of documents a word should appear in to be
        considered as a feature.
    Returns
    -------
    perplexities : list
        The list of perplexities of LDA models with different nu
    """

    progress_bar = tqdm(total=no_topics, desc='Calculating Perplexities', unit='model') # to show progress bar while iterating over the number of topics

    perplexities = []
    vectorized_data = CountVectorizer(min_df=min_df).fit_transform(data)

    for i in range(2, no_topics+1):
        lda = LatentDirichletAllocation(n_components=i, random_state=0)
        lda.fit(vectorized_data)
        perplexities.append(lda.perplexity(vectorized_data))
        progress_bar.update(1)
    
    progress_bar.close()
    return perplexities

# function to display the topics of a topic model
def display_topics(model, feature_names, no_top_words):
    # TODO: implement this function
    pass
            

def get_coherence_score(model, text, dictionary, coherence):
    """
    Calculates the coherence score of a topic model.

    Parameters
    ----------
    model : gensim.models.ldamodel.LdaModel
        The LDA model.
    corpus : list
        The list of preprocesses stories.
    dictionary : gensim.corpora.dictionary.Dictionary
        The dictionary of the corpus.
    Returns
    -------
    coherence_score : float
        The coherence score of the topic model.
    """

    # get the word2vec score of the topic model
    coherence_model_lda = CoherenceModel(model=model, texts=text, dictionary=dictionary, coherence=coherence)
    coherence_score = coherence_model_lda.get_coherence()
    return coherence_score

def get_cross_validation_data(data, n_topics_range, min_dfs, max_dfs, alphas, coherence):
    """
    Performs cross validation to find the best hyperparameters for the topic model.

    Parameters
    ----------
    data : pandas.Series
        The preprocessed stories.
    n_topics_range : list
        The range of number of topics to be considered.
    min_dfs : list
        The range of minimum document frequencies to be considered.
    max_dfs : list
        The range of maximum document frequencies to be considered.
    alphas : list
        The range of alpha values to be considered.
    Returns
    -------
    cross_validation_data : pandas.DataFrame
        The cross validation data.
    """

    cross_validation_data = pd.DataFrame(columns=['no_topics', 'min_df', 'max_df', 'alpha', coherence])
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas)
    progress_bar = tqdm(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics

    # Grid search to find the best hyperparameters
    for no_topics in n_topics_range:
        for min_df in min_dfs:
            for max_df in max_dfs:
                for alpha in alphas:
                    min_df_abs = min_df*data.size

                    tokens_list = data["story"].str.split().to_list()
                    dictionary = Dictionary(tokens_list)
                    dictionary.filter_extremes(no_below=min_df_abs, no_above=max_df)
                    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

                    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_topics, alpha=alpha, random_state=0)
                    coherence_score = get_coherence_score(lda, tokens_list, dictionary, coherence=coherence)

                    cross_validation_data.loc[len(cross_validation_data)] = [no_topics, min_df, max_df, alpha, coherence_score]

                    progress_bar.update(1)
                    
    progress_bar.close()
    return cross_validation_data

def get_cross_validation_data_optimized(data, n_topics_range, min_dfs, max_dfs, alphas, coherence):
    cross_validation_data = pd.DataFrame(columns=['no_topics', 'min_df', 'max_df', 'alpha', coherence])
    total_iterations = len(n_topics_range) * len(min_dfs) * len(max_dfs) * len(alphas)
    progress_bar = tqdm(total=total_iterations, desc='Cross Validation', unit='model')

    parameter_combinations = itertools.product(n_topics_range, min_dfs, max_dfs, alphas)

    for parameters in parameter_combinations:
        no_topics, min_df, max_df, alpha = parameters
        min_df_abs = min_df * data.size

        tokens_list = data["story"].str.split().to_list()
        dictionary = Dictionary(tokens_list)
        dictionary.filter_extremes(no_below=min_df_abs, no_above=max_df)
        corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_topics, alpha=alpha, random_state=0)
        coherence_score = get_coherence_score(lda, tokens_list, dictionary, coherence=coherence)

        cross_validation_data.loc[len(cross_validation_data)] = [no_topics, min_df, max_df, alpha, coherence_score]
        progress_bar.update(1)

    progress_bar.close()
    return cross_validation_data

In [22]:
# Coherence score test

# Load data
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])
data_lemmatized = pd.read_csv("data/preprocessed_stories_lemmatized.csv", header=None, names=["story"])

# tokenize the data for corpus and dictionary
tokens_list = data["story"].str.split().to_list()

# create a dictionary of the data and filter out the extremes
dictionary = Dictionary(tokens_list)
dictionary.filter_extremes(no_below=0.01*data.size, no_above=0.99)

# create a corpus of the data
corpus = [dictionary.doc2bow(text) for text in tokens_list]

# get the coherence score of the topic model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=0, alpha=0.1)
coherence_score = get_coherence_score(lda, tokens_list, dictionary)
print(coherence_score)

0.005576748461179599


In [8]:
# test: get the cross validation data for two sets of hyperparameters
cross_validation_data = get_cross_validation_data(data, [10, 20], [0.01, 0.05], [0.99, 0.98], [0.01, 0.02])
cross_validation_data

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics


Cross Validation:   0%|          | 0/16 [00:00<?, ?model/s]

Unnamed: 0,no_topics,min_df,max_df,alpha,coherence_score
0,10.0,0.01,0.99,0.01,0.006149
1,10.0,0.01,0.99,0.02,0.006149
2,10.0,0.01,0.98,0.01,0.006149
3,10.0,0.01,0.98,0.02,0.006149
4,10.0,0.05,0.99,0.01,0.005869
5,10.0,0.05,0.99,0.02,0.005869
6,10.0,0.05,0.98,0.01,0.005869
7,10.0,0.05,0.98,0.02,0.005869
8,20.0,0.01,0.99,0.01,-0.000907
9,20.0,0.01,0.99,0.02,-0.000381


In [42]:
# Cross validation step for non-lemmatized data
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])

min_dfs = [i*0.01 for i in range(1, 11)]
max_dfs = [i*0.01 for i in range(99, 89, -1)]
alphas = [i*0.05 for i in range(1, 21)]
no_topics_range = [i for i in range(2, 21)]

cross_validation_data = get_cross_validation_data(data, no_topics_range, min_dfs, max_dfs, alphas)

TypeError: get_cross_validation_data() missing 1 required positional argument: 'coherence'

In [28]:
cross_validation_data_lemmatized = get_cross_validation_data(data_lemmatized, no_topics_range, min_dfs, max_dfs, alphas)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(total=total_iterations, desc='Cross Validation', unit='model') # to show progress bar while iterating over the number of topics


Cross Validation:   0%|          | 0/38000 [00:00<?, ?model/s]

In [6]:
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])

min_dfs = [0.01, 0.02]
max_dfs = [0.99, 0.98]
alphas = [0.01, 0.02]
no_topics_range = [10, 20]

cross_validation_data = get_cross_validation_data_optimized(data, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v')

cross_validation_data.to_csv("data/cross_validation_test.csv", index=False)

Cross Validation: 100%|██████████| 16/16 [00:35<00:00,  2.24s/model]


In [43]:
data = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])

min_dfs = [i*0.01 for i in range(1, 11)]
max_dfs = [i*0.01 for i in range(99, 89, -1)]
alphas = [i*0.05 for i in range(1, 21)]
no_topics_range = [i for i in range(2, 21)]

cross_validation_data_cv = get_cross_validation_data(data, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v')
cross_validation_data_cv.to_csv("data/cross_validation_data_cv.csv", index=False)

data_lemmatized = pd.read_csv("data/preprocessed_stories_lemmatized.csv", header=None, names=["story"])

cross_validation_data_lemmatized_cv = get_cross_validation_data(data_lemmatized, no_topics_range, min_dfs, max_dfs, alphas, coherence='c_v')
cross_validation_data_lemmatized_cv.to_csv("data/cross_validation_data_lemmatized_cv.csv", index=False)

Cross Validation:  95%|█████████▌| 36170/38000 [19:58:34<1:18:07,  2.56s/model] 

We need to vectorize the text data to feed it into the LDA model. We use the `CountVectorizer()`. This has a parameter `min_df` which stands for minimum document frequency. For example, if we set it to 0.01, it will ignore words that appear in less than 1% of the documents.

There are also words that appear too often in the documents. `CountVectorizer()` has a parameter `max_df` to ignore the words that appear too freuqently. For example, if we set it to 0.95, it will ignore words that appear in more than 95% of the documents.

We also need to determine the optimal number of topics. For that, we compute the coherence value for different number of topics and choose the one with the highest coherence value.

These three parameters will be calculated and the best value will be chosen using cross validation.

In order to determine the optimal number of topics, we train LDA models with different number of topics and compute their perplexities. We then take the number of topics that gives the lowest perplexity.

For LDA, we set `random_state=100` for reproducibility.

In [9]:
preprocessed_stories_lemmatized = pd.read_csv("data/preprocessed_stories_lemmatized.csv", header=None, names=["story"])
# list of 0.01 to 0.15 with step size of 0.01
min_df_list = [i/100 for i in range(1, 16)]

sets_of_perplexities_lemmatized = [calculate_perplexities(preprocessed_stories_lemmatized["story"], 25, i) for i in min_df_list]

pd.DataFrame(sets_of_perplexities_lemmatized).to_csv("data/perplexities_set_lemmatized.csv", index=False, header=False)

Calculating Perplexities:   0%|          | 0/25 [00:00<?, ?model/s]

Calculating Perplexities:  96%|█████████▌| 24/25 [02:57<00:07,  7.38s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:52<00:07,  7.17s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:46<00:06,  6.95s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:44<00:06,  6.87s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:40<00:06,  6.69s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:33<00:06,  6.39s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:33<00:06,  6.40s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:23<00:05,  5.99s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:22<00:05,  5.93s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:23<00:05,  5.98s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:17<00:05,  5.71s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:11<00:05,  5.49s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:05<00:05,  5.22s/model]

In [10]:
preprocessed_stories = pd.read_csv("data/preprocessed_stories.csv", header=None, names=["story"])

sets_of_perplexities = [calculate_perplexities(preprocessed_stories["story"], 25, i) for i in min_df_list]

pd.DataFrame(sets_of_perplexities).to_csv("data/perplexities_set.csv", index=False, header=False)

Calculating Perplexities:  96%|█████████▌| 24/25 [02:50<00:07,  7.09s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:48<00:07,  7.03s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:42<00:06,  6.78s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:38<00:06,  6.62s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:34<00:06,  6.45s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:30<00:06,  6.25s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:26<00:06,  6.11s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:26<00:06,  6.11s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:17<00:05,  5.73s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:14<00:05,  5.60s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:11<00:05,  5.46s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:05<00:05,  5.21s/model]
Calculating Perplexities:  96%|█████████▌| 24/25 [02:02<00:05,  5.12s/model]

At this point we realize that perplexity is perhaps not the best metric if we are trying to tune `min_df` and `max_df`. These parameters reduce the size of the vocabulary which automatically leads to lower perplexity, meaning that setting `min_df` and `max_df` to very low and high values respectively will give us the lowest perplexity.

Instead, we can use coherence scores. Coherence is a way to measure how interpretable the topics are to humans. There are a couple of ways to compute coherence scores.

The CV Coherence is not recommended by its own author.

UMass Coherence Score calculates how often two words `w1` and `w2` appear in the same document. It then compares this to how often `w1` appears in the corpus.

UCI Coherence Score is based on sliding windows and pointwise mutual information (PMI). It calculates the co-occurrence of words in a window (for example 10 words) and compares it to the probability of the words appearing together in the corpus.

The Word2Vec Coherence Score measures the similarity between words intra-topic and inter-topic. The idea is to maximize the similarity between words in the same topic and minimize the similarity between words in different topics. 