In [1]:
import pandas as pd

# Preprocessing libraries
import emoji
import re
from langdetect import detect
from spellchecker import SpellChecker
import string
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

def detect_lang(text):
    """
    Detects the language of a story.

    Parameters
    ----------
    text : str
        The story to be processed.
    Returns
    -------
    lang : str
        The language code returned by ``detect`` (the rest of the notebook
        compares it against ``'en'``), or ``'unknown'`` if ``detect`` raises
        an error for this text.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort: any error raised by detect() maps to 'unknown' so that
        # one bad row does not abort a whole DataFrame.apply().
        # Exception (not a bare except) is caught on purpose so that
        # KeyboardInterrupt / SystemExit still propagate and a long run can be
        # interrupted.
        return 'unknown'
    
# Shared resources for preprocess(), built once at import time rather than
# on every call.
tokenizer = WordPunctTokenizer()
spellcheck = SpellChecker()
lemma = WordNetLemmatizer()

# Lookup sets used for token filtering (sets give constant-time membership tests).
punc = set(string.punctuation)
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """
    Preprocesses a story by removing emojis, punctuation and stopwords, spellchecking and lemmatizing the words.

    A word for which the spellchecker offers no correction is kept as written
    rather than being dropped from the story.

    Parameters
    ----------
    text : str
        The story to be preprocessed.
    Returns
    -------
    processed_text : str
        The preprocessed story: the surviving lowercased tokens joined by single spaces.
    """

    # regex to replace all consecutive occurrences of the same punctuation character with a single one
    pattern = r'([' + re.escape(''.join(punc)) + r'])\1+'
    # ''.join(text) is a no-op for a str and also accepts an iterable of strings
    text = re.sub(pattern, r'\1', ''.join(text))

    processed_tokens = []
    for token in tokenizer.tokenize(text.lower()):
        # Discard emojis, punctuation-only tokens and stopwords *before*
        # spellchecking. Punctuation-only covers mixed runs such as '?!',
        # which the collapse above leaves intact and which are not members of
        # `punc`. Filtering first also means the spellchecker never gets the
        # chance to "correct" these into words.
        if emoji.is_emoji(token) or token in stop_words or all(char in punc for char in token):
            continue

        # correction() can return None when the spellchecker has no candidate
        # (see the pyspellchecker docs); keep the original token in that case
        # instead of losing the word.
        corrected = spellcheck.correction(token) or token

        # a correction may itself turn out to be a stopword
        if corrected in stop_words:
            continue

        processed_tokens.append(lemma.lemmatize(corrected))

    processed_text = ' '.join(processed_tokens)
    return processed_text

In [2]:
# read the data: every line of the file becomes one story
# NOTE(review): no encoding is passed, so the platform default is used — confirm the file's encoding
with open("data/stories.csv", "r") as file:
    # the context manager closes the file even if reading fails part-way;
    # readlines() keeps each line's trailing newline, same as iterating the file
    stories_array = file.readlines()

# single-column frame; the row label is the line's position in the file
data = pd.DataFrame(stories_array, columns=['story'])

Demo: `preprocess()` on a story

In [3]:
# Show one raw story followed by its preprocessed form
demo_story = data['story'][3494]
print(demo_story)
print(preprocess(demo_story))

I am an investigative journalist and did a research on the Sugar Mummy scam circus in Singapore. They all operate the same way. No one is what they say they are. I contacted 6 of the agents on Locanto and other sites via WhatsApp and they were all scammers. They might change names but one thing is for 100% sure. You will be scammed! Basically they have a pre-paid phone card with a generic profile photo. They all asure you they are not scammers. After giving them you name, age and civil status they will ask for 300-500 SGDs for a fee. They only accept bank transfer. Then when you have payed this they ask for 1400-1900 SGD for further fees and insurance. They promise you a BMW and a monthly salary of at least 10500 SGD and so on. My conclusion is "DON´T PAY ANYTHING" They are all scammers/fraudsters/liers. Don´t fall for any sweet talk or promises, you will be fooled and no sugar mummy is at the end of the rainbow. No matter who they say they are or that they have lots of clients that re

investigative journalist research sugar mummy scam circus singapore operate way one say contacted 6 agent locate site via whatsapp scammer might change name one thing 100 sure scammed basically pre paid phone card generic profile photo sure scammer giving name age civil status ask 300 500 sod fee accept bank transfer played ask 1400 1900 sad fee insurance promise bow monthly salary least 10500 sad conclusion pay anything scammer fraudster lie fall sweet talk promise fooled sugar mummy end rainbow matter say lot client recommend nothing say true old saying go tell scammer lying lip move investigation complete willing hand spy handling aware super mummy agent scammer


#### Data preprocessing

In [4]:
# keep only the stories detected as English; the detected language is used
# purely as a boolean row mask, so no helper column has to be added and dropped
data = data[data["story"].apply(detect_lang) == "en"].copy()

Test preprocessing on the first 5 stories

In [6]:
from tqdm.auto import tqdm # to show progress bar while iterating over the stories

# register tqdm's `progress_apply` on pandas objects
tqdm.pandas(desc="Preprocessing stories", colour='#ffaaff')

# smoke test: preprocess only the first five stories and write them out
(
    data["story"]
    .head(5)
    .progress_apply(preprocess)
    .to_csv("data/test.csv", index=False, header=False)
)

Preprocessing stories:   0%|          | 0/5 [00:00<?, ?it/s]

Preprocessing the data and saving it

In [7]:
# full run: preprocess every story left in `data`, with a progress bar
preprocessed_data = data["story"].progress_apply(preprocess)

# persist one preprocessed story per row, without index or header
preprocessed_data.to_csv(
    "data/processed_stories.csv",
    index=False,
    header=False,
)

Preprocessing stories:   0%|          | 0/3489 [00:00<?, ?it/s]

# Topic Modeling

## LDA


In [8]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

data = pd.read_csv