In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk

# Generating embeddings using an LDA model
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.stem.wordnet import WordNetLemmatizer

train_df = pd.read_csv("../data/train.csv")
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/heinrikchoong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/heinrikchoong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Generating embeddings via LDA model

The below few cells attempt to complete the following few steps
1. Removing any non-alphanumeric characters
2. Lemmatize the words
3. Construct a dictionary of words to generate the Bag-of-words structure
4. Train the LDA model
5. Transform LDA fit back to dataframe for ML modelling

In [2]:
url_regex = r"https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\/([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
non_alphanumeric = r"[^a-zA-Z ]"
symbols = string.punctuation
stopword_and_punctuations = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {''}

In [3]:
corpus = train_df['text'].apply(lambda x: re.sub(url_regex, "", x)).apply(lambda x: re.sub(non_alphanumeric, "", x)).str.lower().to_list()
cleaned = []
target = train_df['target']
lemmatizer = WordNetLemmatizer()
for doc in corpus:
    cleaned.append([lemmatizer.lemmatize(token.lower()) for token in doc.split(" ") if token.lower() not in stopword_and_punctuations])

In [4]:
del train_df

In [5]:
word_dict = Dictionary(cleaned)
word_dict.filter_extremes(no_below=10, no_above=0.5)
corpus = [word_dict.doc2bow(doc) for doc in cleaned]
print('Number of unique tokens: %d' % len(word_dict))
print("--------------------")
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1358
--------------------
Number of documents: 7613


In [6]:
num_topics = 10
chunksize = 3500
passes = 5
iterations = 400
eval_every = None

_ = word_dict[0]
id2word = word_dict.id2token

lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [7]:
columns = [f"Topic {i}" for i in range(num_topics)]
values = []

for topic_distr in lda.get_document_topics(corpus):
    distri = []
    i, j = 0, 0
    topic_length = len(topic_distr)
    while i < num_topics:
        if topic_distr[j][0] != i:
            distri.append(0)
        else:
            distri.append(topic_distr[j][1])
            j = min(topic_length - 1, j + 1)
        i += 1
    values.append(distri)

train_df = pd.DataFrame(data=values, columns=columns)
train_df['target'] = target