In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import utils, nlp_utils
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords

from datetime import datetime

nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Topic Modeling - Wall Street Market

## Data Processing

In [2]:
# Read the full CSV, then take an independent copy of the two columns used
# below so later column additions never touch the `wallstreet` frame.
wallstreet = pd.read_csv('data/wallstreet.csv')
wall_nlp = wallstreet.loc[:, ['subforum', 'contentWithHTMLTag']].copy()

In [3]:
# Both derived columns are computed from the same raw HTML column. The
# nlp_utils helpers take a single value, so they are handed to apply() directly.
raw_posts = wall_nlp['contentWithHTMLTag']
wall_nlp['cleaned_content'] = raw_posts.apply(nlp_utils.clean_parse_text)
wall_nlp['lemmatized_tokens'] = raw_posts.apply(nlp_utils.clean_tokenize_lemmatize)

--- 
# Latent Dirichlet Allocation (LDA)

In [12]:
def convert_to_bow_and_fit_lda_model(text_dataset, max_feats, freq_thresh, n_topics, learning_method, max_iter, random_state=3):
    '''
    Vectorize texts as a bag of words, then fit an LDA topic model on the counts.

    Args:
        text_dataset (arr) : An array comprised of numerous texts.
        max_feats (int) : Number of words to limit the bag-of-words to
            (passed to CountVectorizer as `max_features`).
        freq_thresh (float) : Words that have a document frequency greater than
            this amount are ignored (passed to CountVectorizer as `max_df`).
        n_topics (int) : Number of topics (passed to LatentDirichletAllocation
            as `n_components`).
        learning_method (str) : Forwarded unchanged to LatentDirichletAllocation.
        max_iter (int) : Forwarded unchanged to LatentDirichletAllocation.
        random_state (int) : Seed forwarded to LatentDirichletAllocation.
    Return:
        tuple of three items, in this order:
            - the result of the LDA model's `fit_transform` on the bag-of-words matrix,
            - the fitted LatentDirichletAllocation model,
            - the fitted CountVectorizer.
    '''

    # Step 1: texts -> bag-of-words count matrix.
    bow_vectorizer = CountVectorizer(max_df=freq_thresh, max_features=max_feats)
    bow_matrix = bow_vectorizer.fit_transform(text_dataset)

    # Step 2: fit the topic model on the counts (n_jobs is fixed at -1 here).
    topic_model = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method=learning_method,
        max_iter=max_iter,
        random_state=random_state,
        n_jobs=-1,
    )
    topic_weights = topic_model.fit_transform(bow_matrix)

    return topic_weights, topic_model, bow_vectorizer
    
# TODO: add a summarize_lda_topics(lda_model) helper
    

In [13]:
cleaned_content = wall_nlp['cleaned_content'].values

# Call the function defined in this notebook. The previous name,
# `vectorize_data_and_fit_lda_model`, is not defined anywhere in the notebook
# and fails with NameError on a fresh kernel.
doc_topics, lda_model, vect = convert_to_bow_and_fit_lda_model(
    cleaned_content,
    max_feats=10000,
    freq_thresh=0.15,
    n_topics=10,
    learning_method='batch',
    max_iter=25,
    random_state=3,
)

# For every topic (row of components_), column indices ordered from the
# largest weight to the smallest.
sorted_components = np.argsort(lda_model.components_, axis=1)[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older installs.
feat_names = np.array(
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)

In [28]:
def print_topics(model, count_vectorizer, n_top_words):
    '''
    Print the highest-weighted words of every topic in a fitted topic model.

    Args:
        model : Fitted model exposing `components_`; each row is treated as one topic.
        count_vectorizer : Fitted vectorizer that maps column indices to words.
            `get_feature_names_out()` is used when present, otherwise the
            older `get_feature_names()`.
        n_top_words (int) : Number of words to print per topic.
    Return:
        None. For each topic, prints a blank line, "Topic #<index>:", and then
        the words separated by spaces, heaviest first.
    '''
    # scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2;
    # prefer the replacement but keep working on older versions.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so slice backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_words_arr = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("\nTopic #{}:".format(topic_idx))
        print(" ".join(top_words_arr))

In [30]:
# Show the 10 heaviest words of each fitted topic.
print_topics(model=lda_model, count_vectorizer=vect, n_top_words=10)


Topic #0:
contact please allowed need external here am how pm any

Topic #1:
use store key using card infodesk infodesksorcvsgq leaguemodegrams 9a7ae0b905 address

Topic #2:
chat day room bank one cashout fraud wallst3gi4a5wtn4 log change

Topic #3:
bank account credit card need fullz accounts uk drop cards

Topic #4:
he they as like was what some there about know

Topic #5:
cashout bank guides 10 records ssn link extras expectus fraud

Topic #6:
wallst4qihu6lvsa up new good thanks code re boneskoopa code48k see

Topic #7:
we our pgp quality an begin hash weed high signature

Topic #8:
order we use as was now at mirror when they

Topic #9:
id php x7bwsmcore5fmx56 viewtopic ref signup rules phished dreadditevelidot 14060don


In [25]:
# Number of weights stored per topic, i.e. the length of one row of components_.
lda_model.components_.shape[1]

10000