## Setup

In [None]:
!pip install spacy &> /dev/null
!python -m spacy download en &> /dev/null
!pip install tmtoolkit &> /dev/null

Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn import metrics
import pandas as pd
import spacy
import numpy as np

import re
from pprint import pprint
from tqdm import tqdm
from IPython.core.display import HTML

Settings

In [None]:
# Seed passed to the dataset shuffle and to the LDA model for reproducible runs.
RANDOM_STATE = 42

# Newsgroups that are loaded when use_sub_categories is enabled.
categories = [
    'alt.atheism',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
]

# True -> load only `categories`; False -> load every newsgroup.
use_sub_categories = True

Load dataset

In [None]:
# Fetch the training split with headers, footers and quoted replies stripped,
# restricted to `categories` when use_sub_categories is set.
newsgroups_train_cleaned = fetch_20newsgroups(
    subset='train',
    categories=categories if use_sub_categories else None,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Raw documents, their integer labels, and the label names.
X, y = newsgroups_train_cleaned.data, newsgroups_train_cleaned.target
target_names = newsgroups_train_cleaned.target_names

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## Prepare data

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline with the parser and NER components disabled;
# preprocess() only reads token.lemma_ and token.is_stop from the resulting docs.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
def preprocess(text):
    """Normalize a raw document into a space-separated string of lemmas.

    The text is lowercased, trimmed, stripped of digit characters and has its
    whitespace runs collapsed. It is then passed through the module-level spaCy
    pipeline ``nlp``; the lemma of every non-stop-word token is kept, provided
    the lemma is longer than one character.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives the filters).
    """
    text = text.lower().strip()
    # Raw string: '\d' inside a regular literal is an invalid escape sequence.
    text = re.sub(r'\d', '', text)  # remove digits
    text = ' '.join(text.split())   # collapse any whitespace run to one space

    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    # Drop single-character leftovers.
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 1)

In [None]:
# Run preprocess() over every raw document, with a progress bar.
X_clean = list(map(preprocess, tqdm(X)))

100%|██████████| 2218/2218 [00:30<00:00, 73.32it/s]


Let's fit the vectorizer on the preprocessed data.

In [None]:
# Bag-of-words counts: ignore terms that occur in more than 95% of the documents or
# in fewer than 2 documents, and cap the vocabulary at the 1000 most frequent terms.
vectorizer = CountVectorizer(max_df=0.95,
                             min_df=2,
                             max_features=1000)

tf = vectorizer.fit_transform(X_clean)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()  # keep a plain list of str
else:
    vocabulary = vectorizer.get_feature_names()

## Scikit-learn topic modeling with LDA

### Find best number of topics

In [None]:
def get_lda_model(n_topics=10):
    """Build an unfitted LDA estimator with ``n_topics`` components.

    Args:
        n_topics: Number of topics to learn; defaults to 10.

    Returns:
        A ``LatentDirichletAllocation`` instance configured for online learning
        and seeded with ``RANDOM_STATE``.
    """
    lda_params = dict(
        n_components=n_topics,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=RANDOM_STATE,
    )
    return LatentDirichletAllocation(**lda_params)

Show top words per topic

In [None]:
def get_top_words_per_topics(model, vocabulary, n_top_words=10):
    """Tabulate the highest-weighted vocabulary terms of each topic.

    Args:
        model: Object exposing ``components_``, iterated as one array of term
            weights per topic.
        vocabulary: Sequence mapping a term index to its string.
        n_top_words: How many terms to list per topic.

    Returns:
        A DataFrame with one row per topic ("Topic i") and one column per rank
        ("Word j"), ordered from the heaviest term to the lightest.
    """
    # argsort is ascending, so reverse it and keep the leading n_top_words indices.
    top_words = [
        [vocabulary[term_idx] for term_idx in topic.argsort()[::-1][:n_top_words]]
        for topic in model.components_
    ]

    row_labels = [f"Topic {k}" for k in range(len(top_words))]
    col_labels = [f"Word {k}" for k in range(len(top_words[0]))]
    return pd.DataFrame(top_words, index=row_labels, columns=col_labels)

In [None]:
# Fit a 3-topic LDA on the term counts (fit() returns the estimator itself),
# then display the top words of each topic.
model = get_lda_model(n_topics=3).fit(tf)
get_top_words_per_topics(model, vocabulary)

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,gun,people,right,law,weapon,firearm,state,file,government,think
Topic 1,god,people,know,think,believe,say,jesus,thing,time,come
Topic 2,space,nasa,launch,system,orbit,satellite,year,earth,program,mission


In [None]:
import pickle

# Persist the fitted LDA model to disk ...
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# ... and read it back in. Only unpickle files you trust: pickle.load can run
# arbitrary code embedded in the file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Persist the vocabulary (term index -> term) next to the model.
with open('vocabulary.pkl', 'wb') as vocabulary_file:
    pickle.dump(vocabulary, vocabulary_file)

In [None]:
# Persist the fitted vectorizer so new text can be transformed the same way later.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Score an unseen sentence: apply the same preprocessing and the fitted vectorizer,
# then ask the LDA model for the document's topic mixture.
text = preprocess('I support theory that some higher intelligence created this world')
term_freq = vectorizer.transform([text])

output = model.transform(term_freq)

# A single document was passed in, so its topic weights are in output[0].
for topic_idx in range(len(model.components_)):
    print(f"Topic #{topic_idx}: {output[0][topic_idx]}")

[0.06473069335308923, 0.5484120850034417, 0.3868572216434691]
Topic #0: 0.06473069335308923
Topic #1: 0.5484120850034417
Topic #2: 0.3868572216434691
