In [1]:
# "Topic Modeling with Scikit Learn"
# https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd

In [2]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx)) 
        print( " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [3]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
#twitterDf = pd.read_csv('C:/Apps/Anaconda3/Python_workGH/Twitter Archive/tweets.csv') # my twitter archive downloaded
#documents = list(twitterDf['text'])

In [4]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
excl_words = set(['amp','https','rt','fom'])
excludeWords =stop.union(excl_words)
excludeChars = set(string.punctuation).union(set(['‘','’'])).difference(set(['@'])) 
lemma = WordNetLemmatizer()
def clean(doc):
    #stop_free = " ".join(word for word in doc.lower().split() if ((word not in excludeWords)&(word[0]!='@')) )
    #punc_free = ''.join(ch for ch in stop_free if ch not in excludeChars)
    #normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    
    punc_free = ''.join(ch for ch in doc.lower() if ch not in excludeChars)
    stop_free = " ".join(word for word in punc_free.split() if (word not in excludeWords)&(word[0]!='@')&('htt' not in word))
    #normalized = " ".join(lemma.lemmatize(word) for word in stop_free.split())
    
    return stop_free # normalized

In [5]:
documents = [clean(doc) for doc in documents]  

In [6]:
print("Number of documents: "+str(len(documents)))
documents[0]

Number of documents: 11314


'well im sure story nad seem biased disagree statement us media ruin israels reputation rediculous us media proisraeli media world lived europe realize incidences one described letter occured us media whole seem try ignore us subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt go away look jews treating races got power unfortunate'

In [7]:
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf.shape

(11314, 1000)

In [8]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()
tf.shape

(11314, 1000)

In [14]:
no_topics = 7
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
dont people think like know im good time right make
Topic 1:
windows file use files program using window card dos problem
Topic 2:
god jesus bible believe christ faith christian christians gods church
Topic 3:
thanks email advance know looking im hi info address information
Topic 4:
drive scsi drives hard disk controller ide floppy mac card
Topic 5:
key chip encryption clipper keys government use algorithm phone chips
Topic 6:
game team games year players season play hockey win teams
Topic 0:
game team 10 year games 25 play 12 15 season
Topic 1:
god people jesus believe true say bible jews evidence israel
Topic 2:
new car years armenian said armenians turkish time people went
Topic 3:
maxaxaxaxaxaxaxaxaxaxaxaxaxaxax key government encryption public use chip security law keys
Topic 4:
file available program information space use data email list db
Topic 5:
dont people think like know im right good say time
Topic 6:
use windows thanks drive like know problem using card im
