In [2]:
import pandas as pd

data = pd.read_csv('articles1.csv', error_bad_lines=False);
data_text = data[['content']]
data_text = data_text.reset_index()
documents = data_text

In [3]:
len(documents)

50000

In [4]:
documents[:5]

Unnamed: 0,index,content
0,0,WASHINGTON — Congressional Republicans have...
1,1,"After the bullet shells get counted, the blood..."
2,2,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Death may be the great equalizer, but it isn’t..."
4,4,"SEOUL, South Korea — North Korea’s leader, ..."


### Data Preprocessing

In [6]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Lemmatize example

In [8]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [9]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [10]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [11]:
doc_sample = documents[documents['index'] == 2].values[0][1]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['When', 'Walt', 'Disney’s', '“Bambi”', 'opened', 'in', '1942,', 'critics', 'praised', 'its', 'spare,', 'haunting', 'visual', 'style,', 'vastly', 'different', 'from', 'anything', 'Disney', 'had', 'done', 'before.', 'But', 'what', 'they', 'did', 'not', 'know', 'was', 'that', 'the', 'film’s', 'striking', 'appearance', 'had', 'been', 'created', 'by', 'a', 'Chinese', 'immigrant', 'artist,', 'who', 'took', 'as', 'his', 'inspiration', 'the', 'landscape', 'paintings', 'of', 'the', 'Song', 'dynasty.', 'The', 'extent', 'of', 'his', 'contribution', 'to', '“Bambi,”', 'which', 'remains', 'a', '', '', 'mark', 'for', 'film', 'animation,', 'would', 'not', 'be', 'widely', 'known', 'for', 'decades.', 'Like', 'the', 'film’s', 'title', 'character,', 'the', 'artist,', 'Tyrus', 'Wong,', 'weathered', 'irrevocable', 'separation', 'from', 'his', 'mother', '', '—', '', '', 'and,', 'in', 'the', 'hope', 'of', 'making', 'a', 'life', 'in', 'America,', 'incarceration,', 'isolation', 'and', 'rigo

In [13]:
processed_docs = documents['content'].map(preprocess)

In [14]:
processed_docs[:10]

0    [washington, congression, republican, fear, co...
1    [bullet, shell, count, blood, dri, votiv, cand...
2    [walt, disney, bambi, open, critic, prais, spa...
3    [death, great, equal, necessarili, evenhand, f...
4    [seoul, south, korea, north, korea, leader, sa...
5    [london, queen, elizabeth, battl, cold, week, ...
6    [beij, presid, tsai, taiwan, sharpli, critic, ...
7    [danni, cahil, stand, slight, daze, blizzard, ...
8    [hillari, kerr, founder, digit, media, compani...
9    [angel, muñiz, famili, apart, bronx, paint, an...
Name: content, dtype: object

### Bag of words on the dataset

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [16]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 access
1 acknowledg
2 administr
3 advoc
4 afford
5 alli
6 american
7 anger
8 annual
9 anticip
10 appeal


In [17]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [18]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[2]

[(2, 1),
 (4, 1),
 (6, 3),
 (13, 1),
 (15, 1),
 (25, 1),
 (27, 1),
 (28, 1),
 (33, 1),
 (36, 2),
 (47, 1),
 (51, 1),
 (78, 1),
 (82, 1),
 (95, 1),
 (97, 1),
 (101, 1),
 (103, 1),
 (106, 3),
 (107, 1),
 (108, 3),
 (112, 1),
 (118, 1),
 (120, 1),
 (122, 1),
 (125, 1),
 (126, 1),
 (127, 3),
 (130, 1),
 (132, 2),
 (138, 1),
 (139, 1),
 (144, 3),
 (149, 1),
 (154, 2),
 (158, 1),
 (161, 1),
 (164, 1),
 (169, 1),
 (183, 3),
 (184, 2),
 (189, 3),
 (190, 1),
 (204, 4),
 (209, 1),
 (212, 1),
 (213, 7),
 (218, 1),
 (224, 2),
 (226, 1),
 (235, 6),
 (241, 2),
 (243, 1),
 (248, 1),
 (251, 1),
 (263, 1),
 (267, 2),
 (268, 1),
 (269, 2),
 (274, 4),
 (280, 1),
 (281, 1),
 (294, 1),
 (297, 1),
 (317, 2),
 (320, 1),
 (331, 1),
 (337, 1),
 (345, 2),
 (351, 1),
 (352, 10),
 (359, 1),
 (396, 2),
 (402, 4),
 (404, 1),
 (406, 1),
 (415, 2),
 (419, 3),
 (421, 1),
 (427, 5),
 (429, 1),
 (432, 2),
 (434, 3),
 (442, 1),
 (447, 1),
 (448, 6),
 (455, 1),
 (465, 1),
 (474, 1),
 (481, 4),
 (485, 1),
 (490, 1),
 (497,

In [21]:
bow_doc_2 = bow_corpus[2]

for i in range(len(bow_doc_2)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], 
                                                     dictionary[bow_doc_2[i][0]], 
                                                     bow_doc_2[i][1]))

Word 2 ("administr") appears 1 time.
Word 4 ("afford") appears 1 time.
Word 6 ("american") appears 3 time.
Word 13 ("aspect") appears 1 time.
Word 15 ("author") appears 1 time.
Word 25 ("care") appears 1 time.
Word 27 ("case") appears 1 time.
Word 28 ("caus") appears 1 time.
Word 33 ("choos") appears 1 time.
Word 36 ("come") appears 2 time.
Word 47 ("consid") appears 1 time.
Word 51 ("continu") appears 1 time.
Word 78 ("effect") appears 1 time.
Word 82 ("entir") appears 1 time.
Word 95 ("gain") appears 1 time.
Word 97 ("general") appears 1 time.
Word 101 ("hand") appears 1 time.
Word 103 ("happen") appears 1 time.
Word 106 ("hous") appears 3 time.
Word 107 ("huge") appears 1 time.
Word 108 ("illustr") appears 3 time.
Word 112 ("inaugur") appears 1 time.
Word 118 ("intern") appears 1 time.
Word 120 ("involv") appears 1 time.
Word 122 ("john") appears 1 time.
Word 125 ("lack") appears 1 time.
Word 126 ("late") appears 1 time.
Word 127 ("later") appears 3 time.
Word 130 ("lead") appears 1

### TF-IDF

In [22]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [23]:
corpus_tfidf = tfidf[bow_corpus]

In [24]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.026770545983534913),
 (1, 0.029068893675775774),
 (2, 0.23280474767240217),
 (3, 0.0318152884632238),
 (4, 0.03317955987601938),
 (5, 0.028340257009214845),
 (6, 0.011822358013442068),
 (7, 0.03651313378109811),
 (8, 0.03134841141839632),
 (9, 0.08219950978886159),
 (10, 0.11833316065832819),
 (11, 0.1068948542622624),
 (12, 0.029414800654264807),
 (13, 0.03979115382242141),
 (14, 0.034294964747078195),
 (15, 0.03491551839627701),
 (16, 0.028504937535025775),
 (17, 0.04794320784817166),
 (18, 0.0430859100761559),
 (19, 0.03715714840084949),
 (20, 0.052708727883302366),
 (21, 0.05897435106024678),
 (22, 0.19674943192832461),
 (23, 0.03565688419697066),
 (24, 0.03846151024366051),
 (25, 0.16461063075731858),
 (26, 0.06336353656669406),
 (27, 0.06766757634740564),
 (28, 0.04165525415334854),
 (29, 0.028027620454488433),
 (30, 0.02319742916334767),
 (31, 0.03607625005938099),
 (32, 0.04023225268361473),
 (33, 0.05157570903473344),
 (34, 0.044666235535910065),
 (35, 0.043254521458515

### Running LDA using Bag of Words

In [25]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [26]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"health" + 0.004*"work" + 0.004*"report" + 0.004*"drug" + 0.004*"tell" + 0.004*"state" + 0.004*"know" + 0.003*"studi" + 0.003*"care" + 0.003*"research"
Topic: 1 
Words: 0.027*"trump" + 0.013*"clinton" + 0.011*"presid" + 0.008*"campaign" + 0.007*"democrat" + 0.006*"donald" + 0.006*"report" + 0.006*"elect" + 0.005*"state" + 0.005*"hous"
Topic: 2 
Words: 0.010*"state" + 0.007*"presid" + 0.006*"countri" + 0.006*"unit" + 0.006*"attack" + 0.005*"isi" + 0.005*"govern" + 0.005*"obama" + 0.005*"immigr" + 0.005*"group"
Topic: 3 
Words: 0.016*"trump" + 0.009*"republican" + 0.008*"presid" + 0.007*"senat" + 0.005*"compani" + 0.005*"vote" + 0.005*"state" + 0.005*"democrat" + 0.005*"tesla" + 0.004*"hous"
Topic: 4 
Words: 0.006*"compani" + 0.006*"china" + 0.005*"appl" + 0.005*"state" + 0.004*"trump" + 0.004*"come" + 0.003*"work" + 0.003*"countri" + 0.003*"go" + 0.003*"tell"
Topic: 5 
Words: 0.015*"polic" + 0.010*"offic" + 0.008*"report" + 0.006*"state" + 0.006*"shoot" + 0.005*"t

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [28]:
lda_model_tfidf = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=5)

In [29]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"israel" + 0.004*"palestinian" + 0.004*"isra" + 0.003*"netanyahu" + 0.003*"trump" + 0.003*"jerusalem" + 0.002*"compani" + 0.002*"musk" + 0.002*"settlement" + 0.002*"resolut"
Topic: 1 Word: 0.001*"market" + 0.001*"compani" + 0.001*"water" + 0.001*"china" + 0.001*"world" + 0.001*"health" + 0.001*"polic" + 0.001*"food" + 0.001*"citi" + 0.001*"work"
Topic: 2 Word: 0.005*"turkey" + 0.005*"iran" + 0.004*"turkish" + 0.004*"erdogan" + 0.003*"iranian" + 0.002*"syria" + 0.002*"coup" + 0.002*"syrian" + 0.002*"trump" + 0.002*"istanbul"
Topic: 3 Word: 0.005*"trump" + 0.004*"clinton" + 0.003*"poll" + 0.002*"vote" + 0.002*"voter" + 0.002*"percent" + 0.002*"republican" + 0.002*"black" + 0.002*"hillari" + 0.002*"democrat"
Topic: 4 Word: 0.002*"clinton" + 0.002*"trump" + 0.002*"insur" + 0.002*"obamacar" + 0.001*"email" + 0.001*"film" + 0.001*"court" + 0.001*"health" + 0.001*"depart" + 0.001*"attorney"
Topic: 5 Word: 0.009*"trump" + 0.005*"clinton" + 0.003*"republican" + 0.003*"campa

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [30]:
processed_docs[2]

['walt',
 'disney',
 'bambi',
 'open',
 'critic',
 'prais',
 'spare',
 'haunt',
 'visual',
 'style',
 'vast',
 'differ',
 'disney',
 'know',
 'film',
 'strike',
 'appear',
 'creat',
 'chines',
 'immigr',
 'artist',
 'take',
 'inspir',
 'landscap',
 'paint',
 'song',
 'dynasti',
 'extent',
 'contribut',
 'bambi',
 'remain',
 'mark',
 'film',
 'anim',
 'wide',
 'know',
 'decad',
 'like',
 'film',
 'titl',
 'charact',
 'artist',
 'tyrus',
 'wong',
 'weather',
 'irrevoc',
 'separ',
 'mother',
 'hope',
 'make',
 'life',
 'america',
 'incarcer',
 'isol',
 'rigor',
 'interrog',
 'child',
 'year',
 'follow',
 'endur',
 'poverti',
 'discrimin',
 'chronic',
 'lack',
 'recognit',
 'work',
 'disney',
 'fine',
 'find',
 'acclaim',
 'wong',
 'die',
 'friday',
 'hollywood',
 'studio',
 'artist',
 'painter',
 'printmak',
 'calligraph',
 'illustr',
 'later',
 'year',
 'maker',
 'fantast',
 'kit',
 'celebr',
 'artist',
 'centuri',
 'margin',
 'long',
 'subject',
 'pass',
 'career',
 'unknown',
 'general

In [31]:
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7446389198303223	 
Topic: 0.005*"go" + 0.005*"know" + 0.005*"work" + 0.004*"think" + 0.004*"come" + 0.004*"tell" + 0.004*"game" + 0.003*"want" + 0.003*"play" + 0.003*"world"

Score: 0.16126777231693268	 
Topic: 0.006*"compani" + 0.006*"china" + 0.005*"appl" + 0.005*"state" + 0.004*"trump" + 0.004*"come" + 0.003*"work" + 0.003*"countri" + 0.003*"go" + 0.003*"tell"

Score: 0.06846994161605835	 
Topic: 0.007*"student" + 0.006*"school" + 0.005*"attack" + 0.005*"north" + 0.004*"countri" + 0.004*"state" + 0.004*"nation" + 0.004*"korea" + 0.003*"tell" + 0.003*"polic"

Score: 0.024962862953543663	 
Topic: 0.010*"state" + 0.007*"presid" + 0.006*"countri" + 0.006*"unit" + 0.006*"attack" + 0.005*"isi" + 0.005*"govern" + 0.005*"obama" + 0.005*"immigr" + 0.005*"group"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [33]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.608140230178833	 
Topic: 0.001*"market" + 0.001*"compani" + 0.001*"water" + 0.001*"china" + 0.001*"world" + 0.001*"health" + 0.001*"polic" + 0.001*"food" + 0.001*"citi" + 0.001*"work"

Score: 0.37228599190711975	 
Topic: 0.002*"clinton" + 0.002*"trump" + 0.002*"insur" + 0.002*"obamacar" + 0.001*"email" + 0.001*"film" + 0.001*"court" + 0.001*"health" + 0.001*"depart" + 0.001*"attorney"

Score: 0.017662575468420982	 
Topic: 0.005*"trump" + 0.004*"clinton" + 0.003*"poll" + 0.002*"vote" + 0.002*"voter" + 0.002*"percent" + 0.002*"republican" + 0.002*"black" + 0.002*"hillari" + 0.002*"democrat"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [34]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.7234691977500916	 Topic: 0.014*"state" + 0.012*"court" + 0.005*"trump" + 0.005*"justic" + 0.005*"govern"
Score: 0.14316491782665253	 Topic: 0.006*"compani" + 0.006*"china" + 0.005*"appl" + 0.005*"state" + 0.004*"trump"
Score: 0.01667333021759987	 Topic: 0.016*"trump" + 0.009*"republican" + 0.008*"presid" + 0.007*"senat" + 0.005*"compani"
Score: 0.016672896221280098	 Topic: 0.010*"state" + 0.007*"presid" + 0.006*"countri" + 0.006*"unit" + 0.006*"attack"
Score: 0.0166714359074831	 Topic: 0.032*"trump" + 0.009*"clinton" + 0.006*"republican" + 0.005*"state" + 0.005*"donald"
Score: 0.016671301797032356	 Topic: 0.007*"student" + 0.006*"school" + 0.005*"attack" + 0.005*"north" + 0.004*"countri"
Score: 0.016670258715748787	 Topic: 0.007*"health" + 0.004*"work" + 0.004*"report" + 0.004*"drug" + 0.004*"tell"
Score: 0.016669033095240593	 Topic: 0.005*"go" + 0.005*"know" + 0.005*"work" + 0.004*"think" + 0.004*"come"
Score: 0.0166690181940794	 Topic: 0.015*"polic" + 0.010*"offic" + 0.008*"