# ISA 414 - Managing Big Data
## Lecture 16 – Text Mining (Part III)

#### Slide 12

In [1]:
import pandas

# Load the ABC News headlines and keep only the first 10,000 of them as our corpus
raw_data = pandas.read_csv("abcnews-date-text.csv")
corpus = raw_data["headline_text"][0:10000]

#### Slide 13

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words transformation that drops common English stop words
count_vect = CountVectorizer(stop_words="english")

# Learn the vocabulary from the corpus and build the document-term matrix (DTM)
# in a single step
dtm = count_vect.fit_transform(corpus)

# The DTM is sparse: every printed entry reads "(document index, word index)  count".
# For this corpus it has 10,000 rows (documents) and 9,820 columns (words).
print(dtm)

  (0, 196)	1
  (0, 2456)	1
  (0, 1975)	1
  (0, 1310)	1
  (0, 5110)	1
  (1, 267)	1
  (1, 9681)	1
  (1, 801)	1
  (1, 2476)	1
  (2, 1459)	1
  (2, 4525)	1
  (2, 6786)	1
  (2, 8533)	1
  (3, 390)	1
  (3, 5970)	1
  (3, 8292)	1
  (3, 764)	1
  (3, 8441)	1
  (3, 6309)	1
  (3, 7404)	1
  (4, 390)	1
  (4, 5970)	1
  (4, 8441)	1
  (4, 334)	1
  (4, 768)	1
  :	:
  (9995, 389)	1
  (9995, 7085)	1
  (9996, 6460)	1
  (9996, 9293)	1
  (9996, 3886)	1
  (9996, 5130)	1
  (9996, 3928)	1
  (9996, 9369)	1
  (9996, 5366)	1
  (9997, 6058)	1
  (9997, 9369)	1
  (9997, 5045)	1
  (9997, 5823)	1
  (9997, 1882)	1
  (9997, 2818)	1
  (9998, 8760)	1
  (9998, 9021)	1
  (9998, 8595)	1
  (9998, 9527)	1
  (9998, 8206)	1
  (9999, 9525)	1
  (9999, 9700)	1
  (9999, 8467)	1
  (9999, 5728)	1
  (9999, 3737)	1


#### Slide 14

In [3]:
from sklearn.decomposition import LatentDirichletAllocation

# Configure LDA to look for 5 topics. Fixing random_state pins the random seed,
# so everyone running this notebook gets similar results.
LDA = LatentDirichletAllocation(
    n_components=5,
    random_state=40,
)

# Fit LDA to our DTM (this operation might take a while)
LDA.fit(dtm)

LatentDirichletAllocation(n_components=5, random_state=40)

#### Slide 15

In [4]:
# Each row of LDA.components_ describes one topic, with one weight per word in
# the vocabulary. Unpack the five rows into one variable per topic.
(
    first_topic,
    second_topic,
    third_topic,
    fourth_topic,
    fifth_topic,
) = (LDA.components_[topic_idx] for topic_idx in range(5))

print(first_topic)

[0.20044801 6.60006742 0.20357285 ... 0.20000656 0.20000597 1.16492059]


#### Slide 16

In [5]:
# argsort() returns the indices that would sort the weights in ascending order
# (indices, not values), so the last 10 indices point at the 10 heaviest words.
(
    top10_index_1st_topic,
    top10_index_2nd_topic,
    top10_index_3rd_topic,
    top10_index_4th_topic,
    top10_index_5th_topic,
) = (
    topic_weights.argsort()[-10:]
    for topic_weights in (
        first_topic,
        second_topic,
        third_topic,
        fourth_topic,
        fifth_topic,
    )
)

print(top10_index_1st_topic)

[4881 2242 3600 1806  848 2195 6460  540 5879 9483]


#### Slide 17

In [6]:
# Vocabulary learned by the vectorizer: position i is the word behind column i of the DTM.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() and only fall back to the old method on older releases.
if hasattr(count_vect, "get_feature_names_out"):
    # .tolist() converts the returned NumPy array into plain Python strings,
    # which is what get_feature_names() used to return
    all_words = count_vect.get_feature_names_out().tolist()
else:
    all_words = count_vect.get_feature_names()

# Translate the top-10 word indices of every topic into the actual words
top10_words_1st_topic = [all_words[i] for i in top10_index_1st_topic]
top10_words_2nd_topic = [all_words[i] for i in top10_index_2nd_topic]
top10_words_3rd_topic = [all_words[i] for i in top10_index_3rd_topic]
top10_words_4th_topic = [all_words[i] for i in top10_index_4th_topic]
top10_words_5th_topic = [all_words[i] for i in top10_index_5th_topic]



#### Slide 18: note that each list is sorted by increasing weight, so the last element in each list is the most popular word in the topic

In [7]:
# Show the top-10 words of every topic: the heading first, the word list on the next line
print("First topic: ", top10_words_1st_topic, sep="\n")

print("Second topic: ", top10_words_2nd_topic, sep="\n")

print("Third topic: ", top10_words_3rd_topic, sep="\n")

print("Fourth topic: ", top10_words_4th_topic, sep="\n")

print("Fifth topic: ", top10_words_5th_topic, sep="\n")

First topic: 
['killed', 'crash', 'forces', 'claims', 'baghdad', 'council', 'plan', 'anti', 'new', 'war']
Second topic: 
['anti', 'south', 'greens', 'denies', 'iraqi', 'world', 'cup', 'mp', 'iraq', 'war']
Third topic: 
['plan', 'death', 'work', 'vic', 'water', 'korea', 'north', 'says', 'govt', 'iraq']
Fourth topic: 
['qld', 'govt', 'nsw', 'probe', 'murder', 'charged', 'iraq', 'court', 'police', 'man']
Fifth topic: 
['new', 'dead', 'oil', 'boost', 'council', 'iraqi', 'sars', 'security', 'iraq', 'report']


#### Slide 18: before running the next cell, make sure you install pyLDAvis by running the following command in the terminal: *pip install pyldavis*

In [9]:
import pyLDAvis

# pyLDAvis 3.4.0 renamed its scikit-learn adapter from pyLDAvis.sklearn to
# pyLDAvis.lda_model. Try the new name first and fall back for older installs,
# so the cell works with either version.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Build the interactive topic view from the fitted model, the DTM and the
# vectorizer; R = 10 limits the number of terms shown per topic.
# This must stay the last expression of the cell so the notebook displays it.
pyldavis_sklearn.prepare(LDA, dtm, count_vect, R=10)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


#### Slide 26

In [10]:
# Topic values of every document in the DTM: one row per document, one column per topic
topic_values = LDA.transform(X=dtm)
print(topic_values)

[[0.03351358 0.0351082  0.52972728 0.03365488 0.36799605]
 [0.04024302 0.04025797 0.04002988 0.4481584  0.43131073]
 [0.04038569 0.04120298 0.04070665 0.0400949  0.83760979]
 ...
 [0.02928706 0.0290085  0.88376206 0.02898407 0.02895831]
 [0.03403925 0.03354175 0.8653983  0.03357617 0.03344453]
 [0.03354834 0.86412169 0.0348293  0.03396242 0.03353826]]


#### Slide 27

In [11]:
# For every observation (row), take the column index of its largest topic value:
# axis=1 makes argmax scan across the columns of each row.
# The resulting indices are then converted to strings.
max_topic_values = topic_values.argmax(axis=1).astype("str")

#### Slide 28: recall that the first index in Python is 0

In [12]:
# Replace each topic index (stored as a string) with a human-readable label
topic_labels = {
    "0": "Rally",          # first topic of LDA
    "1": "?",              # second topic of LDA
    "2": "International",  # third topic of LDA
    "3": "Trial",          # fourth topic of LDA
    "4": "Security",       # fifth topic of LDA
}
for topic_idx, topic_label in topic_labels.items():
    max_topic_values[max_topic_values == topic_idx] = topic_label

#### Slide 29

In [13]:
# Pair the first 10,000 texts (i.e., 10,000 values from the second column of
# raw_data) with the topic label assigned to each of them.
# assign() adds the columns in the order given: first "Text", then "Topic".
data_w_topic = pandas.DataFrame().assign(
    Text=raw_data.iloc[0:10000, 1],
    Topic=max_topic_values,
)
data_w_topic

Unnamed: 0,Text,Topic
0,aba decides against community broadcasting lic...,International
1,act fire witnesses must be aware of defamation,Trial
2,a g calls for infrastructure protection summit,Security
3,air nz staff in aust strike for pay rise,Security
4,air nz strike to affect australian travellers,Security
...,...,...
9995,vic govt plan aims to reduce water use,International
9996,vic govt urged to green light marina plan,International
9997,vic nats leave door open on coalition,International
9998,waterfall survivors tell of train speeding before,International


#### Slide 30

In [None]:
# Build a DTM for the last two rows of raw_data, reusing the vocabulary that
# count_vect has already learned.
# NOTE: this cell rebinds dtm, topic_values and max_topic_values, so from here on
# they describe only these two documents.
dtm = count_vect.transform(raw_data["headline_text"][-2:])

# Apply the fitted LDA model to the new DTM
topic_values = LDA.transform(dtm)

# Column index of the largest topic value per observation (axis=1 scans across
# the columns of each row), converted to strings
max_topic_values = topic_values.argmax(axis=1).astype("str")

# Rename the topics
for topic_idx, topic_label in (
    ("0", "Rally"),          # first topic of LDA
    ("1", "?"),              # second topic of LDA
    ("2", "International"),  # third topic of LDA
    ("3", "Trial"),          # fourth topic of LDA
    ("4", "Security"),       # fifth topic of LDA
):
    max_topic_values[max_topic_values == topic_idx] = topic_label

# Print the topics assigned to the two documents
print(max_topic_values)
