In [1]:
import json
import pandas as pd
from collections import Counter, OrderedDict

In [2]:
# Load the first topic list (the "Discussion" source, going by the label used
# when the frequencies are aggregated further down). Despite the .csv
# extension the file content is parsed as JSON. Each record is indexed as
# record[0][0] / record[0][1] (the two words of the topic) and record[1]
# (the topic's frequency).
# The encoding is pinned so decoding does not depend on the platform default.
with open('../../Reloaded/Data/topics_d.csv', 'r', encoding='utf-8') as f:
    d1k = json.load(f)
d1k_topics = [record[0][0] + ' ' + record[0][1] for record in d1k]
d1k_frequency = [record[1] for record in d1k]
# Topic label -> frequency; if a label repeats, its last frequency wins.
d1k_map = dict(zip(d1k_topics, d1k_frequency))

In [3]:
# Load the second topic list (the "Blog" source, going by the label used when
# the frequencies are aggregated further down). Despite the .csv extension the
# file content is parsed as JSON. Each record is indexed as record[0][0] /
# record[0][1] (the two words of the topic) and record[1] (the topic's
# frequency).
# The encoding is pinned so decoding does not depend on the platform default.
with open('../../Reloaded/Data/topics_b.csv', 'r', encoding='utf-8') as f:
    b1k = json.load(f)
b1k_topics = [record[0][0] + ' ' + record[0][1] for record in b1k]
b1k_frequency = [record[1] for record in b1k]
# Topic label -> frequency; if a label repeats, its last frequency wins.
b1k_map = dict(zip(b1k_topics, b1k_frequency))

In [4]:
# Load the third topic list (the "News" source, going by the label used when
# the frequencies are aggregated further down). Despite the .csv extension the
# file content is parsed as JSON. Each record is indexed as record[0][0] /
# record[0][1] (the two words of the topic) and record[1] (the topic's
# frequency).
# The encoding is pinned so decoding does not depend on the platform default.
with open('../../Reloaded/Data/topics_n.csv', 'r', encoding='utf-8') as f:
    n1k = json.load(f)
n1k_topics = [record[0][0] + ' ' + record[0][1] for record in n1k]
n1k_frequency = [record[1] for record in n1k]
# Topic label -> frequency; if a label repeats, its last frequency wins.
n1k_map = dict(zip(n1k_topics, n1k_frequency))

In [5]:
# Pool the topic labels of the three sources (duplicates kept) and derive the
# set of distinct labels. The cell displays (pooled count, distinct count).
all_topics = [*d1k_topics, *b1k_topics, *n1k_topics]
topic_set = set(all_topics)
(len(all_topics), len(topic_set))

(3000, 2035)

## Media Collection

In [6]:
# Count, for every topic label, how many of the three sources list it.
# The count is taken over the keys of the per-source maps rather than over
# `all_topics`, for two reasons:
#   * `all_topics` is rebound to a DataFrame by a later cell, so re-running
#     this cell afterwards would silently count column names instead;
#   * map keys are unique per source, so a label that repeats inside a single
#     source cannot inflate its count.
# Row order follows first appearance: first source, then second, then third.
media_counts = Counter([*d1k_map, *b1k_map, *n1k_map])
df_media = pd.DataFrame(list(media_counts.items()), columns=['topic', 'media_count'])
df_media

Unnamed: 0,topic,media_count
0,anonymous coward,1
1,united state,3
2,confirmed case,3
3,dont know,3
4,public health,3
...,...,...
2030,come together,1
2031,jump case,1
2032,make change,1
2033,military base,1


## Frequency Collection

In [7]:
# Give every distinct topic label a running total that starts at zero.
frequency_collection = dict.fromkeys(topic_set, 0)

In [8]:
# Sum each topic's frequency over the three sources.
# The totals are reset to zero at the top of this cell so that re-running it
# cannot add the same frequencies a second time.
frequency_collection = dict.fromkeys(topic_set, 0)
# Sources in order: Discussion, Blog, News.
for source_map in (d1k_map, b1k_map, n1k_map):
    for topic, frequency in source_map.items():
        frequency_collection[topic] += frequency

In [9]:
# Rebuild the mapping with the largest totals first (dicts keep insertion order).
frequency_collection = {
    label: total
    for label, total in sorted(
        frequency_collection.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
}

In [15]:
# Two-column table of (topic, summed frequency), in the mapping's current
# order. Building it straight from the (key, value) pairs names the columns
# up front and also yields a valid empty frame when the mapping is empty.
df_frequency = pd.DataFrame(
    list(frequency_collection.items()),
    columns=['topic', 'topic_frequency'],
)
df_frequency.head(10)


Unnamed: 0,topic,topic_frequency
0,test positive,45586
1,confirmed case,41265
2,public health,36962
3,social distancing,36528
4,stay home,35789
5,anonymous coward,33564
6,united state,32177
7,cruise ship,31352
8,death toll,30827
9,south korea,29485


In [17]:
# Join the per-topic source count with the per-topic summed frequency on the
# shared 'topic' column (default inner join).
# NOTE(review): this rebinds `all_topics`; from here on it is a DataFrame and
# no longer the pooled list of labels built earlier in the notebook.
all_topics = pd.merge(df_media, df_frequency, on='topic')
all_topics.head(10)

Unnamed: 0,topic,media_count,topic_frequency
0,anonymous coward,1,33564
1,united state,3,32177
2,confirmed case,3,41265
3,dont know,3,29353
4,public health,3,36962
5,stay home,3,35789
6,last week,3,22400
7,tested positive,3,28351
8,many people,1,20634
9,social distancing,3,36528


In [18]:
# Keep the rows whose media_count is 3 (with three sources pooled, presumably
# the topics present in every source) and rank them by summed frequency,
# highest first.
common_topics = (
    all_topics
    .loc[all_topics['media_count'].eq(3)]
    .sort_values(by='topic_frequency', ascending=False)
)
common_topics.head(20)

Unnamed: 0,topic,media_count,topic_frequency
118,test positive,3,45586
2,confirmed case,3,41265
4,public health,3,36962
9,social distancing,3,36528
5,stay home,3,35789
1,united state,3,32177
46,cruise ship,3,31352
56,death toll,3,30827
10,south korea,3,29485
3,dont know,3,29353


In [19]:
# Keep the rows whose media_count is 1 (presumably topics seen in a single
# source only) and rank them by summed frequency, highest first.
different_topics = (
    all_topics
    .loc[all_topics['media_count'].eq(1)]
    .sort_values(by='topic_frequency', ascending=False)
)
different_topics.head(20)

Unnamed: 0,topic,media_count,topic_frequency
0,anonymous coward,1,33564
8,many people,1,20634
12,originally posted,1,18093
13,expected th,1,17999
16,quote quote,1,16673
19,dont think,1,15721
20,report copyright,1,15387
21,copyright violation,1,15372
30,quoting anonymous,1,13840
32,im sure,1,13636


In [14]:
# Print every single-source topic with its summed frequency, one per line,
# in the ranked order of `different_topics`.
for label, total in different_topics[['topic', 'topic_frequency']].itertuples(index=False, name=None):
    print(label, total)

anonymous coward 33564
many people 20634
originally posted 18093
expected th 17999
quote quote 16673
dont think 15721
report copyright 15387
copyright violation 15372
quoting anonymous 13840
im sure 13636
disease control 11424
week ago 11349
even though 11207
make sure 10551
lot people 10301
center disease 9530
better expected 9523
march pm 8815
worse expected 8742
would like 8588
fatality rate 8453
people died 8439
name jesus 8131
im going 8056
people get 7780
day ago 7765
control prevention 7750
get sick 7598
number people 7437
infected people 7266
pretty much 7000
people dont 6844
hour ago 6797
make sense 6775
dont get 6726
close contact 6724
one thing 6548
common cold 6483
last night 6439
long time 6331
dont need 6322
sound like 6260
every year 6255
said would 6215
something like 6186
seasonal flu 6147
like share 6083
month ago 6013
today pm 5818
trump said 5676
photo afp 5666
seems like 5662
official said 5495
long term 5489
thing like 5452
said statement 5440
last day 5308
intens

issue stayathome 586
country music 583
stem spread 581
official announce 581
se asia 580
growth forecast 578
story strait 578
understanding anxiety 577
stimulus deal 577
ibtimes india 577
store due 577
pga tour 576
case surpass 575
put hold 575
market fall 575
job loss 575
ship quarantined 575
online amid 574
close bar 574
output cut 573
jerusalem post 573
aid bill 573
possible exposure 571
transcript motley 570
fed slash 570
palm spring 570
newspaper dawncom 570
announces case 569
cancelled amid 568
bring back 566
drivethrough testing 565
fool australia 565
united way 563
europe travel 562
europe day 562
sensex nifty 562
confirms st 562
hot spot 562
case bringing 560
suspend operation 554
nation china 554
city hall 554
flight attendant 553
official warn 553
fear grip 553
show cancelled 553
west virginia 550
tax relief 550
trade show 549
united airline 548
oil crash 547
top global 546
domestic violence 546
indian express 546
trump travel 546
health chief 546
gdp growth 545
mask amid 54