In [44]:
#https://github.com/gregversteeg/corex_topic

In [45]:
import nltk
import pandas as pd

In [46]:
from corextopic import corextopic as ct

# CorEx on Positive Articles

In [47]:
import pandas as pd

wri = pd.read_csv("wri_coref_110319.csv",index_col=0)

wri = wri[wri['class']=='positive']
wri.reset_index(drop=True, inplace=True)

print(wri.head())
print(wri.info())

      class                                               text  \
0  positive  Andhra Pradesh Chief Minister N Chandrababu Na...   
1  positive  This story is from January 16, 2018\r\r\n\r\r\...   
2  positive  A crowd angered over what they believed was th...   
3  positive  After having spent over 12 hours dousing flame...   
4  positive  The impact of drought on women farmers remains...   

                                          text_coref  \
0  Andhra Pradesh Chief Minister N Chandrababu Na...   
1  This story is from January 16, 2018\r\nBHOPAL:...   
2  A crowd angered over what A crowd believed was...   
3  After having spent over 12 hours dousing flame...   
4  The impact of drought on women farmers remains...   

                                               title  
0  Maoists using bauxite mining as pretext to kil...  
1              In Madhya Pradesh, even the dead talk  
2  Two killed in violence over cow slaughter in n...  
3  Army to take up Bellandur lake fire issue w

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=1,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False,
    strip_accents = 'unicode',
    stop_words = 'english'
)

vectorizer = vectorizer.fit(wri['text_coref'])
tfidf = vectorizer.transform(wri['text_coref'])
vocab = vectorizer.get_feature_names()
print(len(vocab))
print(vocab)


In [49]:
from corextopic import corextopic as ct

In [50]:
TOPICS = 7
NBR_OF_WORDS = 3

In [51]:
anchors = []
model = ct.Corex(n_hidden=TOPICS, seed=42)
model = model.fit(
    tfidf,
    words=vocab
)

## Keywords for Topic Anchors
['land', 'acre','hectares', 'acquisition', 'land acquisition', 'agricultural', 'acres', 'degradation','landslides','property','resettlement'],

['farmer', 'farming', 'agricultural', 'produce', 'crop', 'crops', 'agrarian', 'farms','farm','field','fields','soil','sugarcane','vegetables','farmers','agriculture','tractor','prices crops', 'debt','quota','food','fruits','livestock','cow','wheat','harvest','harvesting','horticulture','loan','loans','milk','paddy','rice','plant','plants','potatoes','potato'],

['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],

['forest','forests', 'forest department', 'reserve', 'forest officials','forestry'],

['animal','leopard','leopards', 'animals', 'wildlife', 'tiger', 'attacked', 'slaughter', 'lion','lions', 'threat', 'tigress', 'bear','birds','cat','cattle','crocodile','elephant','elephants','pangolin','pangolins','species'],

['drought', 'droughts','monsoon', 'rain','rains','rainfall','disaster'],

['water', 'irrigation', 'monsoon', 'rain', 'flood', 'floods', 'flooded', 'climate change','climate','dam','dams','drinking']



In [52]:
# Anchors designed to nudge the model towards measuring specific genres

anchors = [
    ['land','resettlement','degradation','plot'],
    ['farm','Farmers','crop','agriculture','crops','agrarian','farmer','farmers''cows','tractor','acre','fields','livestock','harvest','harvesting','potato','sugarcane','paddy','rice','milk'],
    ['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],
    ['forest','deforestation','trees'],
    ['animal','attacked','leopard','leopards','tiger','tigress','crocodile'],
    ['drought','rain','climate change'],
    ['water','dams','irrigation','flood','drinking']   
   
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=TOPICS, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=100 # Tell the model how much it should rely on the anchors
)

In [53]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=NBR_OF_WORDS)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: land, resettlement, degradation
Topic #2: crops, farm, agriculture
Topic #3: mining, coal, sand
Topic #4: forest, trees, deforestation
Topic #5: animal, attacked, tiger
Topic #6: drought, climate change, rain
Topic #7: water, drinking, dams


In [54]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(TOPICS)]
).astype(float)
topic_df.index = wri.index
wri = pd.concat([wri, topic_df], axis=1)

In [55]:
for i in range(TOPICS):
    column='topic_{}'.format(i+1)
    print(wri[column].value_counts(normalize=True))

0.0    0.65286
1.0    0.34714
Name: topic_1, dtype: float64
0.0    0.686391
1.0    0.313609
Name: topic_2, dtype: float64
0.0    0.767258
1.0    0.232742
Name: topic_3, dtype: float64
0.0    0.755424
1.0    0.244576
Name: topic_4, dtype: float64
0.0    0.779093
1.0    0.220907
Name: topic_5, dtype: float64
0.0    0.751479
1.0    0.248521
Name: topic_6, dtype: float64
0.0    0.613412
1.0    0.386588
Name: topic_7, dtype: float64


In [56]:
#Topic Flags
wri['topic']=wri['topic_1']+wri['topic_2']+wri['topic_3']+wri['topic_4']+wri['topic_5']+wri['topic_6']+wri['topic_7']
wri['topic'].value_counts(normalize=True)

2.0    0.335306
1.0    0.333333
3.0    0.147929
0.0    0.065089
4.0    0.061144
5.0    0.041420
6.0    0.015779
Name: topic, dtype: float64

In [59]:
MisTagged = wri[wri['topic']==0]
MisTagged.to_csv("MisTagged.csv")
print(MisTagged)
print(MisTagged.info())


        class                                               text  \
5    positive  After 20 years of providing Manitoba livestock...   
23   positive  The fire was restricted to one section of the ...   
71   positive  New Delhi, Oct 2 (IANS) Left parties on Tuesda...   
74   positive  Updated: Aug 29, 2019 21:05 IST\r\r\r\nIndia r...   
82   positive  This refers to ÃÂ¢ÃÂÃÂFarmers march in De...   
90   positive  Chinese troops intrude into Arunachal with roa...   
116  positive  farmersÃÂ¢ÃÂÃÂ protest\r\r\r\nNGO Nation\...   
129  positive  The 'Kisan Kranti Padyatra', which was started...   
141  positive  Accept the updated privacy & cookie policy\r\r...   
160  positive  Advertisement\r\r\n\r\r\nBellandur Lake Group,...   
238  positive  Nagpur, MAHARASHTRA ÃÂ¢ÃÂÃÂ Over one lakh...   
243  positive  A plea alleging that rainwater harvesting syst...   
257  positive  Prices for farmed vannamei shrimp should be fa...   
274  positive  Telangana IT and Industries Minis