In [1]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [32]:
# load data
with open('test-unlabelled.json','r') as f:
    text = f.read()
    text = json.loads(text)
f.close()

test_data = []
for i,t in text.items():
    test_data.append(t['text'])
    
with open('train.json','r') as f:
    text1 = f.read()
    text1 = json.loads(text1)
f.close()

train_data = []
for i,t in text1.items():
    train_data.append(t['text'])

In [49]:
# preprocessing function
def preprocessing(text):
    text = text.replace('{html}',"")
    rem_url = re.sub(r'http\S+', '', text)
    tokens = nltk.word_tokenize(rem_url)
    tokens = [w.lower() for w in tokens]
    rem_stop_words  = [w for w in tokens if len(w)>2 if not w in stopwords.words('english')]
    pos_tag = nltk.pos_tag(rem_stop_words)
    remain_NN = [w for w,pos in pos_tag if pos.startswith('NN')]
    lemma_words=[lemmatizer.lemmatize(w) for w in remain_NN]
    
    return " ".join(lemma_words)

def print_top_words(model, feature_names, n_top_words):
    # print the term with higher weight in the topic
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print(model.components_)

In [51]:
test_set = [preprocessing(text) for text in test_data]
train_set = [preprocessing(text) for text in train_data]

In [52]:
# transfer to BOW format
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=2,stop_words='english')
tf = tf_vectorizer.fit_transform(test_set)
vectorizer_test = CountVectorizer(max_df=0.9, min_df=2,stop_words='english',vocabulary=tf_vectorizer.vocabulary_)
tf_positive = vectorizer_test.fit_transform(train_set) 

# model fitting
n_topic = 10
lda = LatentDirichletAllocation(n_components=n_topic, max_iter=1000,learning_method='batch')
lda.fit(tf)        

n_top_words=20
tf_feature_names = tf_vectorizer.get_feature_names()
# visual keywords in 10 topics
print_top_words(lda, tf_feature_names, n_top_words)

1410
Topic #0:
climate change emission year government energy world carbon country gas people policy fuel action power coal plan report target minister
Topic #1:
year life time child day school music home baby story family son student way car thing week parent church mother
Topic #2:
president state government country war force people report group leader year nation official security law obama time party right isi
Topic #3:
climate change temperature year scientist model data science level sea ice time study co2 water world record warming earth weather
Topic #4:
hotel time home year house work space room city beach people property art view world image artist day building area
Topic #5:
people police city man attack group street resident protester officer station area video official news protest security government men day
Topic #6:
people year time child health company day film woman family service month thing user way case food week cancer state
Topic #7:
plane flight passenger aircra

In [53]:
lda.perplexity(tf)

2114.732864240092

In [66]:
import numpy as np
article_topic = [0] * 10
# perdiction
for doc in tf_positive:
    doc_scores = lda.transform(doc)
    topic = np.argmax(doc_scores)
    article_topic[topic] += 1
print(article_topic)# 0,2,3,8

[575, 6, 49, 498, 6, 0, 8, 1, 20, 5]


In [82]:
with open('dev.json','r') as f:
    text = f.read()
    text = json.loads(text)
f.close()

dev_data = []
dev_label = []
for i,t in text.items():
    dev_data.append(t['text'])
    dev_label.append(t['label'])
    
dev_set = [preprocessing(text) for text in dev_data]
vectorizer_dev = CountVectorizer(max_df=0.9, min_df=2,stop_words='english',vocabulary=tf_vectorizer.vocabulary_)
tf_dev = vectorizer_test.fit_transform(dev_set) 

In [94]:
article_topic = [0] * 10
unrelated_news = []
for i,doc in enumerate(tf):
    doc_scores = lda.transform(doc)
    topic = np.argmax(doc_scores)
    article_topic[topic] += 1
    if topic in (1,4,5,6,7,9):
        unrelated_news.append(i)
print(article_topic)# 0,2,3,8

[235, 85, 223, 107, 68, 80, 183, 72, 201, 156]


In [95]:
print(unrelated_news)

[3, 5, 6, 8, 13, 17, 20, 22, 23, 24, 28, 30, 32, 35, 36, 39, 45, 46, 47, 49, 50, 53, 54, 55, 60, 62, 64, 65, 66, 68, 69, 73, 74, 75, 78, 79, 80, 81, 82, 88, 89, 91, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 106, 111, 114, 119, 120, 121, 122, 123, 131, 137, 138, 140, 141, 144, 145, 147, 149, 151, 152, 154, 159, 161, 163, 165, 167, 168, 170, 172, 175, 176, 178, 184, 187, 191, 193, 194, 196, 199, 200, 202, 204, 206, 209, 210, 211, 213, 216, 218, 220, 221, 225, 227, 228, 229, 230, 232, 235, 238, 239, 240, 241, 243, 245, 246, 250, 251, 252, 254, 258, 259, 262, 263, 264, 268, 271, 272, 276, 278, 279, 280, 284, 286, 288, 292, 297, 298, 299, 300, 301, 305, 310, 312, 313, 314, 315, 316, 317, 321, 322, 323, 325, 326, 327, 328, 331, 332, 333, 334, 335, 336, 338, 339, 343, 346, 349, 350, 351, 353, 354, 355, 360, 362, 365, 367, 369, 371, 373, 374, 375, 376, 378, 382, 384, 385, 393, 394, 395, 396, 398, 405, 412, 413, 416, 418, 420, 422, 423, 424, 425, 428, 429, 430, 431, 433, 435, 437, 438, 439, 4