In [1]:
from collections import Counter
from collections import defaultdict
import networkx as nx

import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

In [2]:
# Load the question table from the CSV in the working directory; %time reports how long the read takes.
%time questions = pd.read_csv('./question_train_word.csv')

CPU times: user 17.1 s, sys: 3.86 s, total: 20.9 s
Wall time: 24.6 s


In [3]:
print(questions.columns)
%time questions_topics = questions.topics.apply(lambda s: s.split(','))
%time questions_titles = questions.title.astype('U').apply(lambda s: s.split(','))

Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'title', 'description', 'topics'], dtype='object')
CPU times: user 3.78 s, sys: 536 ms, total: 4.32 s
Wall time: 5.36 s
CPU times: user 8.51 s, sys: 1.84 s, total: 10.3 s
Wall time: 14 s


## Co-occurrence

- filter stopwords: the top 10 words and words occurring fewer than 20 times (20/1000)
- count:
    - occurrences of each word in titles
    - co-occurrences of each word and tag
- calculate the probability: P(tag|word)


In [4]:
# stop words
%time title_words = [w for ws in questions_titles for w in ws]
%time title_words_count = Counter(title_words)
title_words_count.most_common(10)

CPU times: user 1.53 s, sys: 1.22 s, total: 2.75 s
Wall time: 3.67 s
CPU times: user 9.35 s, sys: 1.47 s, total: 10.8 s
Wall time: 11.6 s


[('w111', 2976600),
 ('w6', 1921785),
 ('w11', 1474827),
 ('w109', 552983),
 ('w54', 547054),
 ('w25', 526597),
 ('w471', 433385),
 ('w4016', 419135),
 ('w1110', 389917),
 ('w10147', 351932)]

In [5]:
%time title_words_most_common = title_words_count.most_common()
stopwords = [w for w, i in title_words_most_common if i < 20 or i > 0.25*len(questions_titles)]
title_words_set = set(title_words)
active_words = title_words_set.difference(set(stopwords))
print(len(stopwords))
print(len(questions_titles))
print(len(title_words_set))

CPU times: user 1.02 s, sys: 39.3 ms, total: 1.06 s
Wall time: 1.1 s
264274
2999967
324960


In [7]:
# Build an undirected graph with an edge between each active title word and
# each tag of the same question; the edge attribute `weight` counts how many
# times that (word, tag) pair was seen.
W = nx.Graph()

length = len(questions_titles)

# Walk titles and topics in step (positionally) instead of doing one Series
# label lookup per row; `total` keeps the progress bar sized correctly.
for title_tokens, question_tags in tqdm_notebook(zip(questions_titles, questions_topics), total=length):
    for word in title_tokens:
        if word not in active_words:
            continue
        for tag in question_tags:
            if W.has_edge(word, tag):
                # W[word][tag] is the edge attribute dict. The former
                # `W.edge[word][tag]` accessor was removed in networkx 2.0,
                # while this subscript form works in both 1.x and 2.x.
                W[word][tag]['weight'] += 1
            else:
                W.add_edge(word, tag, weight=1)

2619043/|/ 87%|| 2619043/2999967 [14:14<02:04, 3063.90it/s]


In [15]:
# co_word_tag[word][tag] -> number of times `word` (an active title word)
# appeared in a question labelled with `tag`. Missing entries default to 0.
co_word_tag = defaultdict(lambda: defaultdict(int))
length = len(questions_titles)
# (use a smaller `length`, e.g. 10000, for a quick trial run)

for row in tqdm_notebook(range(length)):
    row_tags = questions_topics[row]
    for word in questions_titles[row]:
        if word in active_words:
            for tag in row_tags:
                co_word_tag[word][tag] += 1




In [16]:
# Spot check: how many distinct tags co-occur with the word stored at position 999?
len(co_word_tag[list(co_word_tag)[999]])

420

In [17]:
%time topic_set = set(t for ts in questions_topics for t in ts)
len(topic_set)

CPU times: user 1.25 s, sys: 949 ms, total: 2.2 s
Wall time: 2.37 s


1999

In [27]:
def predict_title(title):
    """Score every tag in ``topic_set`` for one tokenised title.

    For each tag the score is the product, over the title's active words, of
    the add-one smoothed ratio

        (co_word_tag[word][tag] + 1) / (title_words_count[word] + len(topic_set))

    Words that are not in ``active_words`` are ignored.

    Parameters
    ----------
    title : iterable of str
        The words of one title.

    Returns
    -------
    collections.Counter
        Maps each tag to its score, so ``.most_common(k)`` yields the k
        best-scoring tags. Empty when the title contains no active word.

    Notes
    -----
    * The count table is only read here. Lookups go through ``.get`` because
      subscripting the nested defaultdicts would insert a new entry for every
      (word, tag) pair scored and grow ``co_word_tag`` on every call.
    * The smoothing term is ``len(topic_set)`` rather than a literal, so it
      follows the actual number of tags.
    * Scores are raw products of small ratios; for long titles they can become
      extremely small floats.
    """
    words = [word for word in title if word in active_words]
    if not words:
        return Counter()

    n_tags = len(topic_set)
    # Per-word pieces do not depend on the tag, so compute them once:
    # (tag -> co-occurrence count mapping, smoothed denominator).
    per_word = [
        (co_word_tag.get(word, {}), title_words_count[word] + n_tags)
        for word in words
    ]

    result = {}
    for tag in topic_set:
        score = 1
        for tag_counts, denominator in per_word:
            score *= (tag_counts.get(tag, 0) + 1) / denominator
        result[tag] = score
    return Counter(result)

# Sanity check on the first question: its 20 best-scoring tags, followed by
# its true topics and its title words (one per line).
print(predict_title(questions_titles[0]).most_common(20))
print(questions_topics[0], questions_titles[0], sep='\n')
# Raw co-occurrence count for one specific (word, tag) pair.
co_word_tag['w8646']['3195914392210930723']

[('7476760589625268543', 3.3031765956311026e-27), ('4697014490911193675', 7.67723547807016e-29), ('-4653836020042332281', 2.0216006112779725e-29), ('-7046289575185911002', 5.013292381596791e-30), ('3738968195649774859', 1.7339574697751384e-30), ('-7129272008741138808', 4.79868050294047e-31), ('-4931965624608608932', 3.347435996405279e-31), ('-8175048003539471998', 2.1644555518032123e-31), ('-5932391056759866388', 1.1841251730638548e-31), ('2787171473654490487', 5.393021386054086e-32), ('-8377411942628634656', 1.675748846949536e-32), ('-3388534223137478077', 4.394668380326251e-33), ('2587540952280802350', 3.746131372331043e-33), ('-3517637179126242000', 2.4580596105544916e-33), ('-8132909213241034354', 1.7989463997057147e-33), ('-6748914495015758455', 1.425572397469604e-33), ('9069451131871918127', 1.3637803901444563e-33), ('8697050490955193511', 1.2403908100251155e-33), ('-8320373689946067773', 2.805999802946622e-34), ('-5872443091340192918', 1.7818921851050028e-34)]
['7739004195693774

169

In [25]:
# Predict the five best-scoring tags for each of the first `length` questions.
length = 1000
result = []
for row in tqdm_notebook(range(length)):
    ranked = predict_title(questions_titles[row]).most_common(5)
    result.append([tag for tag, _score in ranked])




In [26]:
# Pair each list of predicted tags in `result` with the true topics of the
# first 1000 questions and pass the pairs to the scorer.
# NOTE(review): `evaluate` comes from a module not shown here; the meaning of
# the tuple it returns is not visible in this notebook — confirm before citing it.
from evaluate import evaluate
evaluate(zip(result, questions_topics[:1000]))

(0.12217566903841935, 0.3976054258797025, 0.17637059073523162, 415, 2353)