In [1]:
from collections import Counter
from collections import defaultdict
import networkx as nx

In [2]:
import pandas as pd
import numpy as np

In [3]:
%time questions = pd.read_csv('./question_train_word.csv')

CPU times: user 16.2 s, sys: 2.61 s, total: 18.8 s
Wall time: 19.2 s


In [4]:
print(questions.columns)
%time questions_topics = questions.topics.apply(lambda s: s.split(','))
%time questions_titles = questions.title.astype('U').apply(lambda s: s.split(','))

Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'title', 'description', 'topics'], dtype='object')
CPU times: user 3.93 s, sys: 468 ms, total: 4.4 s
Wall time: 4.52 s
CPU times: user 8.46 s, sys: 2.11 s, total: 10.6 s
Wall time: 13.2 s


In [11]:
# from collections import defaultdict
# from tqdm import tqdm_notebook
# # from multiprocessing import 

# total, common, usefulness = [defaultdict(int) for _ in range(3)]

# for i in tqdm_notebook(range(len(questions_titles))):
#     for word in questions_titles[i]:
#         total[word] += 1
#         if word in questions_topics[i]:
#             common[word] += 1

# for word in total:
#     usefulness[word] = common[word] / total[word]





In [5]:
colnams = ['id', 'p_ids', 'c', 'w', 'd_c', 'd_w']
topics = pd.read_csv('./topic_info.txt', sep='\t', names=colnams)

In [6]:
topics_list = [t.split(',') for t in topics.w.astype('U')]
topics['w_len'] = topics.w.astype('U').apply(lambda x: len(x.split(',')))

#### How to deal with topics with multiple words?

In [7]:
topics.w_len.value_counts()

1     1704
2      116
3       79
4       45
5       27
7       10
6        9
8        4
10       2
9        2
11       1
Name: w_len, dtype: int64

In [8]:
print(questions_topics[0])
topics.set_index(['id'], inplace=True)

['7739004195693774975', '3738968195649774859']


In [9]:
topic_id_map = {str(i): topics.loc[i].w for i in topics.index}
topic_id_map['7739004195693774975']

'w103'

In [10]:
%time questions_topics_w = questions.topics.apply(lambda s: list(map(lambda i: topic_id_map[i], s.split(','))))

CPU times: user 8.17 s, sys: 2.73 s, total: 10.9 s
Wall time: 11.6 s


In [11]:
questions_topics_w[0]

['w103', 'w24']

#### using redis and multiprocessing

but the speed is disappointing.

In [31]:
from collections import defaultdict
from tqdm import tqdm_notebook
from redis import Redis
from multiprocessing import Pool

r = Redis(decode_responses=True)
TOTAL = 'total_{}'
COMMON = 'common_{}'

# clear old data
total_keys = r.keys(TOTAL.format('*'))
if total_keys:
    print('delete total: ' + str(r.delete(*total_keys)))
common_keys = r.keys(COMMON.format('*'))
if common_keys:
    print('delete common: ' + str(r.delete(*common_keys)))

def incr(i):
    for word in questions_titles[i]:
        r.incr(TOTAL.format(word))
        if word in questions_topics_w[i]:
            r.incr(COMMON.format(word))

pool = Pool(processes=1)

length = 10000
for _ in tqdm_notebook(pool.imap_unordered(incr, range(length)), total=length):
    pass


delete total: 21394
delete common: 1095



In [33]:
from collections import defaultdict
from tqdm import tqdm_notebook

total, common, usefulness = [defaultdict(int) for _ in range(3)]

length = len(questions_titles)
for i in tqdm_notebook(range(length)):
    for word in questions_titles[i]:
        total[word] += 1
        if word in questions_topics_w[i]:
            common[word] += 1

for word in common:
    usefulness[word] = common[word] / total[word]




In [35]:
import json
with open('usefulness.txt', 'w') as f:
    f.write(json.dumps(usefulness))

### Some titles with basic recommender

In [40]:
def basic_recommender(title, usefulness=usefulness):
    return Counter({word: usefulness.get(word, 0) 
                    for word in title})
for i in range(10):
    print(basic_recommender(questions_titles[i]).most_common(3))

[('w305', 0), ('w13549', 0), ('w22752', 0)]
[('w377', 0.142865451725475), ('w54', 0), ('w285', 0)]
[('w15450', 0.5002562788313686), ('w875', 0), ('w42394', 0)]
[('w138', 0.5672250859106529), ('w8646', 0), ('w2744', 0)]
[('w380', 0.8488372093023255), ('w54', 0), ('w674', 0)]
[('w133', 0), ('w54', 0), ('w134', 0)]
[('w27', 0.4324342105263158), ('w75', 0.15127803860198227), ('w686', 0)]
[('w2486', 0.4248441468483029), ('w2298', 0), ('w109', 0)]
[('w3297', 0.22171405846092662), ('w1652', 0.021534625410857985), ('w1448', 0)]
[('w2218', 0.9133333333333333), ('w54', 0), ('w1038', 0)]


In [46]:
from itertools import combinations

G = nx.Graph()

length = len(questions_titles)
for i in tqdm_notebook(range(length)):
    for tag in questions_topics_w[i]:
        if not tag:
            continue
        if not G.has_node(tag):
            G.add_node(tag)
            G.node[tag]['tag_count'] = 1
        else:
            G.node[tag]['tag_count'] += 1
    for edge in combinations(questions_topics_w[i], 2):
        ni, nj = edge
        if not G.has_edge(ni, nj):
            G.add_edge(ni, nj, weight=1)
        else:
            G.edge[ni][nj]['weight'] += 1




In [47]:
print(G.node['w3297'])
print(G.edge['w3297'])

{'tag_count': 2487}
{'w2781': {'weight': 109}, 'w1447,w54,w1448': {'weight': 108}, 'w3396': {'weight': 105}, 'w126': {'weight': 236}, 'w1784,w126': {'weight': 12}, 'w638': {'weight': 4}, 'w2706': {'weight': 9}, 'w6895': {'weight': 96}, 'w1056': {'weight': 11}, 'w3585': {'weight': 83}, 'w839': {'weight': 8}, 'w3686': {'weight': 33}, 'w11051': {'weight': 23}, 'w3692': {'weight': 12}, 'w2686': {'weight': 54}, 'w320': {'weight': 48}, 'w52092': {'weight': 16}, 'w12758': {'weight': 131}, 'w467': {'weight': 24}, 'w3385': {'weight': 91}, 'w205,w54,w206': {'weight': 4}, 'w88378': {'weight': 3}, 'w138': {'weight': 15}, 'w2486': {'weight': 11}, 'w4372': {'weight': 11}, 'w1404': {'weight': 192}, 'w993': {'weight': 16}, 'w5242': {'weight': 8}, 'w10437': {'weight': 2}, 'w138,w54,w41088': {'weight': 5}, 'w7724,w3034': {'weight': 10}, 'w125,w54,w674,w113,w54,w56,w57,w614': {'weight': 13}, 'w8273': {'weight': 65}, 'w33432': {'weight': 3}, 'w4748': {'weight': 19}, 'w4105,w54,w651': {'weight': 1}, 'w92':

In [50]:
%pdb

Automatic pdb calling has been turned ON


In [57]:
def combine_tags(c1, c2):
    for w in c1:
        if w in c2:
            c1[w] = 1 - (1 - c1[w]) * (1 - c2[w])
    return Counter(dict(c2, **c1))

def tag2tagrecommender(tags, n_recs):
    total_scores = Counter()
    tags -= Counter()

    for tag in tags:
        tag_scores = Counter({
            nj: tags[tag]*G.edge[tag][nj]['weight']/G.node[tag]['tag_count'] 
            for _, nj in G.edges(tag)
        })
        tag_scores = Counter(dict(tag_scores.most_common(n_recs)))
        total_scores = combine_tags(total_scores, tag_scores)
    return total_scores

_tags = basic_recommender(questions_titles[3])
tag2tagrecommender(_tags, 100).most_common(5)

[('w125,w54,w674,w113,w54,w56,w57,w614', 0.13513228738749977),
 ('w398', 0.10309716375471525),
 ('w397', 0.08104123538055608),
 ('w138,w54,w315', 0.06330750622669322),
 ('w138,w54,w521', 0.05599790460413326)]

In [58]:
print(questions_topics_w[3])
print(_tags)
print(questions_titles[3])

['w8652,w54,w674,w8647,w614', 'w138']
Counter({'w138': 0.5672250859106529})
['w8646', 'w2744', 'w1462', 'w9', 'w54', 'w138', 'w54', 'w50', 'w110', 'w140344', 'w111', 'w112', 'w49270', 'w2129', 'w6', 'w6978', 'w359', 'w10147', 'w111']


In [65]:
Y = nx.Graph()

length = len(questions_titles)
for i in tqdm_notebook(range(length)):
    for word in questions_titles[i]:
        if word and not Y.has_node(word):
            Y.add_node(word, is_word=1)
            Y.node[word]['count'] = 1
        elif Y.node[word].get('is_word') == 1:
            Y.node[word]['count'] += 1
        else:
            Y.node[word]['is_word'] = 1
            Y.node[word]['count'] = 1
        
        for tag in questions_topics_w[i]:
            if tag and not Y.has_node(tag):
                Y.add_node(tag, is_tag=1)
            else:
                Y.node[tag]['is_tag'] = 1
                
            if not Y.has_edge(word, tag):
                Y.add_edge(word, tag, weight=1)
            else:
                Y.edge[word][tag]['weight'] += 1




In [66]:
import pickle

with open('title2tag_graph.pkl', 'wb') as output:
    pickle.dump(Y, output, pickle.HIGHEST_PROTOCOL)

In [70]:
def title2tagrecommender(tags, n_recs=10):
    total_scores = Counter()
    
    for word in tags:
        if Y.has_node(word) and Y.node[word].get('count'):
            tag_scores = Counter({nj: Y.edge[word][nj]['weight'] / Y.node[word]['count'] 
                                  for _, nj in Y.edges(word) 
                                  if Y.node[nj].get('is_tag')==1})
            tag_scores = Counter(dict(tag_scores.most_common(n_recs)))
            total_scores = combine_tags(total_scores, tag_scores)
    return total_scores - Counter()

_tags = basic_recommender(questions_titles[3])
title2tagrecommender(_tags, 10).most_common()

[('w138', 0.737233026544827),
 ('w125,w54,w674,w113,w54,w56,w57,w614', 0.5502794648129494),
 ('w8652,w54,w674,w8647,w614', 0.5423443634324172),
 ('w398', 0.5283537954707014),
 ('w36949', 0.4175546831183399),
 ('w31564', 0.25),
 ('w687', 0.25),
 ('w6070,w54,w31013', 0.25),
 ('w1405', 0.25),
 ('w3058', 0.25),
 ('w5645', 0.25),
 ('w3054', 0.25),
 ('w397', 0.22305739707757755),
 ('w1099', 0.1674562354279927),
 ('w138,w54,w315', 0.13639533015975003),
 ('w138,w54,w521', 0.13036941580756015),
 ('w27', 0.12772251295059311),
 ('w236', 0.11997167138810198),
 ('w2490', 0.11464398613181492),
 ('w993', 0.11100928199152016),
 ('w138,w54,w41088', 0.10674398625429553),
 ('w1295', 0.10551996801647845),
 ('w1295,w54,w398', 0.10403813796971396),
 ('w3585', 0.0934312702128014),
 ('w139', 0.08097079037800688),
 ('w2686', 0.07926097855633607),
 ('w467', 0.07882302405498282),
 ('w628', 0.07859782137256865),
 ('w872', 0.07726528606647809),
 ('w1564', 0.06884884181856765),
 ('w2712', 0.06243972592896374),
 ('w

In [67]:
def normalize(rec, weight=1):
    try:
        max_score = rec[max(rec, key=rec.get)]
        for tag in rec:
            rec[tag] = weight * rec[tag] / max_score
    except:
        rec = Counter()
    return rec

def meta_recommender(recomendations, weights):
    total = Counter()
    for rec, weight in zip(recomendations, weights):
        rec = normalize(rec, weight)
        total = combine_tags(total, rec)
    
    return total

In [73]:
title = questions_titles[3]
basic = basic_recommender(title)
tag2tag = tag2tagrecommender(basic, 10)
title2tag = title2tagrecommender(basic, 10)

recommendations = meta_recommender([basic, tag2tag, title2tag],
                                   [0.6, 0.33, 0.33])
recommendations.most_common(5)

[('w138', 0.732),
 ('w125,w54,w674,w113,w54,w56,w57,w614', 0.40785800833017805),
 ('w398', 0.3946266678967565),
 ('w397', 0.2831972173907158),
 ('w138,w54,w315', 0.20414298590537883)]

In [74]:
length = 1000
rec_list = []
for i in tqdm_notebook(range(length)):
    basic = basic_recommender(questions_titles[i])
    tag2tag = tag2tagrecommender(basic, 10)
    title2tag = title2tagrecommender(basic, 10)
    recommendations = meta_recommender([basic, tag2tag, title2tag],
                                       [0.6, 0.33, 0.33])
    rec_list.append(recommendations.most_common(5))




In [75]:
topic_word_map = {word: _id for _id, word in topic_id_map.items()}
topic_word_map['w138']

'3195914392210930723'

In [78]:
result = [[topic_word_map.get(i[0]) for i in rec] for rec in rec_list]
result[3]

['3195914392210930723',
 '3804601920633030746',
 '6940355838132160535',
 '8313816860478517392',
 '6718676536613592056']

## title2tag is impressive, tag2tag is not.

In [79]:
from evaluate import evaluate
evaluate(zip(result, questions_topics[:1000]))

(0.24639648491861632, 0.8950552579272336, 0.3399915002124947, 800, 2353)

In [81]:
indexs = [i for i, v in enumerate(result) if v]
_result = [v for i, v in enumerate(result) if v]
_topics = [v for i, v in enumerate(questions_topics[:1000]) if i in indexs]
evaluate(zip(_result, _topics))

(0.3307230082689076, 1.2177622556833112, 0.4540295119182747, 800, 1762)

In [82]:
length = 1000
rec_list = []
for i in tqdm_notebook(range(length)):
    basic = basic_recommender(questions_titles[i])
#     tag2tag = tag2tagrecommender(basic, 10)
    title2tag = title2tagrecommender(basic, 10)
    recommendations = meta_recommender([basic, title2tag],
                                       [0.6, 0.33])
    rec_list.append(recommendations.most_common(5))

result = [[topic_word_map.get(i[0]) for i in rec] for rec in rec_list]

indexs = [i for i, v in enumerate(result) if v]
_result = [v for i, v in enumerate(result) if v]
_topics = [v for i, v in enumerate(questions_topics[:1000]) if i in indexs]
evaluate(zip(_result, _topics))




(0.4544941848862406, 1.5552067685406048, 0.6421589460263494, 1511, 2353)

In [83]:
length = 1000
rec_list = []
for i in tqdm_notebook(range(length)):
    basic = basic_recommender(questions_titles[i])
    tag2tag = tag2tagrecommender(basic, 10)
#     title2tag = title2tagrecommender(basic, 10)
    recommendations = meta_recommender([basic, tag2tag],
                                       [0.6, 0.33])
    rec_list.append(recommendations.most_common(5))

result = [[topic_word_map.get(i[0]) for i in rec] for rec in rec_list]

indexs = [i for i, v in enumerate(result) if v]
_result = [v for i, v in enumerate(result) if v]
_topics = [v for i, v in enumerate(questions_topics[:1000]) if i in indexs]
evaluate(zip(_result, _topics))




(0.33021556299515376, 1.2149660092378585, 0.4534619750283768, 799, 1762)