In [11]:
import json
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer


# define the lemmatize function
def lmz(word):
    lemmatizer = WordNetLemmatizer()
    word = word.lower()
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma

# get bag-of-words from tokenized document
def get_BOW(text):
    BOW = dict()
    for word in text:
        word_lemma = lmz(word)
        BOW[word_lemma] = BOW.get(word_lemma,0) + 1
    return BOW

In [12]:
doc = json.load(open('documents.json'))

doc_dict = [None] * len(doc)
for doc_item in doc:
    para_dict = list()
    for para in doc_item['text']:
        para_dict.append(get_BOW(nltk.word_tokenize(para)))
    doc_dict[doc_item['docid']] = para_dict

In [24]:
print (doc[1]['text'][1])

Total investment in renewable energy (including small hydro-electric projects) was $244 billion in 2012, down 12% from 2011 mainly due to dramatically lower solar prices and weakened US and EU markets. As a share of total investment in power plants, wind and solar PV grew from 14% in 2000 to over 60% in 2012. The top countries for investment in recent years were China, Germany, Spain, the United States, Italy, and Brazil. Renewable energy companies include BrightSource Energy, First Solar, Gamesa, GE Energy, Goldwind, Sinovel, Trina Solar, Vestas and Yingli.


In [31]:
# get the bag of words representation for each articles. 
# Calculate the accuracy of retrival using the training data
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


train = json.load(open('training.json'))

vec_tfidf_tran = [None] * len(doc_dict)
tf_idf_list = [None] * len(doc_dict)
for doc_index in range(len(doc_dict)):
    vectorizer = DictVectorizer()
    transformer = TfidfTransformer()
    
    term_para_matrix = vectorizer.fit_transform(doc_dict[doc_index])
    tf_idf_matrix = transformer.fit_transform(term_para_matrix)
    tf_idf_list[doc_index] = tf_idf_matrix
    vec_tfidf_tran[doc_index] = (vectorizer, transformer)
    


In [147]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

flag = 0
length = 0
for train_item in train[:200]:
    question = train_item['question']
    vectorizer, transformer = vec_tfidf_tran[train_item['docid']]
    term_parp_matrix = vectorizer.transform(get_BOW(nltk.word_tokenize(question)))
    tf_idf_ma = transformer.transform(term_parp_matrix)
#     print tf_idf_ma.shape
    sim_list = list()
    for index in range(tf_idf_list[train_item['docid']].shape[0]):
        sim_list.append(cosine_similarity(tf_idf_ma, tf_idf_list[train_item['docid']][index,:])[0][0])
    if train_item['answer_paragraph'] in np.argsort(sim_list)[-2:]:
        flag += 1
    length += 1
print flag * 1.0 / float(length)

0.9


In [72]:
print  tf_idf_list[0].shape

(24, 772)


In [96]:
print vec_tfidf_tran[0][0].vocabulary_

{u'all': 54, u'since': 640, u'consider': 153, u'magnetic': 410, u'legal': 394, u'invariably': 365, u'go': 298, u'follow': 281, u'whose': 750, u'careful': 119, u'depend': 185, u'7014540000000000000\u2660540': 30, u'million': 434, u'electricity': 229, u'to': 701, u'charge': 130, u'indicate': 348, u'whatsoever': 743, u'\u03b3b': 765, u'include': 342, u'\u03bd': 769, u'sound': 649, u'outside': 501, u'very': 727, u'rise': 608, u'wave': 738, u'electron': 235, u'ultraviolet': 712, u'decide': 178, u'fall': 272, u'redefinition': 584, u'affect': 50, u'molar': 438, u'7023602200000000000\u26606.022\xd71023': 32, u'prize': 546, u'level': 396, u'list': 404, u'large': 389, u'small': 642, u'equation': 245, u'the': 681, u'rutherford': 613, u'force': 284, u'ten': 676, u'becquerel': 96, u'prediction': 540, u'approximation': 74, u'imply': 337, u'further': 294, u'seafront': 618, u'crystallography': 172, u'substitute': 666, u'inaccurate': 341, u'even': 250, u'appear': 71, u'j\u22c5s': 377, u'klitzing': 385,

In [128]:
collect_inverted_index = list()
# for index in range(len(doc_dict)):
for index in range(1):
    vectorizer = vec_tfidf_tran[index][0]
    word_feature_map = vectorizer.vocabulary_
    inverted_index = [list()] * len(word_feature_map)

    for key in word_feature_map:
        for index_para in range(len(doc_dict[index])):
            if key in doc_dict[index][index_para]:
                inverted_index[word_feature_map[key]].append((index_para, tf_idf_list[index][index_para, word_feature_map[key]]))
        print len(inverted_index[word_feature_map[key]])
    print len(inverted_index)
    collect_inverted_index.append(inverted_index)
        
print len(collect_inverted_index)

5
6
8
10
11
12
13
14
17
18
20
21
23
24
45
49
50
51
52
54
55
56
57
61
63
66
72
73
74
75
76
77
79
80
82
84
85
86
89
91
115
116
117
118
119
122
123
124
125
126
127
128
129
130
132
133
134
136
139
141
142
145
150
153
156
157
158
159
160
161
162
163
164
165
167
171
172
173
175
176
177
178
179
180
181
183
186
188
189
190
191
192
193
194
195
196
197
199
200
218
219
226
227
233
235
236
238
242
244
245
246
250
251
255
263
267
277
281
284
285
286
287
288
289
290
293
295
296
298
309
310
311
316
317
318
319
320
329
331
332
334
335
337
338
348
349
350
352
353
354
355
357
361
362
363
364
366
367
368
369
370
371
372
374
375
377
378
379
380
381
385
388
392
416
417
418
420
429
430
431
432
433
438
443
444
445
446
447
448
450
452
453
455
457
459
461
462
464
465
466
467
468
469
470
494
495
496
497
498
499
500
501
503
504
505
506
507
509
510
517
526
528
529
530
536
537
538
540
544
545
546
547
548
550
551
564
565
568
569
576
578
579
588
591
592
595
596
598
599
600
601
602
604
605
606
608
610
612
613
614
616

In [124]:
print ((collect_inverted_index[0][0][0]))

(3, 0.07207807270692493)


In [133]:
vectorizer = vec_tfidf_tran[index][0]
word_feature_map = vectorizer.vocabulary_
print len(word_feature_map)
m = 0
for k in word_feature_map:
    if word_feature_map[k] > m:
        m = word_feature_map[k]
print m

772
771


In [149]:
flag = 0
length = 0
for train_item in train:
    question = get_BOW(nltk.word_tokenize(train_item['question']))
    vectorizer, transformer = vec_tfidf_tran[train_item['docid']]
    score = [0] * len(doc_dict[train_item['docid']])
    word_feature_map = vectorizer.vocabulary_
    for key in question:
        for index in range(len(doc_dict[train_item['docid']])):
            if key in doc_dict[train_item['docid']][index]:
                score[index] += tf_idf_list[train_item['docid']][index, word_feature_map[key]]
    if train_item['answer_paragraph'] in np.argsort(score)[-5:]:
        flag += 1
    length += 1
print flag * 1.0 / float(length)

0.855137278407


In [150]:
print flag, length

37095 43379


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(term_doc_matrix)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
print tfidf_matrix[0]

  (0, 83952)	0.012605448144426018
  (0, 83712)	0.027277261024867317
  (0, 83249)	0.023613228447920416
  (0, 83248)	0.012605448144426018
  (0, 83155)	0.025210896288852036
  (0, 83154)	0.025210896288852036
  (0, 83142)	0.012605448144426018
  (0, 83141)	0.012605448144426018
  (0, 82770)	0.00551814272948263
  (0, 82254)	0.006515578727884812
  (0, 81830)	0.0038877619324642868
  (0, 81749)	0.0042750915667939884
  (0, 81394)	0.03773834759932068
  (0, 81178)	0.015948323064546965
  (0, 81126)	0.002288783704478098
  (0, 81093)	0.01424673978657076
  (0, 80920)	0.002701095037878986
  (0, 80913)	0.03152266982253713
  (0, 80733)	0.008068971292879314
  (0, 80677)	0.03541984267188063
  (0, 80624)	0.005362522378869883
  (0, 80617)	0.013343129883437704
  (0, 80599)	0.002186679673051478
  (0, 80547)	0.0038182194118387878
  (0, 80523)	0.00209910228339209
  :	:
  (0, 5214)	0.0037955735663730123
  (0, 5172)	0.003562270829739085
  (0, 4809)	0.011239832537475165
  (0, 4364)	0.007336929086160016
  (0, 4361)	0.