In [1]:
import gensim
import logging
import os
import pandas as pd
import numpy as np
from scipy import spatial
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

class MyDocument(object):
    """Iterable over the files of a directory, yielding one token list per file.

    Each file is read in full, decoded as UTF-8 (undecodable bytes become
    U+FFFD), lower-cased and split on whitespace.  Files are visited in
    ``os.listdir`` order, which is the same listing that
    ``get_document_by_index`` enumerates.
    """

    def __init__(self, dirname):
        # dirname: path of the directory whose files are the documents.
        self.dirname = dirname

    def __iter__(self):
        # Local import, in the same style as the function-level ``import re``
        # used elsewhere in this notebook.
        import io
        for fname in os.listdir(self.dirname):
            # io.open decodes while reading on both Python 2 and Python 3, so
            # the Python-2-only ``unicode(...)`` builtin is no longer needed.
            # newline='' disables newline translation (irrelevant after
            # split(), but keeps the raw text identical to a byte read).
            with io.open(os.path.join(self.dirname, fname), encoding='utf-8',
                         errors='replace', newline='') as content_file:
                yield content_file.read().lower().split()

class MySentences(object):
    """Iterable over every line of every file in a directory.

    Each line is decoded as UTF-8 (undecodable bytes become U+FFFD),
    lower-cased and split on whitespace; one token list is yielded per line,
    including an empty list for a blank line.
    """

    def __init__(self, dirname):
        # dirname: path of the directory whose files are read line by line.
        self.dirname = dirname

    def __iter__(self):
        # Local import, in the same style as the function-level ``import re``
        # used elsewhere in this notebook.
        import io
        for fname in os.listdir(self.dirname):
            # The ``with`` block closes each file; previously the handle was
            # opened inline and never closed explicitly.
            # newline='\n' splits lines on '\n' only, like a byte-mode read.
            with io.open(os.path.join(self.dirname, fname), encoding='utf-8',
                         errors='replace', newline='\n') as sentence_file:
                for line in sentence_file:
                    yield line.lower().split()

def get_dictionary(path, stopwords=None):
    """Build a gensim Dictionary from the files in ``path``, minus stop words.

    path: directory whose files are tokenized line by line (via MySentences).
    stopwords: iterable of tokens to drop; defaults to the module-level
        ``stoplist``.

    Returns the Dictionary with the stop-word ids removed and the remaining
    ids compacted.
    """
    if stopwords is None:
        stopwords = stoplist
    # Use the directory that was passed in; it used to be ignored in favour of
    # a hard-coded path.
    dictionary = corpora.Dictionary(MySentences(path))
    stop_ids = [dictionary.token2id[word] for word in stopwords
                if word in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words only
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    return dictionary

def get_document_tuple( path, dictionary ):
    """Return one bag-of-words per file found in ``path``.

    path: directory of documents, iterated with MyDocument.
    dictionary: object whose ``doc2bow(tokens)`` converts a token list into
        its bag-of-words form.

    Returns a list with one ``doc2bow`` result per file, in MyDocument
    iteration order.
    """
    # A plain list is what the function returns anyway, so there is no need
    # to grow a pandas Series element by element first.
    return [dictionary.doc2bow(tokens) for tokens in MyDocument(path)]

def transform_tuple_into_vector(document_tuple,dictionary):
    """Expand a sparse bag-of-words into a dense, L1-normalized vector.

    document_tuple: sequence of (token_id, count) pairs.
    dictionary: object with a ``token2id`` mapping; its length is the length
        of the returned vector.

    Returns a pd.Series of floats whose absolute values sum to 1, or an
    all-zero Series when ``document_tuple`` is empty.
    """
    values = np.zeros(len(dictionary.token2id))
    # Any empty sequence (not only the literal []) must take the all-zero
    # path, otherwise the normalization below would divide 0 by 0.
    if len(document_tuple) > 0:
        token_ids, counts = zip(*document_tuple)
        values[list(token_ids)] = counts
        total = np.abs(values).sum()  # L1 norm
        if total > 0:
            values = values / total
    return pd.Series(values)

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Expand a list of sparse bags-of-words into one dense DataFrame.

    document_tuples: sequence of bags-of-words, each a sequence of
        (token_id, count) pairs.
    dictionary: object with a ``token2id`` mapping; its length is the number
        of rows.

    Returns a DataFrame with one row per token id and one column per
    document (columns labelled 0..n-1); each column is the document's
    normalized vector from transform_tuple_into_vector.
    """
    if len(document_tuples) == 0:
        # Kept for backward compatibility: no documents yields a single
        # all-zero column labelled 0.
        return pd.DataFrame(pd.Series(np.zeros(len(dictionary.token2id))))
    # Convert every document first, then assemble the frame in one step
    # instead of inserting columns one at a time.
    columns = [transform_tuple_into_vector(bow, dictionary) for bow in document_tuples]
    df_vector = pd.concat(columns, axis=1)
    df_vector.columns = range(len(columns))
    return df_vector

def get_close_documents(string, dataframe, dictionary, topn):
    """Find the documents whose vectors are closest to a query string.

    string: query text, either UTF-8 encoded bytes or already-decoded text.
    dataframe: one column per document (labelled 0..n-1), one row per token
        id, as built by transform_tuples_into_dataframe.
    dictionary: provides ``doc2bow`` and ``token2id`` for the query.
    topn: number of closest documents to return.

    Returns a pd.Series of the ``topn`` smallest cosine distances, indexed by
    document column number and sorted ascending.
    """
    # Decode only when given bytes: decoding text that is already unicode
    # raises TypeError on Python 2, and ``unicode`` does not exist on Python 3.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    query_vector = transform_tuple_into_vector(
        dictionary.doc2bow(string.lower().split()), dictionary)
    distances = pd.Series(
        [spatial.distance.cosine(query_vector, dataframe[j])
         for j in range(len(dataframe.columns))],
        dtype=float)
    return distances.sort_values().head(topn)

def get_document_by_index(path,index):
    """Return the file name and decoded content of the ``index``-th file.

    path: directory to list with ``os.listdir``.
    index: zero-based position in that listing.

    Returns ``(fname, content)`` where content is the file decoded as UTF-8
    (undecodable bytes become U+FFFD), or None when no file has that
    position (including negative indexes).
    """
    # Local import, in the same style as the function-level ``import re``
    # used elsewhere in this notebook.
    import io
    for position, fname in enumerate(os.listdir(path)):
        if position == index:
            # io.open decodes on both Python 2 and 3; newline='' returns the
            # text with its original line endings.
            with io.open(os.path.join(path, fname), encoding='utf-8',
                         errors='replace', newline='') as content_file:
                return fname, content_file.read()
    return None

def print_names_of_close_documents(path, close_documents_list, outputpath):
    """Print the names of the close documents and save their text to a file.

    path: directory holding the documents.
    close_documents_list: Series whose index holds document positions, as
        returned by get_close_documents.
    outputpath: directory in which ``close_documents.txt`` is written; each
        document's text is followed by four newlines.
    """
    # Local import, in the same style as the function-level ``import re``
    # used elsewhere in this notebook.
    import io
    # The ``with`` block guarantees the file is flushed and closed, and the
    # explicit encoding lets non-ASCII text be written without relying on
    # the interpreter's default codec.
    with io.open(outputpath + '/close_documents.txt', 'w', encoding='utf-8') as out_file:
        for i in close_documents_list.index:
            name, text = get_document_by_index(path,i)
            print(name)
            out_file.write(text + u'\n\n\n\n')
    print('text saved as txt')

def get_contents_of_close_documents_by_paragraph(path, close_documents_list):
    """Concatenate the close documents and split the result at newlines.

    path: directory holding the documents.
    close_documents_list: Series whose index holds document positions.

    Returns a list of the newline-separated pieces; every document is
    prefixed with a newline before joining, so the first piece is empty.
    """
    pieces = []
    for doc_index in close_documents_list.index:
        _, text = get_document_by_index(path, doc_index)
        pieces.append('\n' + text)
    return ''.join(pieces).split('\n')


def get_distance_between_two_documents(A, B, dictionary):
    """Return the Euclidean distance between the vectors of two texts.

    A, B: raw text strings; an empty/falsy value is replaced by the single
        token 'the'.
    dictionary: provides ``doc2bow`` and ``token2id``.

    Each text is stripped of everything except ASCII letters and spaces,
    lower-cased, split on whitespace, converted to a bag-of-words and
    expanded with transform_tuple_into_vector.
    """
    import re

    def as_vector(text):
        # Fall back to ['the'] just in case the text is empty.
        if text:
            tokens = re.sub(r'[^a-zA-Z ]',r'',text).lower().split()
        else:
            tokens = ['the']
        return transform_tuple_into_vector(dictionary.doc2bow(tokens), dictionary)

    vector_a = as_vector(A)
    vector_b = as_vector(B)
    # Cosine distance would be preferable, but the vectors are so sparse that
    # most of the time its output is 0, so Euclidean distance is used instead.
    return np.linalg.norm(vector_a - vector_b)


# English stop words. get_dictionary looks these tokens up in the dictionary
# and removes the ones it finds.
stoplist = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
             u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 
             u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 
             u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 
             u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 
             u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', 
             u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', 
             u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', 
             u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', 
             u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', 
             u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', 
             u'should', u'now']

In [2]:
# Directory handed to get_dictionary to build the vocabulary.
dictionary_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # question text from training data
dictionary = get_dictionary(dictionary_path)
# Dump the complete token -> id mapping of the dictionary.
print (dictionary.token2id)

{u'xylem': 0, u'limited': 1, u'similarity': 2, u'phenotypical': 3, u'alleles': 4, u'magnetic': 5, u'saves': 6, u'desirable': 7, u'crumpled': 145, u'yellow': 9, u'chaos': 4364, u'four': 11, u'tensile': 12, u'heliocentric': 13, u'controversial': 14, u'consists': 15, u'oldest': 17, u'worked': 18, u'aggression': 19, u'poorly': 20, u'relationships': 21, u'whose': 22, u'fronts': 23, u'pedro': 24, u'calculate': 26, u'plumes': 3446, u'electricity': 27, u'powdery': 28, u'metamorphic': 29, u'seriously': 1709, u'supported': 4579, u'list': 2607, u'strawberries': 5157, u'investigation': 1710, u'swap': 33, u'caused': 2543, u'recycle': 35, u'herbicide': 36, u'shepherd': 4173, u'quartzite': 37, u'divergent': 38, u'humerus': 39, u'hormone': 40, u'risk': 41, u'downstream': 42, u'geology': 43, u'barium': 44, u'void': 45, u'oceans': 46, u'connects': 47, u'pigment': 49, u'replication': 50, u'every': 51, u'jack': 52, u'peripheral': 53, u'formula': 1713, u'overcomes': 55, u'vastly': 334, u'incubated': 3050, 

In [3]:
# Directory of the Wikipedia documents.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
# One bag-of-words per file in wiki_path, encoded with `dictionary`.
wiki_tuple = get_document_tuple( wiki_path, dictionary )
wiki_tuple[3]

# wiki_tuple holds each document in condensed (token_id, count) form; it has
# to be expanded into a full-length vector before distances can be computed.

[(11, 2),
 (114, 1),
 (316, 1),
 (371, 1),
 (691, 2),
 (713, 1),
 (861, 1),
 (864, 1),
 (1230, 1),
 (1255, 1),
 (1290, 1),
 (1395, 1),
 (1556, 1),
 (1699, 1),
 (1700, 1),
 (1796, 1),
 (1835, 1),
 (1875, 1),
 (1885, 1),
 (1912, 1),
 (2100, 1),
 (2215, 1),
 (2347, 2),
 (2374, 1),
 (2384, 1),
 (2409, 1),
 (2521, 1),
 (2541, 1),
 (2642, 1),
 (2716, 1),
 (2734, 1),
 (2755, 1),
 (2824, 8),
 (2875, 1),
 (2939, 1),
 (2943, 1),
 (2953, 1),
 (2979, 1),
 (3072, 1),
 (3090, 1),
 (3149, 1),
 (3202, 1),
 (3225, 1),
 (3316, 1),
 (3369, 1),
 (3457, 2),
 (3464, 2),
 (3483, 1),
 (3569, 1),
 (3663, 1),
 (3714, 2),
 (3809, 1),
 (3843, 1),
 (3865, 1),
 (4110, 1),
 (4174, 1),
 (4280, 1),
 (4284, 1),
 (4360, 1),
 (4386, 1),
 (4506, 1),
 (4592, 1),
 (4636, 1),
 (4667, 1),
 (4704, 1),
 (4790, 4),
 (4844, 4),
 (5067, 1),
 (5127, 1),
 (5136, 1),
 (5168, 1)]

In [4]:
# Number of tokens in the dictionary, i.e. the length of every document vector.
print 'dictionary dimension is', len(dictionary.token2id)

dictionary dimension is 5287


In [5]:
# Expand the condensed bags-of-words into a dense frame:
# one row per token id, one column per wiki document.
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary)         


In [6]:
df_wiki_vector.head(15)
# Each column represents one Wikipedia page; each row is one token id.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2131,2132,2133,2134,2135,2136,2137,2138,2139,2140
0,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
1,0,0.0,0,0.0,0.0,0.000536,0,0,0,0,...,0.0,0.000611,0.001134,0.0,0,0,0.002725,0,0.000805,0.0
2,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.000567,0.0,0,0,0.0,0,0.0,0.0
3,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
4,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
5,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
6,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
7,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.000263,0.0,0.0,0.00062,0,0,0.0,0,0.0,0.0
8,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.000403,0.0
9,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.005105,0.0,0,0,0.0,0,0.0,0.0


In [7]:
# NOTE(review): pandas and os are already imported in the first cell, so
# these two imports are redundant.
import pandas as pd
import os
path = '/Users/MK/GitHub/the_answer_is/data'
# Change the working directory so the training file can be opened by its
# bare file name on the next line.
os.chdir(path)
# Tab-separated training set.
train = pd.read_table('training_set.tsv',sep = '\t')
train.head(20)

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100001,"When athletes begin to exercise, their heart r...",C,at the tissue level,at the organ level,at the system level,at the cellular level
1,100002,Which example describes a learned behavior in ...,C,smelling the air for odors,barking when disturbed,sitting on command,digging in soil
2,100003,"When two nuclei are combined into one nucleus,...",D,conversion,reaction,fission,fusion
3,100004,Which is a distinction between an epidemic and...,B,the symptoms of the disease,the geographical area affected,the species of organisms infected,the season in which the disease spreads
4,100005,In which way is the orbit of a comet different...,B,The orbit of Earth is less circular than the o...,The orbit of a comet is more elliptical than t...,The orbital period of Earth is much longer tha...,The orbital period of a comet is more predicta...
5,100006,A teacher builds a model of a hydrogen atom. A...,B,number of particles,relative mass of particles,types of particles present,charges of particles present
6,100007,Which substance should a student apply to the ...,A,water,vinegar,salt,formaldehyde
7,100008,What is the main source of energy for the wate...,A,the Sun,fossil fuels,clouds,the ocean
8,100009,Which has the greatest effect on aiding the mo...,D,tension,friction,density,gravity
9,100010,"Over time, non-volcanic mountains can form due...",C,oceanic plates colliding with oceanic plates,oceanic plates separating from oceanic plates,continental plates colliding with continental ...,continental plates separating from continental...


In [8]:
# Text of the question in row 3; `.loc` replaces the deprecated `.ix`
# indexer and names the column instead of relying on its position.
q = train.loc[3, 'question']
print(q)

Which is a distinction between an epidemic and a pandemic?


In [9]:
# The five wiki documents with the smallest cosine distance to the question;
# the Series index is the document's column number in df_wiki_vector.
close_documents = get_close_documents(q, df_wiki_vector, dictionary,5)
print close_documents

1037    0.948472
1399    0.957690
2083    0.970740
1570    0.970946
1572    0.970946
dtype: float64


In [10]:
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
outputpath = '/Users/MK/GitHub/the_answer_is/data'
# Print the file names of the close documents and save their text to
# <outputpath>/close_documents.txt.
print_names_of_close_documents(wiki_path, close_documents,outputpath)
# Newline-separated pieces of the same documents, as one flat list.
merged = get_contents_of_close_documents_by_paragraph(wiki_path, close_documents)

# NOTE(review): `sentences` is not defined in any cell shown in this notebook,
# so this line raises NameError on a fresh kernel; confirm whether `model` is
# still needed and, if so, where `sentences` should come from.
model = gensim.models.Word2Vec(sentences,size=100, window=5, min_count=5, workers=4)

invertebrates.txt_to_unicode_remove_stopwords.txt
organic_compounds_in_life_science.txt_to_unicode_remove_stopwords.txt
viruses_and_human_disease.txt_to_unicode_remove_stopwords.txt
prokaryote_classification.txt_to_unicode_remove_stopwords.txt
prokaryote_habitats.txt_to_unicode_remove_stopwords.txt
text saved as txt


In [11]:
# Show the list of text pieces taken from the close documents.
print merged

[u'', u'invertebrates animals neither possess develop vertebral columnspinal cord derived notochord includes animals apart subphylum vertebrata familiar examples invertebrates include insects crabs lobsters kin snails clams octopuses kin starfish seaurchins kin worms', u'majority animal species invertebrates one estimate puts figure many invertebrate taxa greater number variety species entire subphylum vertebrata', u'socalled invertebrates chaetognatha hemichordata tunicata cephalochordata closely related vertebrates invertebrates makes term invertebrate almost meaningless taxonomic purposes', u'etymology', u'word invertebrate comes form latin word vertebra means joint general sometimes specifically joint spinal column vertebrate turn jointed aspect vertebra derived concept turning expressed root verto vorto turn coupled prefix meaning without', u'taxonomic significance', u'term invertebrates always precise among nonbiologists since accurately describe taxon way arthropoda vertebrata m

In [26]:
def _min_distance_per_choice(row, paragraphs, dictionary):
    """Return the smallest positive distance to any paragraph for each choice.

    row: one row of the training frame; position 1 holds the question and
        positions 3-6 hold answer choices A-D.
    paragraphs: list of text pieces to compare against.
    dictionary: passed through to get_distance_between_two_documents.

    Returns a list of four floats in A, B, C, D order.  An entry is NaN when
    no paragraph gives a distance greater than zero.
    """
    question = row.iloc[1]
    best = []
    for column in (3, 4, 5, 6):  # the columns where the answer options lie
        candidate = question + ' ' + row.iloc[column]  # question + one answer choice
        distances = [get_distance_between_two_documents(candidate, paragraph, dictionary)
                     for paragraph in paragraphs]
        # Keep strictly positive distances only: this disregards NaN values
        # as well as exact zeros.
        positive = [d for d in distances if d > 0]
        best.append(min(positive) if positive else float('nan'))
    return best


def get_my_answer(train, dictionary, df_vector, path):
    """Pick, for every question, the answer choice closest to a wiki paragraph.

    train: DataFrame whose column position 1 is the question and positions
        3-6 are answer choices A-D.
    dictionary: provides ``doc2bow`` and ``token2id``.
    df_vector: document vectors, one column per document.
    path: directory of the documents that df_vector was built from.

    Prints ``<row number> <letter>`` per question and returns a pd.Series of
    answer letters indexed 0..n-1.  An entry is None when no choice has a
    positive distance to any paragraph.
    """
    answers = []
    for i in range(len(train)):  # loop through all questions
        row = train.iloc[i]
        close_documents = get_close_documents(row.iloc[1], df_vector, dictionary, 5)
        # Read the documents from the ``path`` argument rather than from a
        # notebook-level global.
        paragraphs = get_contents_of_close_documents_by_paragraph(path, close_documents)
        distances = _min_distance_per_choice(row, paragraphs, dictionary)
        # Pair each distance with its letter; min() then returns the smallest
        # distance and, on ties, the earliest letter.
        scored = [(d, letter) for d, letter in zip(distances, 'ABCD') if not np.isnan(d)]
        answer = min(scored)[1] if scored else None
        answers.append(answer)
        print('%d %s' % (i, answer))
    return pd.Series(answers, dtype=object)


def get_my_answer_all_distance(train, dictionary, df_vector, path):
    """Compute, for every question, the minimum distance of each answer choice.

    Parameters are the same as for get_my_answer.

    Prints ``<row number> <A> <B> <C> <D>`` per question and returns a
    DataFrame with columns 'A', 'B', 'C', 'D' and one row per question
    (indexed 0..n-1); it has no rows when ``train`` is empty.
    """
    rows = []
    for i in range(len(train)):  # loop through all questions
        row = train.iloc[i]
        close_documents = get_close_documents(row.iloc[1], df_vector, dictionary, 5)
        # Read the documents from the ``path`` argument rather than from a
        # notebook-level global.
        paragraphs = get_contents_of_close_documents_by_paragraph(path, close_documents)
        distances = _min_distance_per_choice(row, paragraphs, dictionary)
        rows.append(distances)
        print(' '.join(str(value) for value in [i] + distances))
    # Build the frame once from the collected rows instead of enlarging it
    # cell by cell.
    return pd.DataFrame(rows, columns=['A', 'B', 'C', 'D'], dtype=float)

In [69]:
# Uncomment the lines below to compute only the predicted answer letters and the accuracy on the training set.
#wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
#myans = get_my_answer(train, dictionary, df_wiki_vector, wiki_path)
#train['fetch_doc_ws_train_answer'] = myans
#train['fetch_doc_ws_train_correct'] = (train['correctAnswer'] == train['fetch_doc_ws_train_answer'])
#print 'percent correct is ' , train['fetch_doc_ws_train_correct'].sum(axis =0) / (len(train) + 0.0)
#train.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train.csv', encoding='utf-8')

0 A
1 A
2 A
3 C
4 C
5 B
6 A
7 B
8 B
9 D
10 C
11 A
12 A
13 A
14 A
15 A
16 C
17 B
18 A
19 B
20 C
21 B
22 C
23 B
24 B
25 D
26 B
27 A
28 B
29 A
30 D
31 C
32 D
33 A
34 C
35 A
36 C
37 B
38 A
39 C
40 B
41 B
42 C
43 B
44 A
45 A
46 A
47 B
48 A
49 A
50 D
51 A
52 A
53 B
54 D
55 D
56 C
57 A
58 B
59 C
60 A
61 B
62 A
63 D
64 A
65 A
66 A
67 A
68 A
69 A
70 C
71 B
72 A
73 A
74 A
75 A
76 A
77 D
78 C
79 A
80 A
81 D
82 B
83 D
84 A
85 B
86 A
87 C
88 C
89 A
90 C
91 B
92 D
93 A
94 A
95 B
96 A
97 B
98 A
99 B
100 B
101 A
102 D
103 D
104 A
105 C
106 A
107 B
108 B
109 C
110 B
111 D
112 A
113 A
114 C
115 A
116 D
117 C
118 A
119 B
120 A
121 A
122 D
123 A
124 A
125 C
126 D
127 C
128 B
129 D
130 C
131 B
132 C
133 A
134 C
135 A
136 B
137 D
138 A
139 C
140 C
141 A
142 A
143 D
144 B
145 C
146 A
147 A
148 C
149 D
150 B
151 A
152 B
153 A
154 B
155 D
156 A
157 D
158 A
159 A
160 A
161 C
162 A
163 B
164 A
165 B
166 C
167 D
168 A
169 D
170 C
171 C
172 B
173 A
174 C
175 A
176 D
177 B
178 D
179 A
180 D
181 A
182 A
183 A
184 D


In [27]:
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
# Minimum distance of each answer choice (A-D) for every training question.
myans = get_my_answer_all_distance(train, dictionary, df_wiki_vector, wiki_path)
# Save the distance table as CSV.
myans.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train_minimum_distance.csv', encoding='utf-8')

0 0.279508497187 0.279508497187 0.279508497187 0.279508497187
1 0.408248290464 0.408248290464 0.408248290464 0.408248290464
2 0.258198889747 0.258198889747 0.258198889747 0.258198889747
3 0.4472135955 0.4472135955 0.408248290464 0.408248290464
4 0.37267799625 0.390022779585 0.342727379489 0.382047844058
5 0.264575131106 0.256436419387 0.256436419387 0.256436419387
6 0.345795680681 0.349540869491 0.349540869491 0.366681885507
7 0.374350648863 0.357118140519 0.374350648863 0.374350648863
8 0.368567275724 0.352323635943 0.352323635943 0.352323635943
9 0.290081385603 0.282206298752 0.284858223105 0.277119910584
10 0.25893090163 0.25893090163 0.255261472546 0.255261472546
11 0.353553390593 0.353553390593 0.353553390593 0.353553390593
12 0.242076711427 0.242076711427 0.248964798866 0.242076711427
13 0.252412185438 0.252412185438 0.263157894737 0.264575131106
14 0.285714285714 0.285714285714 0.297921795862 0.311804782231
15 0.381924948997 0.381924948997 0.381924948997 0.403980197534
16 0.2672