In [1]:
import gensim
import logging
import os
import pandas as pd
import numpy as np
from scipy import spatial
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

class MyDocument(object):
    """Iterable over a directory that yields one token list per file.

    Every entry returned by ``os.listdir(dirname)`` is opened, read in full,
    decoded as UTF-8 (undecodable bytes are replaced), lower-cased and split
    on whitespace.
    """

    def __init__(self, dirname):
        # Directory whose entries are treated as documents.
        self.dirname = dirname

    def __iter__(self):
        """Yield the lower-cased whitespace tokens of each file, in os.listdir order."""
        for entry in os.listdir(self.dirname):
            full_path = os.path.join(self.dirname, entry)
            with open(full_path) as handle:
                raw_text = handle.read()
                decoded = unicode(raw_text, encoding='utf-8', errors='replace')
                yield decoded.lower().split()

class MySentences(object):
    """Iterable over a directory that yields one token list per line of each file.

    Every entry returned by ``os.listdir(dirname)`` is opened and read line by
    line; each line is decoded as UTF-8 (undecodable bytes are replaced),
    lower-cased and split on whitespace.
    """

    def __init__(self, dirname):
        # Directory whose entries are read line by line.
        self.dirname = dirname

    def __iter__(self):
        """Yield the lower-cased whitespace tokens of every line, file by file."""
        for fname in os.listdir(self.dirname):
            # The context manager closes each handle once its lines are consumed;
            # a bare open() in the loop header left one open handle per file.
            with open(os.path.join(self.dirname, fname)) as sentence_file:
                for line in sentence_file:
                    line = unicode(line, encoding='utf-8', errors='replace')
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim dictionary from the files under ``path``, minus stop words.

    Each line of each file in ``path`` is treated as one document (see
    MySentences). Tokens listed in the module-level ``stoplist`` are removed
    and the remaining ids are renumbered without gaps.

    Parameters
    ----------
    path : str
        Directory whose files supply the vocabulary.

    Returns
    -------
    gensim.corpora.Dictionary
    """
    # Use the caller's directory; the previous hard-coded path silently ignored `path`.
    dictionary = corpora.Dictionary(MySentences(path))
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words only; rare words are kept
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    return dictionary

def get_document_tuple( path, dictionary ):
    """Convert every file under ``path`` into its bag-of-words representation.

    Parameters
    ----------
    path : str
        Directory iterated by MyDocument (one document per file).
    dictionary :
        Object providing ``doc2bow(tokens)``.

    Returns
    -------
    list
        One ``dictionary.doc2bow(...)`` result per file, in MyDocument's
        iteration order.
    """
    # A plain list is what callers receive anyway; building it directly avoids
    # growing a pandas Series one element at a time through the removed
    # Series.set_value API.
    return [dictionary.doc2bow(content) for content in MyDocument(path)]

def transform_tuple_into_vector(document_tuple,dictionary):
    """Expand a sparse bag-of-words list into a dense, L1-normalised Series.

    Parameters
    ----------
    document_tuple : list of (token_id, count) pairs
    dictionary : object whose ``token2id`` mapping fixes the vector length

    Returns
    -------
    pandas.Series
        One entry per dictionary token. All zeros when ``document_tuple`` is
        the empty list, otherwise the counts divided by their L1 norm.
    """
    vocabulary_size = len(dictionary.token2id)
    dense = pd.Series(np.zeros(vocabulary_size))
    if document_tuple == []:
        # Nothing to place and nothing to normalise.
        return dense
    for token_id, count in document_tuple:
        dense[token_id] = count
    l1_norm = np.linalg.norm(dense, ord = 1)
    return dense / l1_norm

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Build a dense frame with one normalised vector column per document.

    Parameters
    ----------
    document_tuples : sequence of bag-of-words lists
        Each item is passed to transform_tuple_into_vector.
    dictionary : object whose ``token2id`` mapping fixes the number of rows

    Returns
    -------
    pandas.DataFrame
        Rows are dictionary token ids; column ``i`` is the vector of
        ``document_tuples[i]``. For an empty input the frame holds a single
        all-zero column labelled 0, as it always has.
    """
    if len(document_tuples) == 0:
        # Keep the historical shape for an empty corpus.
        return pd.DataFrame(pd.Series(np.zeros(len(dictionary.token2id))))
    columns = [transform_tuple_into_vector(document_tuple, dictionary)
               for document_tuple in document_tuples]
    # Assemble all columns in one step instead of inserting them one by one
    # into a growing frame; keys pins the column labels to 0..n-1.
    return pd.concat(columns, axis=1, keys=list(range(len(columns))))

def get_close_documents(string, dataframe, dictionary, topn):
    """Return the ``topn`` document columns closest to ``string`` by cosine distance.

    Parameters
    ----------
    string : byte string (decoded as UTF-8, bad bytes replaced) or already-decoded text
    dataframe : frame whose columns 0..n-1 are document vectors
    dictionary : object providing ``doc2bow`` and ``token2id``
    topn : int
        Number of nearest documents to keep.

    Returns
    -------
    pandas.Series
        Cosine distances indexed by column number, ascending, at most
        ``topn`` entries.
    """
    # Decode only raw bytes: calling unicode(..., encoding=...) on text that is
    # already decoded raises TypeError.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    string = string.lower()
    first_vector = transform_tuple_into_vector(dictionary.doc2bow(string.split()), dictionary )
    # Collect distances in a list and build the Series once, rather than
    # growing it element by element through the removed Series.set_value API.
    distances = [spatial.distance.cosine(first_vector, dataframe[j])
                 for j in range(len(dataframe.columns))]
    lengthlist = pd.Series(distances, dtype=float)
    return lengthlist.sort_values().head(topn)

def get_document_by_index(path,index):
    """Return ``(filename, content)`` of the ``index``-th entry of ``os.listdir(path)``.

    The content is decoded as UTF-8 with undecodable bytes replaced.
    Returns None when no entry sits at position ``index``.
    """
    for position, fname in enumerate(os.listdir(path)):
        if position != index:
            continue
        with open(os.path.join(path, fname)) as content_file:
            raw = content_file.read()
        return fname, unicode(raw, encoding='utf-8', errors='replace')
    return None

def print_names_of_close_documents(path, close_documents_list, outputpath):
    """Print the file names of the selected documents and save their text.

    Parameters
    ----------
    path : str
        Directory the documents are read from (via get_document_by_index).
    close_documents_list : pandas.Series
        Its index holds the positions of the documents to report.
    outputpath : str
        Directory in which ``close_documents.txt`` is (re)written.
    """
    import io  # function-scope import keeps this definition self-contained

    output_file = os.path.join(outputpath, 'close_documents.txt')
    # io.open with an explicit encoding accepts the decoded text returned by
    # get_document_by_index (a byte-mode file fails on non-ASCII characters
    # under Python 2), and the context manager flushes and closes the file.
    with io.open(output_file, 'w', encoding='utf-8') as out:
        for i in close_documents_list.index:
            name, text = get_document_by_index(path,i)
            print(name)
            out.write(text + u'\n\n\n\n')
    print('text saved as txt')

def get_contents_of_close_documents_by_paragraph(path, close_documents_list):
    """Concatenate the selected documents and split the result on newlines.

    Each document named by ``close_documents_list.index`` is fetched with
    get_document_by_index and prefixed with a newline, so the returned list
    always starts with an empty string.
    """
    pieces = []
    for doc_index in close_documents_list.index:
        _name, body = get_document_by_index(path, doc_index)
        pieces.append('\n' + body)
    return ''.join(pieces).split('\n')


def get_distance_between_two_documents(A, B, dictionary):
    """Return the Euclidean distance between the normalised vectors of two texts.

    Both texts are stripped of every character other than ASCII letters and
    spaces, lower-cased, split on whitespace and turned into vectors through
    ``dictionary.doc2bow`` and transform_tuple_into_vector. A falsy text is
    replaced by the single token 'the'.
    """
    import re

    def as_vector(text):
        # Same pipeline for both arguments.
        if text:
            tokens = re.sub(r'[^a-zA-Z ]',r'',text).lower().split()
        else:
            tokens = ['the']        # just in case the text is empty.
        return transform_tuple_into_vector(dictionary.doc2bow(tokens), dictionary)

    vector_a = as_vector(A)
    vector_b = as_vector(B)
    # Cosine distance (spatial.distance.cosine) would be preferable, but the
    # vectors are so sparse that most of the time the output is 0.
    # So Euclidean distance is used instead.
    return np.linalg.norm(vector_a - vector_b)


# Lower-case English stop words. get_dictionary looks each of these up in the
# dictionary's token2id mapping and removes the ones it finds.
stoplist = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
             u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 
             u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 
             u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 
             u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 
             u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', 
             u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', 
             u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', 
             u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', 
             u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', 
             u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', 
             u'should', u'now']

In [2]:
# Build the vocabulary by calling get_dictionary with dictionary_path, then print its full token -> id mapping.
dictionary_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # question text from training data
dictionary = get_dictionary(dictionary_path)
print (dictionary.token2id)

{u'xylem': 0, u'limited': 1, u'similarity': 2, u'phenotypical': 3, u'alleles': 4, u'magnetic': 5, u'saves': 6, u'desirable': 7, u'crumpled': 145, u'yellow': 9, u'chaos': 4364, u'four': 11, u'tensile': 12, u'heliocentric': 13, u'controversial': 14, u'consists': 15, u'oldest': 17, u'worked': 18, u'aggression': 19, u'poorly': 20, u'relationships': 21, u'whose': 22, u'fronts': 23, u'pedro': 24, u'calculate': 26, u'plumes': 3446, u'electricity': 27, u'powdery': 28, u'metamorphic': 29, u'seriously': 1709, u'supported': 4579, u'list': 2607, u'strawberries': 5157, u'investigation': 1710, u'swap': 33, u'caused': 2543, u'recycle': 35, u'herbicide': 36, u'shepherd': 4173, u'quartzite': 37, u'divergent': 38, u'humerus': 39, u'hormone': 40, u'risk': 41, u'downstream': 42, u'geology': 43, u'barium': 44, u'void': 45, u'oceans': 46, u'connects': 47, u'pigment': 49, u'replication': 50, u'every': 51, u'jack': 52, u'peripheral': 53, u'formula': 1713, u'overcomes': 55, u'vastly': 334, u'incubated': 3050, 

In [3]:
# Convert every file under wiki_path into its bag-of-words form and inspect the fourth document.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
wiki_tuple = get_document_tuple( wiki_path, dictionary )
wiki_tuple[3]

# Each entry of wiki_tuple is in condensed (token_id, count) form. It has to be expanded into a full-length vector before distances can be computed.

[(11, 2),
 (114, 1),
 (316, 1),
 (371, 1),
 (691, 2),
 (713, 1),
 (861, 1),
 (864, 1),
 (1230, 1),
 (1255, 1),
 (1290, 1),
 (1395, 1),
 (1556, 1),
 (1699, 1),
 (1700, 1),
 (1796, 1),
 (1835, 1),
 (1875, 1),
 (1885, 1),
 (1912, 1),
 (2100, 1),
 (2215, 1),
 (2347, 2),
 (2374, 1),
 (2384, 1),
 (2409, 1),
 (2521, 1),
 (2541, 1),
 (2642, 1),
 (2716, 1),
 (2734, 1),
 (2755, 1),
 (2824, 8),
 (2875, 1),
 (2939, 1),
 (2943, 1),
 (2953, 1),
 (2979, 1),
 (3072, 1),
 (3090, 1),
 (3149, 1),
 (3202, 1),
 (3225, 1),
 (3316, 1),
 (3369, 1),
 (3457, 2),
 (3464, 2),
 (3483, 1),
 (3569, 1),
 (3663, 1),
 (3714, 2),
 (3809, 1),
 (3843, 1),
 (3865, 1),
 (4110, 1),
 (4174, 1),
 (4280, 1),
 (4284, 1),
 (4360, 1),
 (4386, 1),
 (4506, 1),
 (4592, 1),
 (4636, 1),
 (4667, 1),
 (4704, 1),
 (4790, 4),
 (4844, 4),
 (5067, 1),
 (5127, 1),
 (5136, 1),
 (5168, 1)]

In [4]:
# Report the vocabulary size, i.e. the length of every document vector.
print('dictionary dimension is %d' % len(dictionary.token2id))

dictionary dimension is 5287


In [5]:
# Expand the condensed bag-of-words entries into one full-length, L1-normalised column per document.
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary)         


In [6]:
df_wiki_vector.head(15)
# Each column is the vector of one Wikipedia page; each row corresponds to one dictionary token id.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2131,2132,2133,2134,2135,2136,2137,2138,2139,2140
0,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
1,0,0.0,0,0.0,0.0,0.000536,0,0,0,0,...,0.0,0.000611,0.001134,0.0,0,0,0.002725,0,0.000805,0.0
2,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.000567,0.0,0,0,0.0,0,0.0,0.0
3,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
4,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
5,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
6,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
7,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.000263,0.0,0.0,0.00062,0,0,0.0,0,0.0,0.0
8,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.000403,0.0
9,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.005105,0.0,0,0,0.0,0,0.0,0.0


In [7]:
import pandas as pd
import os
# Load the tab-separated training questions from the data directory.
path = '/Users/MK/GitHub/the_answer_is/data'
os.chdir(path)
# header=None is the documented way to say "the file has no header row";
# a negative header value is rejected by current pandas releases.
train = pd.read_table('training_set_blank.tsv',sep = '\t',header = None)
train.columns = ['id', 'question', 'correctAnswer', 'answerA', 'answerB', 'answerC', 'answerD']
train.head(20)

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100028,When light passes through the lens of your eye...,D,a smaller,a larger,an upright,an inverse
1,100035,"Populations respond, over time, to changing en...",A,adaptation,competition,ecological succession,extinction
2,100036,The number and sequence of amino acids in a pr...,D,chemical properties,shape,size,All of the above
3,100050,When the core of a protostar has reached aroun...,B,red giant,star,super red giant,black hole
4,100053,Male testes are located in an external scrotum...,A,normal body temperature is too warm for sperm ...,an external scrotum offers better protection f...,sperm is produced in the testes and stored in ...,sperm cells must mix with seminal fluid before...
5,100065,The process of moving gases into and out of an...,D,the digestive system,the nervous system,the excretory system,the respiratory system
6,100070,The relationship between force and mass is rep...,C,increase,not change,decrease,change direction
7,100090,The function of the mitochondria in plant cell...,A,"release energy for cell growth, development, a...",attach carbohydrates and lipids to proteins,make energy-rich food molecules,assemble proteins
8,100098,Fossil evidence can be used to tell us about a...,D,biological diversity (the kinds of organisms t...,episodic speciation (new species suddenly appe...,mass extinction (sudden disappearance of all e...,mutation rate (how fast mutations occur within...
9,100102,Paleontologists find and study fossilized orga...,D,ancient life forms in younger layers of sedime...,recently evolved organisms in undisturbed sedi...,ancient life forms in overturned sedimentary r...,recently evolved organisms in younger layers o...


In [8]:
# Sample question: the 'question' column (column position 1) of the row labelled 3.
# .loc replaces the removed .ix indexer and names the column explicitly.
q = train.loc[3, 'question']
print(q)

When the core of a protostar has reached around 10 million K, the pressure within it is so immense that nuclear fusion of hydrogen begins and a __________ is born.


In [9]:
# Rank every column of df_wiki_vector by cosine distance to the sample question and keep the 5 nearest.
close_documents = get_close_documents(q, df_wiki_vector, dictionary,5)
print close_documents

1365    0.731508
964     0.733596
962     0.733596
1861    0.733596
412     0.734420
dtype: float64


In [10]:
# Print the file names of the nearest documents, write their text to
# <outputpath>/close_documents.txt, and collect the same text split on newlines.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
outputpath = '/Users/MK/GitHub/the_answer_is/data'
print_names_of_close_documents(wiki_path, close_documents,outputpath)
merged = get_contents_of_close_documents_by_paragraph(wiki_path, close_documents)

nuclear_fusion.txt_to_unicode_remove_stopwords.txt
hydrogen_bonding_in_physical_science.txt_to_unicode_remove_stopwords.txt
hydrogen_and_alkali_metals_in_physical_science.txt_to_unicode_remove_stopwords.txt
spectral_lines_of_hydrogen.txt_to_unicode_remove_stopwords.txt
conserving_water.txt_to_unicode_remove_stopwords.txt
text saved as txt


In [11]:
# Show the newline-separated pieces of the nearest documents (the whole list is printed).
print merged

[u'', u'nuclear physics nuclear fusion nuclear reaction two atomic nuclei come close collide high speed join form new nucleus process matter conserved matter fusing nuclei converted photons energy fusion process powers active main sequence stars', u'fusion two nuclei lower masses iron along nickel largest binding energy per nucleon generally releases energy fusion nuclei heavier iron absorbs energy opposite true reverse process nuclear fission means generally lighter elements fusable hydrogen helium likewise generally heavier elements fissionable uranium plutonium extreme astrophysical events lead short periods fusion heavier nuclei process gives rise nucleosynthesis creation heavy elements events supernova', u'following discovery quantum tunneling physicist friedrich hund robert atkinson fritz houtermans used measured masses light elements predict large amounts energy could released fusing small nuclei building upon nuclear transmutation experiments ernest rutherford carried several y

In [12]:
def get_my_answer(train, dictionary, df_vector, path):
    """Pick an answer letter for every question in ``train``.

    For each row, the 5 documents nearest to the question are fetched from
    ``path`` and split into paragraphs. Every answer choice is scored by the
    smallest positive distance between "question + choice" and any paragraph;
    the choice with the smallest score wins (ties go to the earlier choice).

    Parameters
    ----------
    train : pandas.DataFrame
        Column position 1 holds the question text; positions 3-6 hold the
        answer choices A-D.
    dictionary :
        Passed through to the vectorising helpers.
    df_vector : pandas.DataFrame
        Document vectors, one column per document, given to get_close_documents.
    path : str
        Directory holding the documents behind ``df_vector``'s columns.

    Returns
    -------
    pandas.Series
        One of 'A', 'B', 'C', 'D' per row, indexed like ``train``.
    """
    convert_answer = {3: 'A', 4: 'B', 5: 'C', 6: 'D' }
    answer_columns = [3, 4, 5, 6]        # the columns where the answer options lie.
    answers = []
    for i in range(len(train)):          # loop through all questions
        row = train.iloc[i]
        question = row.iloc[1]
        close_documents = get_close_documents(question, df_vector, dictionary,5)
        # Read from the directory the caller passed in, not from a notebook global.
        merged = get_contents_of_close_documents_by_paragraph(path, close_documents)
        best_distance = {}               # best (smallest) distance per answer column.
        for j in answer_columns:
            A = question + ' ' + row.iloc[j]       # question + each answer choice.
            dist_list = []               # distances between A and all paragraphs in close documents.
            for paragraph in merged:
                distance = get_distance_between_two_documents(A, paragraph, dictionary)
                if distance > 0:         # to disregard nan values (this also drops exact zeros)
                    dist_list.append( distance )
            # A choice without any usable distance must never win, and must not
            # crash min() on an empty list.
            best_distance[j] = min(dist_list) if dist_list else float('inf')
        # min() returns the first minimal column, i.e. the column label itself,
        # so the lookup does not depend on Series.argmin semantics.
        chosen = convert_answer[ min(answer_columns, key=lambda col: best_distance[col]) ]
        answers.append(chosen)
        print('%d %s' % (i, chosen))
    return pd.Series(answers, index=train.index, dtype=object)

In [13]:
# Answer every training question using the Wikipedia document vectors.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
myans = get_my_answer(train, dictionary, df_wiki_vector, wiki_path)


0 A
1 C
2 A
3 A
4 A
5 A
6 A
7 A
8 B
9 A
10 A
11 C
12 A
13 C
14 D
15 A
16 C
17 C
18 C
19 A
20 A
21 B
22 C
23 B
24 A
25 C
26 C
27 A
28 A
29 D
30 D
31 B
32 A
33 A
34 C
35 C
36 A
37 A
38 B
39 A
40 A
41 A
42 C
43 C
44 B
45 A
46 A
47 B
48 B
49 A
50 A
51 A
52 A
53 A
54 A
55 D
56 A
57 C
58 B
59 B
60 B
61 B
62 A
63 C
64 B
65 C
66 D
67 C
68 B
69 A
70 A
71 C
72 D
73 B
74 D
75 A
76 D
77 C
78 A
79 D
80 C
81 C
82 A
83 A
84 B
85 A
86 A
87 B
88 A
89 A
90 A
91 C
92 D
93 B
94 D
95 A
96 A
97 D
98 D
99 B
100 A
101 D
102 A
103 C
104 C
105 B
106 D
107 C
108 A
109 C
110 A
111 C
112 B
113 B
114 A
115 B
116 B
117 C
118 A
119 A
120 C
121 D
122 B
123 C
124 B
125 A
126 B
127 C
128 A
129 C
130 D
131 A
132 B
133 D
134 D
135 C
136 A
137 A
138 B
139 D
140 A
141 C
142 D
143 A
144 B
145 D
146 A
147 D
148 B
149 B
150 D
151 D
152 B
153 B
154 C
155 A
156 B
157 D
158 A
159 B
160 A
161 C
162 D
163 A
164 A
165 B
166 B
167 A
168 A
169 A
170 A
171 C
172 C
173 D
174 B
175 A
176 A
177 A
178 C
179 C
180 A
181 C
182 D
183 A
184 C


In [14]:
train['fetch_doc_ws_blank_answer'] = myans
train['fetch_doc_ws_blank_correct'] = (train['correctAnswer'] == train['fetch_doc_ws_blank_answer'])
print 'percent correct is ' , train['fetch_doc_ws_blank_correct'].sum(axis =0) / (len(train) + 0.0)

percent correct is  0.306049822064


In [15]:
# Save the training frame, with the answer and correctness columns added above, as a UTF-8 CSV file.
train.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_blank.csv', encoding='utf-8')