In [210]:
import gensim
import logging
import os
import pandas as pd
import numpy as np
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

class MyDocument(object):
    """Iterable over a directory that yields one token list per file.

    Every iteration pass lists ``dirname`` with ``os.listdir`` and, for each
    entry, reads the whole file, decodes it as UTF-8 (undecodable bytes are
    replaced), lowercases the text and splits it on whitespace.
    """

    def __init__(self, dirname):
        # Directory whose entries are read each time the object is iterated.
        self.dirname = dirname

    def __iter__(self):
        for entry in os.listdir(self.dirname):
            entry_path = os.path.join(self.dirname, entry)
            with open(entry_path) as handle:
                raw_text = handle.read()
                decoded = unicode(raw_text, encoding='utf-8', errors='replace')
                tokens = decoded.lower().split()
                yield tokens

class MySentences(object):
    """Iterable over a directory that yields one token list per line of each file.

    Every iteration pass lists ``dirname`` with ``os.listdir``; each line of
    each entry is decoded as UTF-8 (undecodable bytes are replaced),
    lowercased and split on whitespace.
    """

    def __init__(self, dirname):
        # Directory whose entries are read each time the object is iterated.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # Use a context manager so each file handle is closed once its
            # lines are exhausted (or the generator is discarded) instead of
            # being left open for the garbage collector.
            with open(os.path.join(self.dirname, fname)) as sentence_file:
                for line in sentence_file:
                    line = unicode(line, encoding='utf-8', errors='replace')
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim dictionary from the files in ``path``, minus stop words.

    Parameters
    ----------
    path : str
        Directory whose files are tokenised line by line via ``MySentences``.

    Returns
    -------
    gensim.corpora.Dictionary
        Dictionary with every token from the module-level ``stoplist``
        removed and the remaining ids renumbered without gaps.
    """
    # Honour the caller's directory; it used to be ignored in favour of a
    # hard-coded path.
    dictionary = corpora.Dictionary(MySentences(path))
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words (rare words are NOT filtered here)
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    return dictionary

def get_document_tuple( path, dictionary ):
    """Convert every file in ``path`` into a sparse bag-of-words.

    Parameters
    ----------
    path : str
        Directory iterated with ``MyDocument`` (one token list per file).
    dictionary : object
        Object providing ``doc2bow(tokens)``, e.g. the dictionary returned by
        ``get_dictionary``.

    Returns
    -------
    list
        One ``dictionary.doc2bow(...)`` result per file, in the order
        ``MyDocument`` yields the files.
    """
    # A plain list is all that is returned; growing a pandas Series one
    # element at a time copied the data on every step.
    return [dictionary.doc2bow(content) for content in MyDocument(path)]

def transform_tuple_into_vector(document_tuple,dictionary):
    """Expand a sparse bag-of-words into a dense, L1-normalised vector.

    Parameters
    ----------
    document_tuple : iterable of (token_id, count) pairs
        Sparse representation of one document.
    dictionary : object
        Object with a ``token2id`` mapping; its length fixes the vector size.

    Returns
    -------
    pandas.Series
        Vector of length ``len(dictionary.token2id)`` holding each count at
        its token id, divided by the sum of absolute values. When nothing is
        set the norm is 0 and the division yields NaN entries.
    """
    vocabulary_size = len(dictionary.token2id)
    dense = pd.Series(np.zeros(vocabulary_size))  # start from an all-zero vector
    for token_id, count in document_tuple:
        dense[token_id] = count
    l1_norm = np.linalg.norm(dense, ord=1)
    return dense / l1_norm

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Build a dense matrix with one normalised column per document.

    Parameters
    ----------
    document_tuples : sequence
        Sparse bag-of-words documents, each a list of (token_id, count) pairs.
    dictionary : object
        Object with a ``token2id`` mapping; its length is the number of rows.

    Returns
    -------
    pandas.DataFrame
        Rows are token ids, columns are labelled 0..n-1 in input order, and
        column ``i`` is ``transform_tuple_into_vector(document_tuples[i])``.
        An empty input gives a frame with the vocabulary-sized index and no
        columns (previously a spurious all-zero column 0 was returned).
    """
    # Convert every sparse document to its dense form first ...
    columns = [transform_tuple_into_vector(document, dictionary)
               for document in document_tuples]
    if not columns:
        return pd.DataFrame(index=range(len(dictionary.token2id)))
    # ... then assemble the frame once, instead of inserting columns one by
    # one and wrapping each Series in a throwaway DataFrame.
    return pd.concat(columns, axis=1, keys=list(range(len(columns))))

def get_close_documents(string, dataframe, dictionary, topn):
    """Rank document columns by Euclidean distance to a query string.

    Parameters
    ----------
    string : str or unicode
        Query text. Byte strings are decoded as UTF-8 with undecodable bytes
        replaced; already-decoded text is used as is.
    dataframe : pandas.DataFrame
        Dense document vectors, one per column, with columns labelled
        0..n-1 (as produced by ``transform_tuples_into_dataframe``).
    dictionary : object
        Object providing ``doc2bow`` and ``token2id``.
    topn : int
        Number of closest documents to return.

    Returns
    -------
    pandas.Series
        Distances indexed by column number, sorted ascending, truncated to
        the ``topn`` smallest.
    """
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    # The dictionary and the document vectors are built from lowercased
    # tokens, so the query must be lowercased too or capitalised words never
    # match a dictionary entry.
    query_tokens = string.lower().split()
    first_vector = transform_tuple_into_vector(dictionary.doc2bow(query_tokens), dictionary)
    # Collect all distances first and build the Series once; appending to a
    # Series element by element copied it on every step.
    distances = [np.linalg.norm(first_vector - dataframe[i])
                 for i in range(len(dataframe.columns))]
    lengthlist = pd.Series(distances, dtype=float)
    return lengthlist.sort_values().head(topn)

def get_document_by_index(path,index):
    """Return ``(filename, text)`` for the ``index``-th entry of ``os.listdir(path)``.

    Positions follow whatever order ``os.listdir`` returns. The file is read
    in full and decoded as UTF-8 with undecodable bytes replaced. ``None`` is
    returned when no entry has the requested position.
    """
    for position, fname in enumerate(os.listdir(path)):
        if position != index:
            continue
        with open(os.path.join(path, fname)) as content_file:
            raw = content_file.read()
            text = unicode(raw, encoding='utf-8', errors='replace')
            return fname, text
    return None

def print_names_of_close_documents(path, close_documents_list, outputpath):
    file = open(outputpath + '/close_documents.txt', 'w+')
    for i in close_documents_list.index:
        name, text = get_document_by_index(path,i)
        print name
        file.write(text + '\n' + '\n' + '\n' + '\n')        
    print 'text saved as txt'

# English stop words that get_dictionary strips from the vocabulary.
# Written as one whitespace-separated unicode string and split into a list.
stoplist = (
    u'i me my myself we our ours ourselves you your yours '
    u'yourself yourselves he him his himself she her hers herself '
    u'it its itself they them their theirs themselves what which '
    u'who whom this that these those am is are was were be '
    u'been being have has had having do does did doing a an '
    u'the and but if or because as until while of at by for '
    u'with about against between into through during before after above '
    u'below to from up down in out on off over under again '
    u'further then once here there when where why how all any '
    u'both each few more most other some such no nor not only '
    u'own same so than too very s t can will just don '
    u'should now'
).split()

In [194]:
# Build the dictionary for the directory below and print its complete
# token -> id mapping.
dictionary_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # question text from training data
dictionary = get_dictionary(dictionary_path)
print (dictionary.token2id)

{u'xylem': 0, u'limited': 1, u'similarity': 2, u'phenotypical': 3, u'alleles': 4, u'magnetic': 5, u'saves': 6, u'desirable': 7, u'crumpled': 145, u'yellow': 9, u'chaos': 4364, u'four': 11, u'tensile': 12, u'heliocentric': 13, u'controversial': 14, u'consists': 15, u'oldest': 17, u'worked': 18, u'aggression': 19, u'poorly': 20, u'relationships': 21, u'whose': 22, u'fronts': 23, u'pedro': 24, u'calculate': 26, u'plumes': 3447, u'electricity': 27, u'powdery': 28, u'metamorphic': 29, u'seriously': 1711, u'supported': 4579, u'list': 2608, u'strawberries': 5157, u'investigation': 1712, u'swap': 33, u'caused': 2544, u'recycle': 35, u'herbicide': 36, u'shepherd': 4174, u'quartzite': 37, u'divergent': 38, u'humerus': 39, u'hormone': 40, u'risk': 41, u'downstream': 42, u'geology': 43, u'barium': 44, u'void': 45, u'oceans': 46, u'connects': 47, u'pigment': 49, u'replication': 50, u'every': 51, u'jack': 52, u'peripheral': 53, u'formula': 1715, u'overcomes': 55, u'vastly': 334, u'incubated': 3051, 

In [195]:
# Convert every file under wiki_path into a sparse bag-of-words and inspect
# the entry at position 3.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
wiki_tuple = get_document_tuple( wiki_path, dictionary )
wiki_tuple[3]

# Each wiki_tuple entry is in condensed (sparse) form: a list of (token id, count) pairs.
# It has to be expanded into a full-length vector before distances can be computed.

[(11, 2),
 (114, 1),
 (316, 1),
 (371, 1),
 (691, 2),
 (713, 1),
 (862, 1),
 (865, 1),
 (1231, 1),
 (1256, 1),
 (1291, 1),
 (1396, 1),
 (1557, 1),
 (1701, 1),
 (1702, 1),
 (1798, 1),
 (1836, 1),
 (1876, 1),
 (1886, 1),
 (1913, 1),
 (2101, 1),
 (2216, 1),
 (2348, 2),
 (2375, 1),
 (2385, 1),
 (2410, 1),
 (2522, 1),
 (2542, 1),
 (2643, 1),
 (2717, 1),
 (2735, 1),
 (2756, 1),
 (2825, 8),
 (2876, 1),
 (2940, 1),
 (2944, 1),
 (2954, 1),
 (2980, 1),
 (3073, 1),
 (3091, 1),
 (3150, 1),
 (3203, 1),
 (3226, 1),
 (3317, 1),
 (3370, 1),
 (3458, 2),
 (3465, 2),
 (3484, 1),
 (3570, 1),
 (3664, 1),
 (3715, 2),
 (3810, 1),
 (3844, 1),
 (3866, 1),
 (4111, 1),
 (4175, 1),
 (4281, 1),
 (4285, 1),
 (4360, 1),
 (4386, 1),
 (4506, 1),
 (4592, 1),
 (4636, 1),
 (4667, 1),
 (4704, 1),
 (4790, 4),
 (4844, 4),
 (5067, 1),
 (5127, 1),
 (5136, 1),
 (5168, 1)]

In [196]:
# Number of tokens in the dictionary; this is also the length of every dense
# document vector built from it.
print 'dictionary dimension is', len(dictionary.token2id)

dictionary dimension is 5287


In [197]:
# Expand the sparse documents in wiki_tuple into one dense column each.
# (The argument used to be the undefined name `wiki_vector`, which fails on a
# fresh kernel.)
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary)


In [198]:
df_wiki_vector.head(15)
# Each column holds the normalised vector of one Wikipedia page; each row is one dictionary token id.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2131,2132,2133,2134,2135,2136,2137,2138,2139,2140
0,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
1,0,0.0,0,0.0,0.0,0.000536,0,0,0,0,...,0.0,0.000611,0.001134,0.0,0,0,0.002725,0,0.000805,0.0
2,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.000567,0.0,0,0,0.0,0,0.0,0.0
3,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
4,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
5,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
6,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0
7,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.000263,0.0,0.0,0.00062,0,0,0.0,0,0.0,0.0
8,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0.0,0,0.000403,0.0
9,0,0.0,0,0.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.005105,0.0,0,0,0.0,0,0.0,0.0


In [249]:
import pandas as pd
import os
# Load the training questions. The TSV file name is relative, so it is
# resolved against the data directory entered by os.chdir below.
path = '/Users/MK/GitHub/the_answer_is/data'
os.chdir(path)  # note: changes the working directory for all later cells
train = pd.read_table('training_set.tsv',sep = '\t')
train.head(20)

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100001,"When athletes begin to exercise, their heart r...",C,at the tissue level,at the organ level,at the system level,at the cellular level
1,100002,Which example describes a learned behavior in ...,C,smelling the air for odors,barking when disturbed,sitting on command,digging in soil
2,100003,"When two nuclei are combined into one nucleus,...",D,conversion,reaction,fission,fusion
3,100004,Which is a distinction between an epidemic and...,B,the symptoms of the disease,the geographical area affected,the species of organisms infected,the season in which the disease spreads
4,100005,In which way is the orbit of a comet different...,B,The orbit of Earth is less circular than the o...,The orbit of a comet is more elliptical than t...,The orbital period of Earth is much longer tha...,The orbital period of a comet is more predicta...
5,100006,A teacher builds a model of a hydrogen atom. A...,B,number of particles,relative mass of particles,types of particles present,charges of particles present
6,100007,Which substance should a student apply to the ...,A,water,vinegar,salt,formaldehyde
7,100008,What is the main source of energy for the wate...,A,the Sun,fossil fuels,clouds,the ocean
8,100009,Which has the greatest effect on aiding the mo...,D,tension,friction,density,gravity
9,100010,"Over time, non-volcanic mountains can form due...",C,oceanic plates colliding with oceanic plates,oceanic plates separating from oceanic plates,continental plates colliding with continental ...,continental plates separating from continental...


In [266]:
q = train.ix[2][1]
print q

When two nuclei are combined into one nucleus, there is a slight change in mass and the release of a large amount of energy. What is this process called?


In [267]:
# Rank the document columns by distance to the question and keep the 50 closest.
# The printed Series maps document column number -> distance, smallest first.
close_documents = get_close_documents(q, df_wiki_vector, dictionary,50)
print close_documents

1178    0.289175
211     0.292844
406     0.294458
1177    0.294606
1256    0.294959
1280    0.295250
1282    0.295251
230     0.295251
409     0.295251
1257    0.295750
1797    0.295750
900     0.295750
1082    0.296075
343     0.296567
666     0.296780
1366    0.296808
431     0.296808
108     0.296845
640     0.296887
408     0.297185
404     0.297185
491     0.297445
1173    0.297445
1430    0.297445
1464    0.297445
1176    0.297445
2098    0.297445
1894    0.297445
819     0.297445
159     0.297805
157     0.297805
327     0.297899
106     0.297977
1723    0.297977
1334    0.297977
109     0.297977
1686    0.297977
125     0.297977
422     0.297977
984     0.297977
1524    0.297977
1904    0.297977
113     0.297977
114     0.297977
727     0.297977
112     0.297977
754     0.298265
736     0.298601
1051    0.298629
1557    0.298635
dtype: float64


In [268]:
# Print the file names of the closest documents and write their text to
# close_documents.txt inside outputpath.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
outputpath = '/Users/MK/GitHub/the_answer_is/data'
print_names_of_close_documents(wiki_path, close_documents,outputpath)

mass_vs_weight_in_physical_science.txt_to_unicode_remove_stopwords.txt
calculating_atomic_mass.txt_to_unicode_remove_stopwords.txt
conservation_of_mass_in_chemical_reactions.txt_to_unicode_remove_stopwords.txt
mass_vs_weight.txt_to_unicode_remove_stopwords.txt
molality.txt_to_unicode_remove_stopwords.txt
momentum.txt_to_unicode_remove_stopwords.txt
momentum_in_physical_science.txt_to_unicode_remove_stopwords.txt
cancer_in_life_science.txt_to_unicode_remove_stopwords.txt
conservation_of_momentum_in_one_dimension.txt_to_unicode_remove_stopwords.txt
molar_mass.txt_to_unicode_remove_stopwords.txt
si_mass_and_weight_units.txt_to_unicode_remove_stopwords.txt
heats_of_vaporization_and_condensation.txt_to_unicode_remove_stopwords.txt
large_intestine.txt_to_unicode_remove_stopwords.txt
CK12original_text.txt_to_unicode_remove_stopwords.txt
enthalpy.txt_to_unicode_remove_stopwords.txt
nuclear_fusion_in_physical_science.txt_to_unicode_remove_stopwords.txt
cooling_systems_in_physical_science.txt_to