In [2]:
import gensim
import logging
import os
import pandas as pd
import numpy as np
from scipy import spatial
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

class MyDocument(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as content_file:
                content = content_file.read()  
                content = unicode(content, encoding='utf-8', errors='replace')
                yield content.lower().split()

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                line = unicode(line, encoding='utf-8', errors='replace')
                yield line.lower().split()

def get_dictionary(path):
    dictionary = corpora.Dictionary( MySentences(path) )
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids) # remove stop words and words that appear only once
    dictionary.compactify() # remove gaps in id sequence after words that were removed
    return dictionary

def get_document_tuple( path, dictionary ):
    vector = pd.Series()
    doclist = MyDocument(path) 
    for i,content in enumerate(doclist):
        vector = vector.set_value(i,dictionary.doc2bow(content))
    return list(vector)

def transform_tuple_into_vector(document_tuple,dictionary):
    vector = pd.Series(np.zeros(len(dictionary.token2id)))        #0 array for storing wiki document vectors.
    if not document_tuple == []:
        for onetuple in document_tuple: 
            vector[onetuple[0]] = onetuple[1]
        vector = vector / np.linalg.norm(vector, ord = 1)                       #normalize vector     
        return vector 
    else:
        return vector

def transform_tuples_into_dataframe(document_tuples, dictionary):
    length = len(document_tuples) 
    vector_initialization = pd.Series(np.zeros(len(dictionary.token2id)))  #0 array for storing wiki document vectors. 
    df_vector = pd.DataFrame(vector_initialization)  #initialize dataframe. all vectors will be stored. 
    for i in xrange(len(document_tuples)):           
        #for each wiki documents, we will transform wiki vectors in tuple form into
        #vectors in ususal form. 
        vector = transform_tuple_into_vector( document_tuples[i], dictionary )
        df_vector[i]= pd.DataFrame(vector)
    return df_vector

def get_close_documents(string, dataframe, dictionary, topn, path):
    #string = unicode(string, encoding='utf-8', errors='replace').lower()
    #print string
    string = string_stemmer(string)
    first_vector = transform_tuple_into_vector(dictionary.doc2bow(string.split()), dictionary )
    #print first_vector
    lengthlist = pd.Series()
    for j in xrange(len(dataframe.columns)):
        lengthlist = lengthlist.set_value( j, spatial.distance.cosine(first_vector, dataframe[j]))
        lengthlist = lengthlist.sort_values().head(topn)
        #now we have a topn index of the close documents. 

    namelist = pd.DataFrame(columns = ['name', 'content'])
    namelist['distance'] = lengthlist
    for i in lengthlist.index:
        name, text = get_document_by_index(path,i)
        namelist.set_value(i, 'name', name )
        namelist.set_value(i, 'content', text)
           
    return namelist


def get_document_by_index(path,index):
    # this gets the filename and content of the document in a directory by index. 
    i= 0
    for fname in os.listdir(path):
        if i == index:
            with open(os.path.join(path, fname)) as content_file:
                content = content_file.read()  
                content = unicode(content, encoding='utf-8', errors='replace')
                return fname, content
        i= i + 1
    return None



def string_stemmer(line):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    line = re.sub(r'[^a-zA-Z ]',r'',line)
    line = line.split()
    line = [word for word in line if word not in stopwords.words('english')]  # remove the stop words. 
    output = []
    for word in line:
        output.append(stemmer.stem(word))     #stem all words 
    output = ' '.join(output)           # join the list to make a string
    return output






stoplist = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
             u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 
             u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 
             u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 
             u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 
             u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', 
             u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', 
             u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', 
             u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', 
             u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', 
             u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', 
             u'should', u'now']



In [4]:
#get close document names for each question. 

#setup
wiki_folder_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_stemmed_all_merged'
train_file_path = '/Users/MK/GitHub/the_answer_is/data/training_set.tsv'
dictionary_folder_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  #this folder contains one file which is stemmed. 

def initialize_run_close_document(wiki_folder_path, dictionary_folder_path):
    dictionary = get_dictionary(dictionary_folder_path)
    wiki_tuple = get_document_tuple( wiki_folder_path, dictionary )
    df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary) 
    return dictionary, df_wiki_vector

def run_close_document( main_text , wiki_folder_path, dictionary, df_wiki_vector, topn):
    close_document =  get_close_documents(main_text, df_wiki_vector, dictionary, topn, wiki_folder_path)
    return close_document

dictionary, df_wiki_vector = initialize_run_close_document(wiki_folder_path, dictionary_folder_path)



In [74]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

#train the model using wikipdeida data                
sentences = MySentences('/Users/MK/GitHub/the_answer_is/data/wikipedia_from_all_ck_words_stemmed') # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences,size=300, window=5, min_count=1, workers=4)
#procedure needed for deleting words not in the training set. 
def only_the_words_in_index( list, index ):
    output = []
    for a, s in enumerate(list):
        if s in index:
            output.append(list[a])
    return output

def string_stemmer(line):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    line = re.sub(r'[^a-zA-Z ]',r'',line)
    line = line.split()
    line = [word for word in line if word not in stopwords.words('english')]  # remove the stop words. 
    output = []
    for word in line:
        output.append(stemmer.stem(word))     #stem all words 
    output = ' '.join(output)           # join the list to make a string
    return output

def split_result(df):
    split = []
    for sentence in df['content'].iloc[0].split('\n'):
        split = split + sentence.split()
    return split



In [71]:
train = pd.read_table('/Users/MK/GitHub/the_answer_is/data/training_set_which.tsv',sep = '\t')
i = 3
close = run_close_document( train.ix[i][1] , wiki_folder_path, dictionary, df_wiki_vector, 1)
print train.ix[i][1]
close

Some areas of the world are experiencing more desert-like conditions. This change most favors survival of species with the ability to do which of the following?


Unnamed: 0,name,content,distance
6660,snapdragon.txt_to_unicode_remove_stopwords_and...,antirrhinum genus plant common known dragon fl...,0.695487


In [79]:
train = pd.read_table('/Users/MK/GitHub/the_answer_is/data/training_set_which.tsv',sep = '\t')
myanswer_list = []
myanswer_distance = pd.DataFrame(np.zeros(4).reshape(1,4), columns = ['A','B','C','D'])    #initialize dataframe to store my answers distance
correct = 0
length = len(train)
for i in xrange(len(train)):
    close = run_close_document( train.ix[i][1] , wiki_folder_path, dictionary, df_wiki_vector, 1)
    close = only_the_words_in_index( split_result(close), model.index2word)
    
    q = train.ix[i][1]
    a1 = train.ix[i][3]
    a2 = train.ix[i][4]
    a3 = train.ix[i][5]
    a4 = train.ix[i][6]

    q = string_stemmer(q)
    a1 = string_stemmer(a1)
    a2 = string_stemmer(a2)
    a3 = string_stemmer(a3)
    a4 = string_stemmer(a4)

    q  = q.split()
    a1 = a1.split()
    a2 = a2.split()
    a3 = a3.split()
    a4 = a4.split()

    q =  only_the_words_in_index( q, model.index2word)
    a1 = only_the_words_in_index( a1, model.index2word)
    a2 = only_the_words_in_index( a2, model.index2word)
    a3 = only_the_words_in_index( a3, model.index2word)
    a4 = only_the_words_in_index( a4, model.index2word)
    
    a1 = q + a1
    a2 = q + a2
    a3 = q + a3
    a4 = q + a4
    
    answer_similarity = np.array([['A',model.n_similarity(close, a1)],['B',model.n_similarity(close, a2)],
                                  ['C',model.n_similarity(close, a3)],['D',model.n_similarity(close, a4)]])
    
    myanswer_distance.set_value(i, 'A', answer_similarity[0,1] )    #write down distance for each choice
    myanswer_distance.set_value(i, 'B', answer_similarity[1,1] )
    myanswer_distance.set_value(i, 'C', answer_similarity[2,1] )
    myanswer_distance.set_value(i, 'D', answer_similarity[3,1] )
    
    #myanswer_index = answer_similarity[:,1].argsort()[-2]
    myanswer_index = answer_similarity[:,1].argmax()          #get the maximum similarity 
    myanswer = answer_similarity[myanswer_index][0]
    myanswer_list.append(myanswer)
    print 'question: ',q
    print 'answer: ','A: ', train.ix[i][3],' B:',  train.ix[i][4], ' C: ' ,train.ix[i][5], 'D:',  train.ix[i][6]
    print 'correct answer: ', train.ix[i][2]
    print 'my answer: ', myanswer
    if train.ix[i][2] == myanswer:
        correct = correct +1.0
    print 'percent correct: ', correct / (i+1) 
    print 'progress: ', i, '/', length 
#for printing out the distance
# myanswer_distance.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/pure_ck12_word2vec_distance.csv', encoding='utf-8')    

#train['ck_12_word2vec_answer'] = myanswer_list

question:  [u'achromatopsia', u'genet', u'disord', u'suffer', u'color', u'vision', u'achromatopsia', u'most', u'like', u'affect', u'follow', u'structur', u'eye']
answer:  A:  rod cells  B: cone cells  C:  lens D: iris
correct answer:  B
my answer:  B
percent correct:  1.0
progress:  0 / 307
question:  [u'brain', u'spinal', u'cord', u'nerv', u'organ', u'perform', u'primari', u'function']
answer:  A:  supporting the body and enabling it to move  B: transporting oxygen, wastes, and nutrients throughout the body  C:  producing male and female gametes D: conducting messages to coordinate body functions
correct answer:  D
my answer:  A
percent correct:  0.5
progress:  1 / 307
question:  [u'research', u'team', u'want', u'produc', u'smaller', u'varieti', u'german', u'shepherd', u'dog', u'mate', u'smallest', u'dog', u'differ', u'litter', u'sever', u'generat', u'exampl', u'concept']
answer:  A:  extinction  B: mutation  C:  selective breeding D: natural selection
correct answer:  C
my answer:  A



KeyboardInterrupt: 

In [73]:
answer_similarity = np.array([['A',model.n_similarity(q, a1)],['B',model.n_similarity(q, a2)],
                              ['C',model.n_similarity(q, a3)],['D',model.n_similarity(q, a4)]])
answer_similarity

array([['A', '0.163910336443'],
       ['B', '0.301012949472'],
       ['C', '0.100897692365'],
       ['D', '0.218709438074']], 
      dtype='|S14')

In [70]:
train

Unnamed: 0,100005,In which way is the orbit of a comet different from the orbit of Earth?,B,The orbit of Earth is less circular than the orbit of a comet.,The orbit of a comet is more elliptical than the orbit of Earth.,The orbital period of Earth is much longer than the orbital period of a comet.,The orbital period of a comet is more predictable than the orbital period of Earth.
0,100015,Achromatopsia is a genetic disorder in which s...,B,rod cells,cone cells,lens,iris
1,100024,"The brain, spinal cord, and nerves are organs ...",D,supporting the body and enabling it to move,"transporting oxygen, wastes, and nutrients thr...",producing male and female gametes,conducting messages to coordinate body functions
2,100037,A research team wanted to produce a smaller va...,C,extinction,mutation,selective breeding,natural selection
3,100039,Some areas of the world are experiencing more ...,B,consume large amounts of food,become dormant for long periods,sense infrared and ultraviolet light,transpire water through the skin to the atmosp...
4,100043,With climate and the transfer of energy as hea...,A,"conduction, convection, radiation","reflection, retention, radiation","convection, nuclear, reflection","conduction, solar, nuclear"
5,100051,A family owns a vacation cabin located on a hi...,C,Water is pumped to the cabin from a distant re...,The family brings several gallons of bottled s...,Groundwater is drawn up in buckets from a priv...,The family collects water in rain barrels left...
6,100056,"The plant, Arabidopsis thaliana, has a gene wh...",D,The gene codes for flower color in the plant.,The gene is controlled by environment and not ...,Traits associated with this gene are beneficia...,Traits associated with this gene may vary grea...
7,100064,Trees most likely change the environment in wh...,D,releasing nitrogen in the soil.,crowding out non-native species.,adding carbon dioxide to the atmosphere.,removing water from the soil and returning it ...
8,100066,Blood pressure is often used as an indicator o...,B,cmHg,mmHg,g/cm^3,g/mm^3
9,100068,Frost wedging occurs when rocks are broken int...,A,weathering,transportation,deposition,lithification
