### Import custom modules from current folder

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import nltk
from text_easability_metrics import TextEasabilityMetrics, StanfordNLP
from simple_text_representation.classes import Text
from simple_text_representation.models import Database
from nltk.tree import Tree
import pandas as pd
import numpy as np
# from nltk.draw.tree import draw_trees

In [4]:
database = Database('educationalTexts', 'postgres', '', '0.0.0.0', 5432)
path = r'/Users/herbert/Projects/Tesis/stanford-corenlp-full-2017-06-09'
path = r'http://corenlp.run'
path = r'http://localhost/'

##### StanfordNLP test

In [5]:
stanfordNLP = StanfordNLP(language='es')

2018-06-17 19:10:37,180 : INFO : Using an existing server http://localhost:9000
2018-06-17 19:10:38,190 : INFO : The server is available.


### Test the metric in all the text

In [6]:
dfSeventh = pd.read_csv('../files/textsSeventhgrade.csv')
dfEighth = pd.read_csv('../files/textsEighthgrade.csv')
dfNinth = pd.read_csv('../files/textsNinthgrade.csv')
dfTenth = pd.read_csv('../files/textsTenthGrade.csv')
dfEleventh = pd.read_csv('../files/textsEleventhgrade.csv')

In [7]:
def getSentecesById(df, uniqueIndex):
    sentenceGrouped = df.where(df['id'] == uniqueIndex)
    return sentenceGrouped[pd.notnull(sentenceGrouped['id'])]

In [8]:
sentenceGrouped = dfSeventh.where(dfSeventh['id'] == 3)
clean_sentence_grouped = sentenceGrouped[pd.notnull(sentenceGrouped['id'])]
clean_sentence_grouped.index

Int64Index([24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], dtype='int64')

In [9]:
dfEleventh.id.unique()

array([165, 166, 169, 167, 168, 170, 171, 172, 173, 174, 176, 175, 178,
       180, 183, 185, 187, 190, 192, 194, 196, 198, 188, 181, 197, 177,
       179, 186, 184, 193, 195, 189, 191, 182])

In [10]:
groupedSeventh = dfSeventh.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
groupedEighth = dfEighth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
groupedNinth = dfNinth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
groupedTenth = dfTenth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
groupedEleventh = dfEleventh.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))

In [11]:
len(groupedSeventh.values)

48

In [12]:
textOfSeventhGrade = groupedSeventh.values
textOfEightGrade = groupedEighth.values
textOfNineGrade = groupedNinth.values
textOfTenthGrade = groupedTenth.values
textOfEleventhGrade = groupedEleventh.values

### Building LSA Model

In [13]:
import re
from gensim import corpora, models, similarities
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

2018-06-17 13:56:21,861 : INFO : 'pattern' package not found; tag filters are not available for English


In [14]:
def extractVerbsFromText(text):
    pos_tags = stanfordNLP.pos(text)
    verbs = [pos_tag[0] for pos_tag in pos_tags if pos_tag[1].startswith('v')]
    return verbs

In [15]:
NUM_TOPICS = 2
STOPWORDS = stopwords.words('spanish')

In [16]:
def clean_text(text):
    return extractVerbsFromText(text)
#     tokenized_text = word_tokenize(text.lower())
#     cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
#     return cleaned_text

In [17]:
def clean_texts(data):
    tokenized_data = []
    for text in data:
        tokenized_data.append(clean_text(text))
    return tokenized_data

In [18]:
def generateModalForGrade(sentencesOfGrade):
    clean_data = clean_texts(sentencesOfGrade)

    # Build a Dictionary - association word to numeric id
    dictionary = corpora.Dictionary(clean_data)

    # Transform the collection of texts to a numerical form
    corpus = [dictionary.doc2bow(text) for text in clean_data]

    # Have a look at how the 20th document looks like: [(word_id, count), ...]
#     print(corpus[20])
#     [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

    # Build the LDA model
#     lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

    # Build the LSI model
    lsi_model = models.LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    
    return lsi_model, dictionary, corpus

In [19]:
model, dic, corp = generateModalForGrade(dfSeventh['value'].values)

2018-06-17 13:56:27,162 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,143 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,172 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,245 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,281 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,317 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,333 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:28,358 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:29,932 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,941 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,950 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,958 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,968 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,975 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:29,995 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,042 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:30,936 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,944 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,953 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,960 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,972 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,979 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,989 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:30,997 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:32,088 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,098 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,111 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,126 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,146 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,159 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,173 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:32,214 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:33,289 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,307 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,325 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,339 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,351 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,362 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,380 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:33,390 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:34,510 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,528 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,538 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,557 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,573 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,586 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,614 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:34,629 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:35,968 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:35,986 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:35,998 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:36,015 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:36,024 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:36,030 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:36,037 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:36,047 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:37,098 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,122 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,131 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,143 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,160 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,198 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,232 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:37,248 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:38,519 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,534 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,549 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,564 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,581 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,600 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,607 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:38,642 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:39,413 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,423 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,435 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,443 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,452 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,461 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,469 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:39,490 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:40,369 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,385 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,397 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,407 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,418 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,427 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,437 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:40,452 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

2018-06-17 13:56:41,325 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,334 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,353 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,365 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,376 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,385 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,401 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56:41,409 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:56

In [111]:
corp[0]
# Texto 0
    # Oraciones -> [0,1]
    # Oraciones
    # 0 -> [0,1,2,3,4,5,6,7,8]
    # 1 -> [2,9,10,11]
# Texto 1
    # Oraciones -> [2,3]
    # Oraciones
    # 2 -> [0,1,2,3,4,5,6,7,8]
    # 3 -> [2,9,10,11]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]

In [95]:
dfSeventh['value'].values[0]

'Si bien los trasplantes se han convertido en una práctica habitual, aún persisten fuertes temores en la población para donar órganos, lograr su superación es la clave para aumentar el número de los donadores solidarios que hacen falta para salvar miles de vidas.'

In [21]:
clean_text(dfSeventh['value'].values[0])

2018-06-17 13:56:46,678 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


['han',
 'convertido',
 'persisten',
 'donar',
 'lograr',
 'es',
 'aumentar',
 'hacen',
 'salvar']

In [35]:
dic[616]

'existen'

In [23]:
dfSeventh['value'].values[0]

'Si bien los trasplantes se han convertido en una práctica habitual, aún persisten fuertes temores en la población para donar órganos, lograr su superación es la clave para aumentar el número de los donadores solidarios que hacen falta para salvar miles de vidas.'

In [24]:
textOfSeventhGrade[0]
text_tokens = nltk.word_tokenize(dfSeventh['value'].values[0])
pos_tags = stanfordNLP.pos(dfSeventh['value'].values[0])
verbs = [pos_tag[0] for pos_tag in pos_tags if pos_tag[1].startswith('v')]
# verbs

2018-06-17 13:56:54,785 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


In [25]:
temp_verbs = extractVerbsFromText(dfSeventh['value'].values[0])
temp_verbs

2018-06-17 13:56:55,888 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


['han',
 'convertido',
 'persisten',
 'donar',
 'lograr',
 'es',
 'aumentar',
 'hacen',
 'salvar']

In [92]:
test_verb = "han"
test_verb2 = "existen"
vec_bow = dic.doc2bow(test_verb.lower().split())
vec_bow2 = dic.doc2bow(test_verb2.lower().split())

# convert the query to LSI space
vec_lsi = model[vec_bow]
vec_lsiV = [element[1] for element in vec_lsi]
vec_lsi2 = model[vec_bow2]
vec_lsi2V = [element[1] for element in vec_lsi2]
index = similarities.MatrixSimilarity(model[corp])

# perform a similarity query against the corpus
sims = index[vec_lsi]
# sims = sorted(enumerate(sims), key=lambda item: -item[1])
# print(vec_lsi)
# print(vec_lsi2)


# np.dot(vec_lsiV, vec_lsi2V)/(np.linalg.norm(vec_lsiV) * np.linalg.norm(vec_lsi2V))
# 0.55165374
# corp
corp[0]


2018-06-17 17:42:53,627 : INFO : creating matrix with 746 documents and 2 features


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]

In [29]:
for text_data in textOfSeventhGrade:
    vebs = extractVerbsFromText(text_data)
    for verb in verbs:
        vec_bow = dic.doc2bow(verb.lower().split())
        
        # convert the query to LSI space
        vec_lsi = model[vec_bow]
        index = similarities.MatrixSimilarity(model[corp])

        # perform a similarity query against the corpus
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

2018-06-17 13:57:19,103 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:19,162 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,211 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,259 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,302 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,349 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,397 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,444 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,489 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,530 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:19,554 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06

2018-06-17 13:57:21,130 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:21,247 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,305 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,358 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,399 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,445 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,498 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,545 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,594 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,647 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:21,685 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06

2018-06-17 13:57:23,123 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,157 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:23,215 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,263 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,313 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,351 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,401 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,457 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,509 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,565 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,619 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:23,658 : INFO : {'properties':

2018-06-17 13:57:25,274 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,296 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:25,526 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,576 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,613 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,662 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,697 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,742 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,791 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,844 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,892 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:25,916 : INFO : {'properties':

2018-06-17 13:57:27,484 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,517 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:27,664 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,700 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,749 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,797 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,848 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,899 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,938 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:27,982 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:28,029 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:28,061 : INFO : {'properties':

2018-06-17 13:57:29,772 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:29,805 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:29,970 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,019 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,068 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,119 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,168 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,218 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,260 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,310 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,361 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:30,394 : INFO : {'properties':

2018-06-17 13:57:32,082 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,114 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:32,368 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,413 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,462 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,508 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,551 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,592 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,638 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,687 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,738 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:32,772 : INFO : {'properties':

2018-06-17 13:57:34,441 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,474 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:34,544 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,586 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,634 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,683 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,733 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,774 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,823 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,865 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,901 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:34,931 : INFO : {'properties':

2018-06-17 13:57:37,036 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,068 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:37,248 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,294 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,343 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,392 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,438 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,487 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,533 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,582 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,631 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:37,659 : INFO : {'properties':

2018-06-17 13:57:39,072 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,105 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:39,215 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,254 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,304 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,349 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,388 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,427 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,469 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,513 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,561 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:39,594 : INFO : {'properties':

2018-06-17 13:57:41,160 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,193 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:41,337 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,388 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,437 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,485 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,520 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,573 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,615 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,663 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,713 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:41,747 : INFO : {'properties':

2018-06-17 13:57:43,251 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,288 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-17 13:57:43,403 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,448 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,497 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,549 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,590 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,639 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,691 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,742 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,791 : INFO : creating matrix with 746 documents and 2 features
2018-06-17 13:57:43,825 : INFO : {'properties':

2018-06-17 13:57:45,138 : INFO : creating matrix with 746 documents and 2 features
