### Import custom modules from current folder

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import nltk
from text_easability_metrics import TextEasabilityMetrics, StanfordNLP
from simple_text_representation.classes import Text
from simple_text_representation.models import Database
from nltk.tree import Tree
import pandas as pd
import numpy as np
# from nltk.draw.tree import draw_trees

In [4]:
database = Database('educationalTexts', 'postgres', '', '0.0.0.0', 5432)
path = r'/Users/herbert/Projects/Tesis/stanford-corenlp-full-2017-06-09'
path = r'http://corenlp.run'
path = r'http://localhost/'

##### StanfordNLP test

In [5]:
stanfordNLP = StanfordNLP(language='es')

2018-06-18 01:54:24,414 : INFO : Using an existing server http://localhost:9000
2018-06-18 01:54:25,422 : INFO : The server is available.


### Test the metric in all the text

In [6]:
dfSeventh = pd.read_csv('../Data/textsSeventhgrade.csv')
dfEighth = pd.read_csv('../Data/textsEighthgrade.csv')
dfNinth = pd.read_csv('../Data/textsNinthgrade.csv')
dfTenth = pd.read_csv('../Data/textsTenthGrade.csv')
dfEleventh = pd.read_csv('../Data/textsEleventhgrade.csv')

In [7]:
def getSentecesById(df, uniqueIndex):
    sentenceGrouped = df.where(df['id'] == uniqueIndex)
    return sentenceGrouped[pd.notnull(sentenceGrouped['id'])]

In [12]:
clean_sentence_grouped = getSentecesById(dfSeventh, 3)
list(clean_sentence_grouped.index)

[24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]

In [32]:
texts_indexs = list()
for uniq_index in dfSeventh.id.unique():
    clean_sentence_grouped = getSentecesById(dfSeventh, uniq_index)
    texts_indexs.append(list(clean_sentence_grouped.index))

[0, 1, 2, 3, 4, 5, 6]

In [17]:
dfSeventh.id.unique()

array([ 1,  2,  3,  4,  5, 18,  6,  7,  8,  9, 10, 11, 12, 13, 14, 22, 15,
       16, 17, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 64, 33,
       34, 35, 36, 37, 55, 38, 39, 51, 40, 41, 42, 43, 44, 45])

In [18]:
# groupedSeventh = dfSeventh.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
# groupedEighth = dfEighth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
# groupedNinth = dfNinth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
# groupedTenth = dfTenth.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))
# groupedEleventh = dfEleventh.groupby('id').apply(lambda x: "%s" % ''.join(x['value']))

In [19]:
# len(groupedSeventh.values)

In [20]:
# textOfSeventhGrade = groupedSeventh.values
# textOfEightGrade = groupedEighth.values
# textOfNineGrade = groupedNinth.values
# textOfTenthGrade = groupedTenth.values
# textOfEleventhGrade = groupedEleventh.values

### Building LSA Model

In [21]:
import re
from gensim import corpora, models, similarities
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

2018-06-18 01:58:03,393 : INFO : 'pattern' package not found; tag filters are not available for English


In [22]:
def extractVerbsFromText(text):
    pos_tags = stanfordNLP.pos(text)
    verbs = [pos_tag[0] for pos_tag in pos_tags if pos_tag[1].startswith('v')]
    return verbs

In [23]:
NUM_TOPICS = 2
STOPWORDS = stopwords.words('spanish')

In [24]:
def clean_text(text):
    return extractVerbsFromText(text)
#     tokenized_text = word_tokenize(text.lower())
#     cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
#     return cleaned_text

In [25]:
def clean_texts(data):
    tokenized_data = []
    for text in data:
        tokenized_data.append(clean_text(text))
    return tokenized_data

In [26]:
def generateModalForGrade(sentencesOfGrade):
    clean_data = clean_texts(sentencesOfGrade)

    # Build a Dictionary - association word to numeric id
    dictionary = corpora.Dictionary(clean_data)

    # Transform the collection of texts to a numerical form
    corpus = [dictionary.doc2bow(text) for text in clean_data]

    # Have a look at how the 20th document looks like: [(word_id, count), ...]
#     print(corpus[20])
#     [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

    # Build the LDA model
#     lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

    # Build the LSI model
    lsi_model = models.LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    
    return lsi_model, dictionary, corpus

In [27]:
model, dic, corp = generateModalForGrade(dfSeventh['value'].values)

2018-06-18 01:58:07,503 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:07,832 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:07,870 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:07,927 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:07,966 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:08,012 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:08,043 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:08,086 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:09,715 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,727 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,742 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,749 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,761 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,774 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,811 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:09,856 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:11,721 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,729 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,760 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,770 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,783 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,791 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,809 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:11,821 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:13,559 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,568 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,578 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,590 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,604 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,613 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,623 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:13,651 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:14,474 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,488 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,503 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,513 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,523 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,532 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,542 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:14,550 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:15,407 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,420 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,428 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,441 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,453 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,464 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,491 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:15,502 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:16,624 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,642 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,654 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,666 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,674 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,680 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,688 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:16,699 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:17,577 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,588 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,595 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,605 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,619 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,644 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,663 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:17,673 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:18,647 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,661 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,673 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,683 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,694 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,712 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,719 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:18,747 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:19,347 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,356 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,368 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,375 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,383 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,389 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,395 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:19,408 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:20,152 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,163 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,174 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,182 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,189 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,196 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,202 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,211 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

2018-06-18 01:58:20,824 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,833 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,845 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,854 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,862 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,869 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,881 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58:20,888 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}
2018-06-18 01:58

## Nueva estructura de un texto 

### Texto 0
    - Oraciones -> [0,1]
    - Oraciones
        - 0 -> [0,1,2,3,4,5,6,7,8]
        - 1 -> [2,9,10,11]

### Texto 1
    - Oraciones -> [2,3]
    - Oraciones
        - 2 -> [0,1,2,3,4,5,6,7,8]
        - 3 -> [2,9,10,11] 

In [70]:
words_in_texts = list()

for text in texts_indexs:
    words_per_senteces_in_texts = list()
    for sentence_index in text:
        corp_sentence = corp[sentence_index]
        words_per_senteces_in_texts.append([corp_word[0] for corp_word in corp_sentence])
    words_in_texts.append(words_per_senteces_in_texts)
words_in_texts[0]

[[0, 1, 2, 3, 4, 5, 6, 7, 8],
 [2, 9, 10, 11],
 [12, 13],
 [14, 15, 16],
 [2, 3, 17, 18, 19, 20, 21],
 [22, 23, 24, 25],
 [3, 8, 18, 26, 27, 28, 29, 30, 31, 32, 33, 34]]

In [79]:
results = list()
for text in words_in_texts:
    text_results = list()
    for index, sentece_in_text in enumerate(text):
        sentence_results = list()
        for word_in_sentence in sentece_in_text:
            current_word = dic[word_in_sentence]
            word_results = list()
            for j in range(index + 1, len(text)):
                next_sentence = text[j]
                for next_word in next_sentence:
                    current_vec = dic.doc2bow(current_word.lower().split())
                    next_vec = dic.doc2bow(dic[next_word].lower().split())

                    # Convert the query to LSI space
                    current_vec_lsi = model[current_vec]
                    next_vec_lsi = model[next_vec]
                    if (len(current_vec_lsi) > 0 and len(next_vec_lsi) > 0):
                        current_vec_lsi = [element[1] for element in current_vec_lsi]
                        next_vec_lsi = [element[1] for element in next_vec_lsi]
                    
                        # Calculate Similarity
                        word_results.append(np.dot(current_vec_lsi, next_vec_lsi)/(np.linalg.norm(current_vec_lsi) * np.linalg.norm(next_vec_lsi)))
                    else:
                        word_results.append(0)
            sentence_results.append(sum(word_results)/len(word_results) if len(word_results) > 0 else 0)
        text_results.append(sum(sentence_results)/len(sentence_results) if len(sentence_results) > 0 else 0)
    results.append(sum(text_results)/len(text_results) if len(text_results) > 0 else 0)
results

[0.4011460644212214,
 0.590108599123644,
 0.4163906342013442,
 0.4372300372229645,
 0.1146642129831157,
 0.06424053507882084,
 0.20485500763594272,
 0.3477611273881205,
 0.21603184393295383,
 0.4370556567291295,
 0.4626205581090414,
 0.565705548292251,
 0.4466570932452763,
 0.2792600938665837,
 0.47606227843350113,
 0.05111290287236445,
 -0.04168027034666077,
 -0.05165678678770459,
 0.3522444221782628,
 0.14818030195790932,
 0.15905526930470337,
 0.15235477121946317,
 0.29919384083487,
 0.42954486437184547,
 0.46436216854690704,
 0.37578727037793397,
 0.010927111023327402,
 0.3509686613921357,
 0.19891511990698707,
 0.0,
 0.1707214208681657,
 0.17505362966641738,
 0.0,
 0.0,
 0.11731481148192545,
 0.16527456216018474,
 -0.01978858721849033,
 0.3331916393187502,
 0.0,
 0.5373483611115548,
 0.1292955548446629,
 0.06872285917251256,
 0.35536667278644346,
 0.30805620340348894,
 0.5716959244758882,
 0.5267404392495103,
 0.5541788030964976,
 0.6237311758614323]

In [95]:
dfSeventh['value'].values[0]

'Si bien los trasplantes se han convertido en una práctica habitual, aún persisten fuertes temores en la población para donar órganos, lograr su superación es la clave para aumentar el número de los donadores solidarios que hacen falta para salvar miles de vidas.'

In [21]:
clean_text(dfSeventh['value'].values[0])

2018-06-17 13:56:46,678 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


['han',
 'convertido',
 'persisten',
 'donar',
 'lograr',
 'es',
 'aumentar',
 'hacen',
 'salvar']

In [35]:
dic[616]

'existen'

In [23]:
dfSeventh['value'].values[0]

'Si bien los trasplantes se han convertido en una práctica habitual, aún persisten fuertes temores en la población para donar órganos, lograr su superación es la clave para aumentar el número de los donadores solidarios que hacen falta para salvar miles de vidas.'

In [24]:
textOfSeventhGrade[0]
text_tokens = nltk.word_tokenize(dfSeventh['value'].values[0])
pos_tags = stanfordNLP.pos(dfSeventh['value'].values[0])
verbs = [pos_tag[0] for pos_tag in pos_tags if pos_tag[1].startswith('v')]
# verbs

2018-06-17 13:56:54,785 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


In [36]:
temp_verbs = extractVerbsFromText(dfSeventh['value'].values[0])
temp_verbs

2018-06-18 02:03:34,912 : INFO : {'properties': "{'annotators': 'pos', 'outputFormat': 'json'}", 'pipelineLanguage': 'es'}


['han',
 'convertido',
 'persisten',
 'donar',
 'lograr',
 'es',
 'aumentar',
 'hacen',
 'salvar']

In [41]:
test_verb = "han"
test_verb2 = "convertido"
vec_bow = dic.doc2bow(test_verb.lower().split())
vec_bow2 = dic.doc2bow(test_verb2.lower().split())

# convert the query to LSI space
vec_lsi = model[vec_bow]
vec_lsiV = [element[1] for element in vec_lsi]
vec_lsi2 = model[vec_bow2]
vec_lsi2V = [element[1] for element in vec_lsi2]
index = similarities.MatrixSimilarity(model[corp])

# perform a similarity query against the corpus
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
# print(vec_lsi)
# print(vec_lsi2)


np.dot(vec_lsiV, vec_lsi2V)/(np.linalg.norm(vec_lsiV) * np.linalg.norm(vec_lsi2V))
# 0.55165374
# corp
# sims


2018-06-18 02:06:55,541 : INFO : creating matrix with 746 documents and 2 features


0.590906854047795

In [40]:
for text_data in textOfSeventhGrade:
    vebs = extractVerbsFromText(text_data)
    for verb in verbs:
        vec_bow = dic.doc2bow(verb.lower().split())
        
        # convert the query to LSI space
        vec_lsi = model[vec_bow]
        index = similarities.MatrixSimilarity(model[corp])

        # perform a similarity query against the corpus
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

NameError: name 'textOfSeventhGrade' is not defined