In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install fuzzywuzzy[speedup]
!pip install plotly

In [1]:
import pandas as pd
import re
#import pymorphy2
import math
import spacy
from collections import Counter
import numpy as np
import scipy.spatial
import collections
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import plotly.plotly as py
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
import plotly.graph_objs as go
plotly.tools.set_credentials_file(username='MaximGilman', api_key='vlKaZaRyFpNaDPYD0hiv')


# Чтение данных
Сет распределен по нескольким файлам, поэтому необходимо их объединить.
Сет содержит данные о сложности вопроса по мнению спрашивающего и отвечающего и номер статьи, из которой взят ответ - нам это не нужно

In [2]:
answers1 = pd.read_csv('S08_question_answer_pairs.txt', sep='\t')
answers2 = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')
answers3 = pd.read_csv('S10_question_answer_pairs.txt', sep='\t', encoding='latin-1')

answers = pd.concat([answers1, answers2, answers3])
answers.drop(labels=['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile'], axis=1, inplace=True)
answers.head(20)

Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.


In [3]:
print('Наличие null:', answers.isnull().values.any())
print('Наличие NaN:', answers.isna().values.any())

Наличие null: True
Наличие NaN: True


# Первичная обработка данных
## Обработка дубликатов и пустых значений

1. Сет содержит вопросы без ответа
2. Сет содержит повторяющиеся вопросы 

Дубликаты неоходимо удалить - они не несут дополнительной смысловой нагрузки, но увеличивают объем для обработки.
Пары с пустыми значениями (без ответа) не могут принеси пользы для модели + нельзя выполнить замену для пустых начени, поскольку необходимо знать правилный ответ. В результате остается только удалить такие вопросы.

Сначала удаляем вопросы без ответа, затем повторяющиеся вопросы

In [4]:
print('Размер начального сета', answers.size)

#предполагается, что нам необходимо заполнение всех полей
not_null_answers = answers.dropna(axis=0, how='any')
print('Размер сета без пустых значений', not_null_answers.size)

# при удалении дупликатов оставляем только первое вхождение
no_duplicates_answers = not_null_answers.drop_duplicates(subset='Question', keep='first')
print('Размер сета без дубликатов', no_duplicates_answers.size)
no_duplicates_answers.head(20)


Размер начального сета 11991
Размер сета без пустых значений 10263
Размер сета без дубликатов 6606


Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
10,Abraham_Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren..."
12,Abraham_Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell
14,Abraham_Lincoln,When did the Gettysburg address argue that Ame...,1776
16,Abraham_Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes
18,Abraham_Lincoln,Was Abraham Lincoln the first President of the...,No


In [5]:
print('Наличие null:', no_duplicates_answers.isnull().values.any())
print('Наличие NaN:', no_duplicates_answers.isna().values.any())

Наличие null: False
Наличие NaN: False


## Обработка притяжательных местоимений

1. В сете содержатся указания на одушевленный предмет статьи -his/her, he/she 
#####    (напр. Was his (Alessandro_Volta) 1800 paper written in French ?)
2. В сете содержатся указания на неодушевленный предмет статьи -this
#####    (напр. 	What connected the Akans to this (Ghana) Empire?)

Производится замена местоимений на тему, однако только в тех случаях, когда тема не содержится в вопросе и это местоимение является явным на неё указателем: 
##### (напр. "When did Lincoln begin his political career?").
В данном случае замена не производится.

In [6]:
nlp = spacy.load('en_core_web_sm')

#no_duplicates_answers['Question'] =no_duplicates_answers['Question'].replace(regex=[r' his | her | this '], 
#                                                                           value=' замена ')
exp = no_duplicates_answers.copy()
exp['Theme column'] = ''
for index, row in exp.iterrows():
    row['ArticleTitle'] = row['ArticleTitle'].replace('_', ' ')
        
    title_words = list(map(lambda x: x.lower(),row['ArticleTitle'].split()))
    
    #question_words = re.findall(r"[a-zA-Z]+-[a-zA-Z]+|[a-zA-Z]+\'[a-zA-Z]+|[a-zA-Z]+", row['Question'])
    
    doc = nlp(row['Question'])
    infinitives = [token.lemma_ for token in doc]
    '''
    infinitives = []
    morph = pymorphy2.MorphAnalyzer()
    for word in question_words:
        word = word.lower()
        infinitives.append(morph.parse(word)[0].normal_form)        
    '''    
    
    infinitives = list(map(lambda x: x.lower(), infinitives))
    has_title_in_question = any(map(lambda x: x in list(map(lambda x: x.lower(), infinitives)), title_words))
    
    if (not has_title_in_question):      
        '''
        print(title_words)
        print(infinitives)
        print(row['ArticleTitle'], row['Question'])
        s = row['Question'].lower()        
        print(re.sub(r'( he ?)|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip())
        ''' 
        s = row['Question'].lower()  
        row['Theme column'] = re.sub(r'( he )|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip()
        
exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,
4,Abraham Lincoln,Did his mother die of pneumonia?,no,did Abraham Lincoln mother die of pneumonia?
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",what did the legal tender act of 1862 establish?
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,when did the gettysburg address argue that ame...
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,


### Перенесем все неизмененные значения из столбца Question в Theme column. С последней и будем в дальнейшем работать

In [7]:
for index, row in exp.iterrows():
    #print(row['Theme column'])
    row['Theme column'] = row['Theme column'].lower() if row['Theme column'] != '' else row['Question'].lower() 
    row['Theme column'] =   row['Theme column'].replace('?','').strip() 
exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,was abraham lincoln the sixteenth president of...
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,did lincoln sign the national banking act of 1863
4,Abraham Lincoln,Did his mother die of pneumonia?,no,did abraham lincoln mother die of pneumonia
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,how many long was lincoln's formal education
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,when did lincoln begin his political career
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",what did the legal tender act of 1862 establish
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,who suggested lincoln grow a beard
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,when did the gettysburg address argue that ame...
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,was abraham lincoln the first president of the...


# Перейдем к Токенизации для последующего TF-IDF и WB


In [8]:
text = ' '.join(exp['Theme column'])
tokens = re.findall(r"[\w']+", text.lower())
print('\nTokens:\n\n', tokens[:500])



Tokens:

 ['was', 'abraham', 'lincoln', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states', 'did', 'lincoln', 'sign', 'the', 'national', 'banking', 'act', 'of', '1863', 'did', 'abraham', 'lincoln', 'mother', 'die', 'of', 'pneumonia', 'how', 'many', 'long', 'was', "lincoln's", 'formal', 'education', 'when', 'did', 'lincoln', 'begin', 'his', 'political', 'career', 'what', 'did', 'the', 'legal', 'tender', 'act', 'of', '1862', 'establish', 'who', 'suggested', 'lincoln', 'grow', 'a', 'beard', 'when', 'did', 'the', 'gettysburg', 'address', 'argue', 'that', 'america', 'was', 'born', 'did', 'lincoln', 'beat', 'john', 'c', 'breckinridge', 'in', 'the', '1860', 'election', 'was', 'abraham', 'lincoln', 'the', 'first', 'president', 'of', 'the', 'united', 'states', 'did', 'lincoln', 'start', 'his', 'political', 'career', 'in', '1832', 'did', 'lincoln', 'ever', 'represent', 'alton', 'sangamon', 'railroad', 'which', 'county', 'was', 'lincoln', 'born', 'in', 'when', 'did', 'lincoln', 'fi

### Счетчик слов:


In [9]:
term_freq_dict = Counter(tokens)
term_freq_tuple = [(key, value) for key, value in term_freq_dict.items()]
sorted_count_words =sorted(term_freq_tuple, key=lambda x: x[1], reverse=True)

In [None]:
words = list(map(lambda x: x[0], sorted_count_words)) 
count = list(map(lambda x: x[1], sorted_count_words)) 
dict_count = dict(zip(words, count)) 

py.iplot([go.Bar(x=list(dict_count.keys()), y=list(dict_count.values()))])


### В данном случае датасет имеет большое количество слов, поэтому график нуждается в увеличении. 
### При увеличении графика обнаружено, что самыми распространенными словами являются стандартные стоп-слова. Однако в эту же выборку попали слова, являющиеся важными для формулировки вопроса. Например, when и where. Если мы будем считать их стоп-словами и будем удалять, то вопросы "Когда родился Линкольн?" и "Где родился Линкольн?" станут идентичными. То есть, необходимо внимательнее присмотреться к потенциальным стоп-словам

In [10]:
Q1, Q2, Q3 = np.percentile(list(term_freq_dict.values()), [25, 50, 75])
IQR = Q3 - Q1
lower_inner_fence = Q1 - (1.5 * IQR)
lower_outer_fence = Q1 - (3 * IQR)
upper_inner_fence = Q3 + (1.5 * IQR)
upper_outer_fence = Q3 + (3 * IQR)
print(f'''upper_outer_fence:  {upper_outer_fence}, 
upper_inner_fence:  {upper_inner_fence},
lower_inner_fence:  {lower_inner_fence},
lower_outer_fence"  {lower_outer_fence}''')

upper_outer_fence:  9.0, 
upper_inner_fence:  6.0,
lower_inner_fence:  -2.0,
lower_outer_fence"  -5.0


## Теперь немного подправим список стоп-слов (но только не значащие). 
### В нашем случае (в английском языке) — это напр. артикли.
### Было решено считать за стоп слова  все стандарнтые + слова с частотой при токенизации >50.
#### Но с некоторыми исключениями: мы не удаляем уточняющие слова - what/where/e.t.c. 
#### Более того, можно оставлять в списке только основную форму слова (напр. глагола become), поскольку данные в сете будут Лемматизированны.



#### Произведем изменение списка стоп-слов

In [11]:
my_stop_words = nlp.Defaults.stop_words
print('\nИзначальный набор\n',len(my_stop_words))
#добавление отловленных слов из списка токенизированных, кроме when/where, language 
my_stop_words|= {'the','of','is','a', 'was','to','are','and','do','for','does','have','as',
                 'an','are','many','he','she','it','on','his','her','by','with','that','this','most','at','there','short','long',
                '\'s','.','?','!',',','&'}
#удаление лишних слов.  Удаляем только одну форму - become (см выше)
my_stop_words-= {'amount', 'another', 'anything', 'become'}
'what',
'when',
'where',
'whereafter',
'which',
'who',
'whom',
'whose',
'why',


my_stop_words


Изначальный набор
 326


{'!',
 '&',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 ',',
 '.',
 '?',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'any',
 'anyhow',
 'anyone',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'fu

### Удаляем стоп-слова из сета

In [12]:
for index, row in exp.iterrows():
    new_row = ''
    for word in  row['Theme column'].split():
        if word not in my_stop_words:
            new_row += ' '+word
    row['Theme column']=new_row
exp.head(50)




Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,abraham lincoln sixteenth president united st...
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,lincoln sign national banking act 1863
4,Abraham Lincoln,Did his mother die of pneumonia?,no,abraham lincoln mother die pneumonia
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,lincoln's formal education
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,lincoln begin political career
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",legal tender act 1862 establish
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,suggested lincoln grow beard
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,gettysburg address argue america born
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,lincoln beat john c. breckinridge 1860 election
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,abraham lincoln president united states


# МЕШОК СЛОВ

In [13]:
full_tokens = set(' '.join(tokens).split())
vectors = [[sentence.count(token) for token in tokens]
           for sentence in tokens]
print([(i, word) for i, word in enumerate(full_tokens)])
#vectors

[(0, 'millard'), (1, 'continental'), (2, 'christianity'), (3, 'rebellion'), (4, 'shield'), (5, 'ref'), (6, 'proto'), (7, 'crocodile'), (8, 'systema'), (9, 'meet'), (10, 'argue'), (11, 'derive'), (12, 'adulthood'), (13, '35'), (14, 'molt'), (15, 'vi'), (16, 'persistent'), (17, 'department'), (18, 'variations'), (19, 'michael'), (20, 'roma'), (21, 'chemical'), (22, 'village'), (23, 'charged'), (24, 'hit'), (25, 'drumhead'), (26, 'roosevelt'), (27, 'daffy'), (28, 'metamorphosis'), (29, 'resident'), (30, 'genes'), (31, 'mad'), (32, 'ketchup'), (33, 'lobsters'), (34, 'carrion'), (35, 'coastal'), (36, 'children'), (37, 'sandy'), (38, 'kingdom'), (39, 'always'), (40, 'paul'), (41, 'famous'), (42, '2007'), (43, 'st'), (44, "women's"), (45, 'rosie'), (46, 'august'), (47, 'purfling'), (48, '24403'), (49, 'administers'), (50, 'affiliation'), (51, 'academy'), (52, 'rule'), (53, 'follow'), (54, 'acre'), (55, 'taller'), (56, 'monas'), (57, 'believers'), (58, 'music'), (59, 'industrialized'), (60, 'p




In [14]:
test_sentence = vectors[1] #choose any
for i, sentence in enumerate(tokens):
    print(sentence)
    print(scipy.spatial.distance.cosine(test_sentence, vectors[i]))
    

was
0.30523138894637036
abraham
0.0
lincoln
1.0
the
0.998753227689618
sixteenth
0.9941564083816326
president
1.0
of
1.0
the
0.998753227689618
united
1.0
states
0.18378973776356955
did
1.0
lincoln
1.0
sign
1.0
the
0.998753227689618
national
0.04502281785309081
banking
0.35364556973928285
act
0.01287286519751174
of
1.0
1863
1.0
did
1.0
abraham
0.0
lincoln
1.0
mother
0.997945297763546
die
1.0
of
1.0
pneumonia
0.09494084655962542
how
0.996570431752036
many
0.18088980513486008
long
1.0
was
0.30523138894637036
lincoln's
1.0
formal
0.15458844372898195
education
0.14667051494907446
when
0.9963041881540179
did
1.0
lincoln
1.0
begin
0.9981056959123445
his
0.9982864054593203
political
0.11598553093908104
career
0.20653605786879525
what
0.3415796380162235
did
1.0
the
0.998753227689618
legal
0.010818488084989086
tender
1.0
act
0.01287286519751174
of
1.0
1862
1.0
establish
0.3858958627247495
who
0.9956326880614242
suggested
1.0
lincoln
1.0
grow
1.0
a
0.0014124300834635894
beard
0.05395369620260282
w

him
0.9843462342340553
who
0.9956326880614242
was
0.30523138894637036
the
0.998753227689618
first
1.0
to
1.0
perform
0.9977976639986453
and
0.23089371888793853
publish
0.9965168034095235
careful
0.20970781183867904
experiments
0.9963071650468457
aiming
0.3260595723262296
at
0.059520466820536355
the
0.998753227689618
definition
1.0
of
1.0
an
0.09934735764104985
international
0.1543384815052321
temperature
0.0901815193351212
scale
0.04502281785309081
on
1.0
scientific
1.0
grounds
1.0
the
0.998753227689618
celsius
1.0
crater
0.07167860824062511
on
1.0
the
0.998753227689618
moon
0.99681915161551
is
1.0
what
0.3415796380162235
is
1.0
the
0.998753227689618
celsius
1.0
crater
0.07167860824062511
on
1.0
the
0.998753227689618
moon
0.99681915161551
named
0.0564106046669407
after
0.03369858201928566
him
0.9843462342340553
had
0.019885182849877503
anders
0.2665179544046743
celsius
1.0
thermometer
0.997117201701911
100
1.0
for
1.0
the
0.998753227689618
freezing
1.0
point
1.0
of
1.0
water
0.08007987

0.10929117346891715
was
0.30523138894637036
coolidge
1.0
governor
1.0
who
0.9956326880614242
appointed
0.32505672150743425
harlan
0.02928966608452377
fiske
1.0
stone
1.0
to
1.0
the
0.998753227689618
supreme
0.9960064563346944
court
1.0
was
0.30523138894637036
coolidge
1.0
the
0.998753227689618
thirteenth
0.9848711876597693
president
1.0
of
1.0
the
0.998753227689618
united
1.0
states
0.18378973776356955
was
0.30523138894637036
calvin
0.32341412166155714
coolidge
1.0
republican
0.15970946898995664
was
0.30523138894637036
calvin
0.32341412166155714
coolidge
1.0
a
0.0014124300834635894
governor
1.0
of
1.0
massachus08_setts
0.18537318799908276
when
0.9963041881540179
was
0.30523138894637036
coolidge
1.0
born
0.9950092451282517
where
0.9961788709863583
did
1.0
coolidge's
1.0
grandfather
0.2815120094971624
had
0.019885182849877503
government
0.994739282453898
offices
1.0
which
0.989229291609159
state
0.10929117346891715
were
1.0
coolidge
1.0
born
0.9950092451282517
in
1.0
is
1.0
calvin
0.3234

0.998753227689618
entire
1.0
area
0.06695866318096244
subject
0.9938513228685528
to
1.0
donnacona
0.11488028216103463
chief
0.9904057333557558
at
0.059520466820536355
stadacona
0.017280461576130834
did
1.0
continental
0.35097836273471217
european
0.12610540859361075
immigrants
0.17142833276056446
not
1.0
s08_settle
1.0
the
0.998753227689618
prairies
0.07252910900713871
is
1.0
it
1.0
the
0.998753227689618
world
1.0
's
1.0
second
1.0
largest
0.08906289224343966
country
1.0
by
0.9949494734783132
total
0.21491058194634816
area
0.06695866318096244
what
0.3415796380162235
happened
0.0448481321139671
in
1.0
1867
1.0
are
0.20007627195421962
ducks
1.0
in
1.0
the
0.998753227689618
arctic
0.031771757737021566
northern
0.9987772319693097
hemisphere
0.9956753945197958
migratory
0.2595183418244741
is
1.0
a
0.0014124300834635894
drake
0.005625898538822338
a
0.0014124300834635894
male
0.017571446560694204
do
1.0
all
0.023947563915909353
ducks
1.0
quack
0.014914495376515013
what
0.3415796380162235
is
1

0.15907980196191218
weight
0.9913044574100541
at
0.059520466820536355
birth
0.9783095058908812
what
0.3415796380162235
did
1.0
aristotle
0.4290939846705861
say
0.03751797015987046
about
0.03260813131898477
elephants
0.15907980196191218
are
0.20007627195421962
elephant
0.1155790025828416
populations
0.1802941746079827
in
1.0
west
1.0
africa
0.007192254094797734
generally
0.04409430311538187
small
0.06259235185711698
and
0.23089371888793853
fragmented
0.01857648882219265
is
1.0
the
0.998753227689618
asian
0.06825305641016932
elephant
0.1155790025828416
larger
0.03465769036280053
than
0.13197839621281993
the
0.998753227689618
african
0.04571743100026138
is
1.0
an
0.09934735764104985
elephant's
0.15718227371684967
skin
1.0
tough
0.997321049467979
how
0.996570431752036
do
1.0
elephants
0.15907980196191218
communicate
0.0967977394036571
over
1.0
long
1.0
distances
0.4138451644726434
when
0.9963041881540179
do
1.0
african
0.04571743100026138
elephants
0.15907980196191218
lie
1.0
down
1.0
how


0.31404780739141536
congressman
0.23979261712638877
did
1.0
ford
1.0
attend
0.07925000111553226
the
0.998753227689618
university
1.0
of
1.0
michigan
0.12796184775602892
what
0.3415796380162235
positions
1.0
ford
1.0
played
0.03846576165498272
in
1.0
the
0.998753227689618
school
0.9932883890602011
football
0.031646986652023035
team
0.008384901769984632
what
0.3415796380162235
did
1.0
ford
1.0
say
0.03751797015987046
about
0.03260813131898477
his
0.9982864054593203
biological
0.038315507788716396
father
0.5022809897285145
who
0.9956326880614242
did
1.0
ford
1.0
nominate
0.3389999277567133
for
1.0
vice
1.0
president
1.0
was
0.30523138894637036
ford
1.0
active
0.025921382793525738
about
0.03260813131898477
vietnamese
0.09729267163520217
affairs
0.015930571463950827
had
0.019885182849877503
ford's
1.0
wife
1.0
married
0.019578452851842676
before
0.9969791322708849
is
1.0
ford
1.0
related
0.07925000111553226
with
0.9964915808020913
the
0.998753227689618
assassination
0.1566361160103018
of
1.

0.3415796380162235
genus
1.0
does
1.0
the
0.998753227689618
gray
0.008751230841811286
wolf
1.0
belong
0.9963737080638982
when
0.9963041881540179
do
1.0
wolves
1.0
molt
0.9846625045281552
why
0.9935569631655717
is
1.0
it
1.0
beneficial
0.0756250624389454
for
1.0
alpha
0.0016762201996277648
males
0.05282279934627465
and
0.23089371888793853
females
0.07206570853427019
to
1.0
forcefully
1.0
prevent
1.0
other
0.9987613855022589
wolves
1.0
from
0.9967665720470967
mating
0.3383859006180462
what
0.3415796380162235
type
1.0
of
1.0
tools
1.0
do
1.0
biologists
0.9983762546155113
use
1.0
to
1.0
capture
0.019963659862726746
wolves
1.0
for
1.0
tagging
0.32209137185859926
forward
0.15769842136201084
erect
1.0
ears
0.036567370392522
and
0.23089371888793853
slightly
0.9933586696454768
bristle
0.9982432351053008
hackles
0.04392304534227731
are
0.20007627195421962
a
0.0014124300834635894
sign
1.0
of
1.0
what
0.3415796380162235
in
1.0
wolves
1.0
was
0.30523138894637036
grover
1.0
cleveland
0.2549053795098

0.9972228168100655
or
1.0
a
0.0014124300834635894
metal
0.010448619091699474
screen
1.0
not
1.0
pierce
1.0
with
0.9964915808020913
a
0.0014124300834635894
cut
1.0
out
1.0
design
1.0
what
0.3415796380162235
happened
0.0448481321139671
in
1.0
1896
1.0
what
0.3415796380162235
happened
0.0448481321139671
with
0.9964915808020913
a
0.0014124300834635894
bromide
0.9890446460915394
emulsion
0.99743482839163
in
1.0
two
1.0
sheets
0.9964070284513188
of
1.0
very
1.0
thick
0.9891547529454406
black
0.020881741132133258
paper
0.02096420768880125
is
1.0
there
0.9987789553923376
a
0.0014124300834635894
becquerel
0.9955096411150537
crater
0.07167860824062511
on
1.0
the
0.998753227689618
moon
0.99681915161551
and
0.23089371888793853
a
0.0014124300834635894
becquerel
0.9955096411150537
crater
0.07167860824062511
on
1.0
mars
0.03520370071154888
are
0.20007627195421962
the
0.998753227689618
javanese
0.04058809609934011
the
0.998753227689618
largest
0.08906289224343966
and
0.23089371888793853
politically
0.

appointed
0.32505672150743425
to
1.0
secretary
0.050537650386353716
of
1.0
war
0.017953326796015023
when
0.9963041881540179
did
1.0
james
0.06951058403320343
monroe
0.9970884587076162
die
1.0
when
0.9963041881540179
did
1.0
james
0.06951058403320343
monroe
0.9970884587076162
graduate
0.02096420768880125
from
0.9967665720470967
william
0.03710477901047449
and
0.23089371888793853
mary
0.00421843342705841
when
0.9963041881540179
was
0.30523138894637036
james
0.06951058403320343
monroe
0.9970884587076162
elected
1.0
president
1.0
which
0.989229291609159
property
1.0
did
1.0
james
0.06951058403320343
monroe
0.9970884587076162
sell
1.0
in
1.0
1817
1.0
when
0.9963041881540179
did
1.0
james
0.06951058403320343
monroe
0.9970884587076162
introduce
1.0
the
0.998753227689618
monroe
0.9970884587076162
doctrine
1.0
who
0.9956326880614242
did
1.0
james
0.06951058403320343
monroe
0.9970884587076162
live
1.0
with
0.9964915808020913
in
1.0
new
1.0
york
1.0
city
1.0
what
0.3415796380162235
did
1.0
james


0.18537318799908276
sent
1.0
him
0.9843462342340553
in
1.0
1774
1.0
who
0.9956326880614242
did
1.0
massachus08_setts
0.18537318799908276
send
1.0
in
1.0
1774
1.0
are
0.20007627195421962
john
0.9907975027168932
adams
0.017258318534724038
last
0.13997763214682246
words
1.0
often
1.0
quoted
1.0
as
0.1303373685321596
thomas
0.13351628740724764
jefferson
1.0
survives
1.0
the
0.998753227689618
john
0.9907975027168932
adams
0.017258318534724038
library
0.01180162477885116
housed
0.9959008819123686
at
0.059520466820536355
the
0.998753227689618
boston
0.9977220424814706
public
0.9917359134803351
library
0.01180162477885116
contains
0.3589771744366551
what
0.3415796380162235
adams
0.017258318534724038
'
1.0
opponents
1.0
were
1.0
what
0.3415796380162235
did
1.0
the
0.998753227689618
election
1.0
of
1.0
1800
1.0
not
1.0
become
0.9916820752137528
a
0.0014124300834635894
bitter
0.9954440849629411
and
0.23089371888793853
volatile
0.07167860824062511
battle
0.0697076498717093
with
0.9964915808020913


0.06951058403320343
than
0.13197839621281993
the
0.998753227689618
other
0.9987613855022589
members
0.98957203523896
of
1.0
panthera
0.2524557772376197
is
1.0
a
0.0014124300834635894
leopard
0.024935971699646786
larger
0.03465769036280053
and
0.23089371888793853
less
1.0
lanky
0.10167162639931315
than
0.13197839621281993
a
0.0014124300834635894
cheetah
0.04650596312684774
are
0.20007627195421962
large
0.021961697292992244
pythons
0.9958511938183565
potential
0.021961697292992244
prey
1.0
for
1.0
leopards
0.06216598410949137
what
0.3415796380162235
may
0.008384901769984632
a
0.0014124300834635894
leopard
0.024935971699646786
be
0.9929001192059979
mistaken
0.3638868153870919
for
1.0
what
0.3415796380162235
is
1.0
a
0.0014124300834635894
hybrid
0.9769937567922329
animal
0.03205206331541188
resulting
1.0
from
0.9967665720470967
a
0.0014124300834635894
union
1.0
between
0.9946146458045795
a
0.0014124300834635894
leopard
0.024935971699646786
and
0.23089371888793853
a
0.0014124300834635894
pu

0.11484473844855003
is
1.0
liechtenstein
0.9981797741691383
doubly
0.9965414897347229
landlocked
0.24410634374239415
does
1.0
liechtenstein
0.9981797741691383
have
0.11337070701689356
an
0.09934735764104985
army
0.008384901769984632
when
0.9963041881540179
was
0.30523138894637036
liechtenstein's
0.9983331434860857
current
1.0
constitution
1.0
adopted
0.14930425420957738
what
0.3415796380162235
is
1.0
the
0.998753227689618
official
0.40552428364014037
language
0.05076536650657659
of
1.0
liechtenstein
0.9981797741691383
what
0.3415796380162235
countries
1.0
border
0.9951498580706452
liechtenstein
0.9981797741691383
what
0.3415796380162235
roman
0.10727613067568897
province
1.0
was
0.30523138894637036
liechtenstein
0.9981797741691383
part
0.027883275252870265
of
1.0
how
0.996570431752036
many
0.18088980513486008
municipalities
0.19287325430452373
are
0.20007627195421962
within
0.9982958338364617
oberland
0.2555201960413751
what
0.3415796380162235
is
1.0
the
0.998753227689618
smallest
0.15

0.016877150763210613
on
1.0
what
0.3415796380162235
date
0.06997056375726374
who
0.9956326880614242
or
1.0
what
0.3415796380162235
fell
1.0
in
1.0
love
1.0
with
0.9964915808020913
abigail
0.009510427966890944
powers
1.0
was
0.30523138894637036
fillmore
0.99681915161551
one
1.0
of
1.0
the
0.998753227689618
founders
1.0
of
1.0
the
0.998753227689618
university
1.0
of
1.0
buffalo
0.016877150763210613
was
0.30523138894637036
another
0.5148580357238579
primary
0.022566031805386855
objective
0.9904057333557558
of
1.0
fillmore
0.99681915161551
to
1.0
preserve
1.0
the
0.998753227689618
union
1.0
from
0.9967665720470967
the
0.998753227689618
intensifying
1.0
slavery
0.04594863211061995
debate
0.09192925063701074
was
0.30523138894637036
fillmore
0.99681915161551
the
0.998753227689618
second
1.0
chancellor
0.1507631579337353
a
0.0014124300834635894
position
1.0
he
0.9914556957747004
maintained
0.31936017290691576
while
0.9884968783961164
both
0.9762392541860054
vice
1.0
president
1.0
and
0.2308937

0.017571446560694204
otters
1.0
dog
1.0
otters
1.0
females
0.07206570853427019
are
0.20007627195421962
bitches
0.9928140569026376
and
0.23089371888793853
babies
0.04923934869389168
are
0.20007627195421962
cubs
0.9934928517672643
or
1.0
pups
1.0
is
1.0
an
0.09934735764104985
otter
1.0
's
1.0
den
1.0
called
0.07590788486807809
a
0.0014124300834635894
holt
0.9794225905122159
have
0.11337070701689356
most
0.9969013579844116
otters
1.0
fish
0.9982406686102951
as
0.1303373685321596
the
0.998753227689618
primary
0.022566031805386855
item
0.996932500905631
in
1.0
their
0.9987636232899905
diet
1.0
supplemented
0.9964579571442543
by
0.9949494734783132
frogs
1.0
crayfish
0.36261536356151824
and
0.23089371888793853
crabs
0.040195635671492314
do
1.0
penguins
1.0
feed
1.0
on
1.0
krill
1.0
what
0.3415796380162235
is
1.0
the
0.998753227689618
largest
0.08906289224343966
living
1.0
species
1.0
of
1.0
penguin
1.0
do
1.0
penguins
1.0
live
1.0
almost
0.09494084655962542
exclusively
1.0
in
1.0
the
0.998753

0.024935971699646786
bear
0.05124941349580736
at
0.059520466820536355
high
0.9815950054337863
risk
1.0
of
1.0
extinction
1.0
how
0.996570431752036
heavy
0.03068297146720178
is
1.0
a
0.0014124300834635894
male
0.017571446560694204
polar
0.024935971699646786
bear
0.05124941349580736
how
0.996570431752036
heavy
0.03068297146720178
was
0.30523138894637036
the
0.998753227689618
largest
0.08906289224343966
polar
0.024935971699646786
bear
0.05124941349580736
on
1.0
record
1.0
what
0.3415796380162235
does
1.0
a
0.0014124300834635894
polar
0.024935971699646786
bear's
0.10207561098645002
fur
1.0
provide
1.0
do
1.0
female
0.03424892927623546
polar
0.024935971699646786
bears
0.08551170966181143
weight
0.9913044574100541
more
0.9960724698676001
than
0.13197839621281993
the
0.998753227689618
male
0.017571446560694204
how
0.996570431752036
much
0.9862265224672251
weight
0.9913044574100541
do
1.0
female
0.03424892927623546
polar
0.024935971699646786
bears
0.08551170966181143
gain
0.3190865909900583
du

0.998753227689618
development
0.9959721763611799
of
1.0
qatar
0.020464317029569523
and
0.23089371888793853
the
0.998753227689618
wider
1.0
region
1.0
develop
1.0
local
0.013895265404319734
and
0.23089371888793853
regional
0.10012411728737014
markets
0.04180874080840813
and
0.23089371888793853
strengthen
0.9987755012281311
the
0.998753227689618
links
1.0
between
0.9946146458045795
the
0.998753227689618
energy
1.0
based
0.13815168232123187
economies
0.9974166997448074
and
0.23089371888793853
global
0.006646668699224922
financial
0.14351094841254697
markets
0.04180874080840813
what
0.3415796380162235
happened
0.0448481321139671
in
1.0
these
0.9987715800336268
positions
1.0
in
1.0
english
0.998267069506929
can
0.13173441783815665
these
0.9987715800336268
allophones
0.147498106129343
not
1.0
occur
1.0
in
1.0
these
0.9987715800336268
positions
1.0
in
1.0
english
0.998267069506929
is
1.0
rrb
0.9899592522898898
officially
0.4110379968726944
the
0.998753227689618
state
0.10929117346891715
of
1.

0.3415796380162235
does
1.0
singa
0.33182728143713014
mean
0.10727613067568897
when
0.9963041881540179
is
1.0
the
0.998753227689618
first
1.0
record
1.0
of
1.0
s08_settlement
0.9976262979349999
in
1.0
singapore
0.3492250367919393
which
0.989229291609159
nation
0.13588551212762057
invaded
0.33214462779488485
singapore
0.3492250367919393
during
1.0
world
1.0
war
0.017953326796015023
ii
1.0
why
0.9935569631655717
churchill
0.9882174096028005
called
0.07590788486807809
the
0.998753227689618
occupation
0.15960049735824
of
1.0
singapor
0.34297459641025696
by
0.9949494734783132
janpan
0.10051174422015707
during
1.0
wwii
1.0
britain's
0.3680353993441069
greatest
0.11228516118901766
defeat
0.10244239305654279
was
0.30523138894637036
lee
1.0
kuan
0.10702569589731126
yew
1.0
a
0.0014124300834635894
successful
1.0
leader
0.04502281785309081
of
1.0
singapore
0.3492250367919393
when
0.9963041881540179
did
1.0
goh
0.9837321294181609
chok
0.9846625045281552
tong
1.0
succeed
1.0
lee
1.0
as
0.1303373685

0.20332580628697217
for
1.0
the
0.998753227689618
young
1.0
are
0.20007627195421962
turtles
1.0
pets
1.0
what
0.3415796380162235
shape
0.042995265449381925
are
0.20007627195421962
the
0.998753227689618
eggs
1.0
of
1.0
the
0.998753227689618
larest
0.220516615102602
species
1.0
of
1.0
turtle
1.0
how
0.996570431752036
often
1.0
do
1.0
turtles
1.0
breed
0.9872384323782462
what
0.3415796380162235
do
1.0
all
0.023947563915909353
turtles
1.0
and
0.23089371888793853
tortoises
1.0
breathe
0.5029205758657685
how
0.996570431752036
do
1.0
turtles
1.0
reproduce
1.0
what
0.3415796380162235
has
0.1615900171085909
been
0.9950092451282517
discovered
1.0
about
0.03260813131898477
turtles
1.0
organs
0.1499588874314154
what
0.3415796380162235
suborder
0.99606888798803
of
1.0
turtle
1.0
draws
0.03846576165498272
its
1.0
head
0.040195635671492314
into
1.0
its
1.0
shell
0.9943786801011621
do
1.0
sea
0.04594863211061995
turtles
1.0
lay
0.010818488084989086
eggs
1.0
on
1.0
dry
1.0
sandy
0.25446462548105375
bea

0.0957268294724849
james
0.06951058403320343
buchanan
0.10111474994211778
who
0.9956326880614242
did
1.0
president
1.0
lincoln
1.0
promote
0.9891547529454406
of
1.0
major
0.042741287266996
general
0.017953326796015023
in
1.0
the
0.998753227689618
regular
0.015930571463950827
army
0.008384901769984632
effective
1.0
july
1.0
4
1.0
was
0.30523138894637036
grant
0.11451094364388048
elected
1.0
president
1.0
as
0.1303373685321596
a
0.0014124300834635894
republican
0.15970946898995664
what
0.3415796380162235
was
0.30523138894637036
from
0.9967665720470967
pennsylvania
0.04105782043785078
was
0.30523138894637036
it
1.0
a
0.0014124300834635894
two
1.0
sentence
1.0
description
1.0
that
0.12066819154930475
completely
0.9936038222371705
caught
0.017882903301925768
the
0.998753227689618
essence
1.0
of
1.0
ulysses
1.0
s
1.0
grant
0.11451094364388048
was
0.30523138894637036
grant
0.11451094364388048
's
1.0
favorite
0.11745277588245417
brand
0.2311949007822265
of
1.0
bourbon
0.9910617444905372
whiske

0.040195635671492314
of
1.0
government
0.994739282453898
was
0.30523138894637036
woodrow
1.0
wilson
1.0
the
0.998753227689618
thirtieth
0.9874769873872442
president
1.0
of
1.0
the
0.998753227689618
united
1.0
states
0.18378973776356955
did
1.0
woodrow
1.0
wilson
1.0
create
0.08173292698078183
the
0.998753227689618
league
0.02096420768880125
of
1.0
nations
0.16023166492954544
did
1.0
the
0.998753227689618
u
1.0
s
1.0
join
1.0
the
0.998753227689618
league
0.02096420768880125
of
1.0
nations
0.16023166492954544
where
0.9961788709863583
was
0.30523138894637036
the
0.998753227689618
league
0.02096420768880125
of
1.0
nations
0.16023166492954544
created
0.08825710959340038
when
0.9963041881540179
was
0.30523138894637036
woodrow
1.0
wilson
1.0
born
0.9950092451282517
who
0.9956326880614242
was
0.30523138894637036
president
1.0
when
0.9963041881540179
wilson
1.0
finished
0.9987047725951704
congressional
0.3034393596321616
government
0.994739282453898
what
0.3415796380162235
field
1.0
did
1.0
woo

1.0
s
1.0
1794
1.0
copley
1.0
medal
0.013520177606347272
did
1.0
alessandro
0.1132146084534128
volta
0.050537650386353716
experiment
0.9946601694800603
with
0.9964915808020913
individual
0.3423698789296593
cells
1.0
when
0.9963041881540179
did
1.0
lombardy
0.010405864361872075
come
0.9938650018112621
under
1.0
napoleon
0.08663910851512835
s
1.0
rule
1.0
where
0.9961788709863583
did
1.0
alessandro
0.1132146084534128
volta
0.050537650386353716
publish
0.9965168034095235
alessandro
0.1132146084534128
volta
0.050537650386353716
invention
1.0
of
1.0
the
0.998753227689618
voltaic
0.063042874451673
pile
1.0
battery
0.06885299433994074
did
1.0
alessandro
0.1132146084534128
volta
0.050537650386353716
become
0.9916820752137528
professor
1.0
of
1.0
experimental
0.03615564598458365
physics
0.9964287384479528
at
0.059520466820536355
the
0.998753227689618
university
1.0
of
1.0
pavia
0.004577533354175434
is
1.0
it
1.0
true
1.0
that
0.12066819154930475
alessandro
0.1132146084534128
volta
0.05053765038

0.3415796380162235
was
0.30523138894637036
anders
0.2665179544046743
celsius
1.0
s
1.0
profession
1.0
how
0.996570431752036
old
1.0
was
0.30523138894637036
anders
0.2665179544046743
celsius
1.0
when
0.9963041881540179
he
0.9914556957747004
died
1.0
what
0.3415796380162235
is
1.0
anders
0.2665179544046743
celsius
1.0
s
1.0
last
0.13997763214682246
name

0.3620656570027345
pascal
0.04432668568764642
a
0.0014124300834635894
mathematician
0.15891508552584666
of
1.0
the
0.998753227689618
first
1.0
order
1.0
could
1.0
blaise
0.3620656570027345
pascal
0.04432668568764642
move
0.9874769873872442
without
0.9966879487546988
crutches
0.9951498580706452
has
0.1615900171085909
the
0.998753227689618
name
0.03329131766692328
pascal
0.04432668568764642
been
0.9950092451282517
given
1.0
to
1.0
the
0.998753227689618
si
1.0
unit
1.0
of
1.0
pressure
1.0
from
0.9967665720470967
what
0.3415796380162235
did
1.0
pascal
0.04432668568764642
suffer
1.0
throughout
0.9882174096028005
his
0.9982864054593203
life
1.

0.9896594626623516
born
0.9950092451282517
did
1.0
charles
0.06713659287284235
augustin
0.3503954963062701
de
1.0
coulomb
0.9896594626623516
publish
0.9965168034095235
an
0.09934735764104985
important
0.14479948506932883
investigation
0.3877608831325615
of
1.0
the
0.998753227689618
laws
0.05144753305784999
of
1.0
friction
1.0
was
0.30523138894637036
coulomb
0.9896594626623516
born
0.9950092451282517
in
1.0
angoulême
0.11102866460188243
france
0.11451094364388048
to
1.0
a
0.0014124300834635894
well
1.0
to
1.0
do
1.0
family
0.04644442963087503
was
0.30523138894637036
charles
0.06713659287284235
augustin
0.3503954963062701
de
1.0
coulomb
0.9896594626623516
father
0.5022809897285145
inspector
1.0
of
1.0
the
0.998753227689618
royal
0.009786477880557198
fields
1.0
charles
0.06713659287284235
augustin
0.3503954963062701
de
1.0
coulomb
0.9896594626623516
discovered
1.0
an
0.09934735764104985
inverse
1.0
relationship
0.17500544240245974
of
1.0
what
0.3415796380162235
is
1.0
it
1.0
true
1.0
that

differences
1.0
between
0.9946146458045795
british
0.9967382613184327
and
0.23089371888793853
american
0.04596652304698834
english
0.998267069506929
is
1.0
the
0.998753227689618
flute
1.0
a
0.0014124300834635894
musical
0.0762778695982399
instrument
0.9987528842734676
is
1.0
it
1.0
possible
0.9963041881540179
to
1.0
open
1.0
flutes
1.0
at
0.059520466820536355
one
1.0
or
1.0
both
0.9762392541860054
ends
1.0
are
0.20007627195421962
indian
0.3609310913613094
concert
1.0
flutes
1.0
available
0.0032887858618838584
in
1.0
standard
0.08986654225233637
pitches
0.9964287384479528
what
0.3415796380162235
do
1.0
we
1.0
refer
1.0
musicians
0.218401533990226
who
0.9956326880614242
play
0.02096420768880125
flute
1.0
when
0.9963041881540179
was
0.30523138894637036
a
0.0014124300834635894
three
0.9897112952561079
holed
0.9872384323782462
flute
1.0
made
0.06259235185711698
from
0.9967665720470967
a
0.0014124300834635894
mammoth
0.01971706039093668
tusk
1.0
discovered
1.0
when
0.9963041881540179
did
1.0

0.04644442963087503
to
1.0
occupy
1.0
the
0.998753227689618
physics
0.9964287384479528
chair
0.03068297146720178
at
0.059520466820536355
the
0.998753227689618
museum
0.9940965952404238
national
0.04502281785309081
d'histoire
0.9985738988257448
naturelle
0.07506665381983513
was
0.30523138894637036
henri
0.9943362544434455
becquerel
0.9955096411150537
the
0.998753227689618
sole
1.0
winner
1.0
of
1.0
the
0.998753227689618
1903
1.0
nobel
0.9937384936936221
prize
1.0
in
1.0
physics
0.9964287384479528
did
1.0
henri
0.9943362544434455
becquerel
0.9955096411150537
intentionally
0.3609310913613094
discover
1.0
radioactivity
0.04525452407563202
if
1.0
henri
0.9943362544434455
becquerel
0.9955096411150537
was
0.30523138894637036
alive
0.04409430311538187
today
0.22202480752144704
how
0.996570431752036
old
1.0
would
1.0
he
0.9914556957747004
have
0.11337070701689356
been
0.9950092451282517
for
1.0
how
0.996570431752036
many
0.18088980513486008
years
0.08906289224343966
did
1.0
henri
0.994336254443

0.989229291609159
part
0.027883275252870265
of
1.0
the
0.998753227689618
strings
1.0
does
1.0
the
0.998753227689618
left
1.0
hand
0.2311949007822265
touch
0.9973390292049003
in
1.0
which
0.989229291609159
place
0.017953326796015023
is
1.0
lyre
1.0
still
1.0
played
0.03846576165498272
does
1.0
a
0.0014124300834635894
classical
0.07146561721068945
lyre
1.0
have
0.11337070701689356
a
0.0014124300834635894
fingerboard
0.31998637419545983
is
1.0
a
0.0014124300834635894
guitar
0.10397799316656342
played
0.03846576165498272
with
0.9964915808020913
a
0.0014124300834635894
plectrum
0.9931408635040719
like
1.0
a
0.0014124300834635894
lyre
1.0
was
0.30523138894637036
michael
0.03124175862310763
faraday
0.00782828020503501
an
0.09934735764104985
english
0.998267069506929
chemist
0.9971360536770472
was
0.30523138894637036
faraday
0.00782828020503501
a
0.0014124300834635894
devout
1.0
christian
0.39579203784068384
was
0.30523138894637036
faraday
0.00782828020503501
considered
1.0
a
0.001412430083463

0.005364117804667057
sit
1.0
on
1.0
the
0.998753227689618
ottowa
0.21284183154035086
river
1.0
do
1.0
small
0.06259235185711698
tremors
0.9964579571442543
occur
1.0
in
1.0
ottawa
0.005364117804667057
what
0.3415796380162235
is
1.0
ottawa's
0.021213866188923514
population
0.15833388096105727
what
0.3415796380162235
are
0.20007627195421962
ottawa's
0.021213866188923514
primary
0.022566031805386855
industries
1.0
what
0.3415796380162235
is
1.0
ottawa's
0.021213866188923514
junior
1.0
ice
1.0
hockey
0.9877026457371058
team
0.008384901769984632
what
0.3415796380162235
is
1.0
ottawa's
0.021213866188923514
major
0.042741287266996
league
0.02096420768880125
hockey
0.9877026457371058
team
0.008384901769984632
what
0.3415796380162235
are
0.20007627195421962
carleton
0.25666061073578306
university's
1.0
athletic
0.07812675833199034
teams
0.04087346361240218
called
0.07590788486807809
is
1.0
ottawa
0.005364117804667057
colder
1.0
than
0.13197839621281993
moscow
0.9953221077361217
in
1.0
january
0.

1.0
how
0.996570431752036
many
0.18088980513486008
groups
1.0
are
0.20007627195421962
turtles
1.0
broken
0.9812154810808664
down
1.0
into
1.0
where
0.9961788709863583
are
0.20007627195421962
the
0.998753227689618
only
1.0
surviving
1.0
giant
0.12108916227565147
tortoises
1.0
how
0.996570431752036
do
1.0
turtles
1.0
chew
0.9922224702670546
food
1.0
are
0.20007627195421962
tortoises
1.0
land
0.23758588335726682
based
0.13815168232123187
where
0.9961788709863583
are
0.20007627195421962
turtle
1.0
eggs
1.0
layed
0.014914495376515013
is
1.0
turtle
1.0
soup
1.0
considered
1.0
a
0.0014124300834635894
delicacy
0.063042874451673
are
0.20007627195421962
testudines
1.0
the
0.998753227689618
crown
1.0
group
1.0
of
1.0
the
0.998753227689618
superorder
1.0
chelonia
0.11188928694793387
do
1.0
turtles
1.0
breathe
0.5029205758657685
air
0.02096420768880125
are
0.20007627195421962
harvesting
0.33136135907513387
wild
1.0
turtles
1.0
legal
0.010818488084989086
anywhere
0.198923426842468
approximately
0.02

0.0014124300834635894
german
0.1183939227163785
call
0.031771757737021566
avogadro's
0.03007629735255757
number
0.9910244096742992
is
1.0
amedeo
0.06259235185711698
avogadro
0.013895265404319734
italian
0.056852434415643804
did
1.0
amedeo
0.06259235185711698
avogadro
0.013895265404319734
become
0.9916820752137528
a
0.0014124300834635894
professor
1.0
before
0.9969791322708849
the
0.998753227689618
revolutionary
0.10012411728737014
movements
0.9920129126693888
against
0.1271691881618816
the
0.998753227689618
king
1.0
of
1.0
sardinia
0.12998705124444998
do
1.0
ants
0.14065401633942576
belong
0.9963737080638982
to
1.0
the
0.998753227689618
hymenoptera
0.02247453701430957
order
1.0
are
0.20007627195421962
ants
0.14065401633942576
used
1.0
in
1.0
cuisine
1.0
does
1.0
an
0.09934735764104985
ant's
0.14534443880325887
head
0.040195635671492314
contain
0.34951824471995996
sensory
1.0
organs
0.1499588874314154
how
0.996570431752036
do
1.0
most
0.9969013579844116
ants
0.14065401633942576
travel
0

0.9981104936509584
the
0.998753227689618
headquarters
0.024147766821524197
of
1.0
springer
1.0
does
1.0
the
0.998753227689618
gendarmenmarkt
0.010448619091699474
border
0.9951498580706452
the
0.998753227689618
french
0.9918660647090805
cathedral
0.24639177702449078
is
1.0
the
0.998753227689618
nauen
0.011847278287329566
plain
0.32009261972319625
north
0.9934928517672643
of
1.0
berlin
0.9981104936509584
where
0.9961788709863583
is
1.0
the
0.998753227689618
bust
0.9937384936936221
of
1.0
queen
1.0
nefertiti
1.0
where
0.9961788709863583
does
1.0
the
0.998753227689618
german
0.1183939227163785
president
1.0
live
1.0
when
0.9963041881540179
did
1.0
berlin
0.9981104936509584
give
1.0
up
1.0
its
1.0
status
0.16953171173876014
as
0.1303373685321596
a
0.0014124300834635894
free
1.0
hanseatic
0.05918626294038409
city
1.0
which
0.989229291609159
is
1.0
the
0.998753227689618
busiest
0.9964287384479528
airport
0.058633640095275164
in
1.0
berlin
0.9981104936509584
how
0.996570431752036
many
0.180889

0.3492250367919393
where
0.9961788709863583
it
1.0
is
1.0
called
0.07590788486807809
huayu
0.031646986652023035
how
0.996570431752036
many
0.18088980513486008
head
0.040195635671492314
entries
1.0
for
1.0
character
0.008735295797627773
definitions
1.0
does
1.0
the
0.998753227689618
zhonghua
0.09474084879883526
zihai
0.03260813131898477
20013
1.0
21326
1.0
23383
1.0
28023
1.0
1994
1.0
contain
0.34951824471995996
what
0.3415796380162235
does
1.0
the
0.998753227689618
prc
1.0
government
0.994739282453898
classify
0.20865479046799118
as
0.1303373685321596
literacy
0.09543315996233781
amongst
0.11620359368271238
workers
1.0
how
0.996570431752036
is
1.0
south
0.9942029716067028
china
0.323623720916155
linguistically
0.48603833063718715
different
1.0
from
0.9967665720470967
north
0.9934928517672643
china
0.323623720916155
20415
1.0
24403
1.0
20415
1.0
30070
1.0
lunchbox
0.9799185045797797
or
1.0
boxed
0.9826089148201084
lunch
0.9888403328477327
from
0.9967665720470967
bento
0.9974670959856564

0.9963071650468457
are
0.20007627195421962
usually
0.08906289224343966
played
0.03846576165498272
by
0.9949494734783132
what
0.3415796380162235
has
0.1615900171085909
the
0.998753227689618
design
1.0
of
1.0
drums
0.9963071650468457
changed
0.10815300325234345
recently
1.0
can
0.13173441783815665
some
0.9958637850649407
cylindrical

0.059520466820536355
least
0.14334368172770118
one
1.0
membrane
0.10788742313404454
called
0.07590788486807809
a
0.0014124300834635894
drumhead
0.06409062321543502
or
1.0
drum
0.9934600699566035
skin
1.0
that
0.12066819154930475
is
1.0
stretched
0.9949494734783132
over
1.0
a
0.0014124300834635894
shell
0.9943786801011621
and
0.23089371888793853
struck
1.0
either
0.9987957981936126
directly
1.0
with
0.9964915808020913
parts
0.06216598410949137
of
1.0
a
0.0014124300834635894
player's
0.0800798796067107
body
0.9872384323782462
or
1.0
with
0.9964915808020913
some
0.9958637850649407
sort
1.0
of
1.0
implement
0.9889812179737801
such
0.9935569631655717
as
0.1303373

0.10397799316656342
where
0.9961788709863583
is
1.0
the
0.998753227689618
headstock
0.2519960115927774
located
0.08255611591631806
whom
0.9929403454832446
are
0.20007627195421962
guitars
0.13103673545787653
made
0.06259235185711698
and
0.23089371888793853
repaired
0.03846576165498272
by
0.9949494734783132
what
0.3415796380162235
are
0.20007627195421962
modern
0.9958256624624148
guitar
0.10397799316656342
strings
1.0
constructed
1.0
of
1.0
why
0.9935569631655717
are
0.20007627195421962
harp
0.005605962985514301
guitars
0.13103673545787653
difficult
1.0
to
1.0
classify
0.20865479046799118
what
0.3415796380162235
is
1.0
the
0.998753227689618
bridge
0.9888403328477327
used
1.0
for
1.0
which
0.989229291609159
guitars
0.13103673545787653
use
1.0
three
0.9897112952561079
single
1.0
coil
1.0
pickups
1.0
is
1.0
a
0.0014124300834635894
guitar
0.10397799316656342
an
0.09934735764104985
instrument
0.9987528842734676
can
0.13173441783815665
guitars
0.13103673545787653
be
0.9929001192059979
divided


located
0.08255611591631806
at
0.059520466820536355
the
0.998753227689618
northern
0.9987772319693097
coast
0.1392996483065232
of
1.0
where
0.9961788709863583
in
1.0
1602
1.0
the
0.998753227689618
british
0.9967382613184327
east
0.1386200585032955
india
0.3273363916868355
company's
0.1558684013990861
first
1.0
voyage
0.013895265404319734
commanded
0.2558683405916651
by
0.9949494734783132
sir
1.0
who
0.9956326880614242
arrived
0.015930571463950827
in
1.0
aceh
0.009749289493884428
and
0.23089371888793853
sailed
0.04962514431354692
on
1.0
to
1.0
banten
0.10965316747123399
where
0.9961788709863583
they
0.9987577796300078
were
1.0
allowed
0.036567370392522
to
1.0
build
0.9906077405404332
a
0.0014124300834635894
trading
0.32175947042958786
post
1.0
who
0.9956326880614242
began
0.13197839621281993
control
1.0
of
1.0
migration
0.1474430787665797
to
1.0
the
0.998753227689618
city
1.0
in
1.0
order
1.0
to
1.0
stem
0.9952093712684251
the
0.998753227689618
overcrowding
1.0
and
0.23089371888793853
p

0.3415796380162235
percentage
0.040352974933231756
of
1.0
the
0.998753227689618
korean
0.14467907820330217
language
0.05076536650657659
does
1.0
jeong
1.0
jae
0.004577533354175434
do
1.0
estimate
0.10194183899955434
to
1.0
be
0.9929001192059979
sino
1.0
korean
0.14467907820330217
do
1.0
chinese
0.9981277784797836
and
0.23089371888793853
japanese
0.040352974933231756
have
0.11337070701689356
spaces
0.12108916227565147
between
0.9946146458045795
words
1.0
from
0.9967665720470967
which
0.989229291609159
languages
0.062385437491667695
is
1.0
korean
0.14467907820330217
descended
1.0
are
0.20007627195421962
therebetween
0.9946146458045795
words
1.0
are
0.20007627195421962
the
0.998753227689618
korean
0.14467907820330217
names
0.06951058403320343
for
1.0
the
0.998753227689618
language
0.05076536650657659
based
0.13815168232123187
on
1.0
the
0.998753227689618
names
0.06951058403320343
for
1.0
korea
0.042229119494222145
used
1.0
in
1.0
north
0.9934928517672643
as
0.1303373685321596
well
1.0
as


0.9961656261320389
what
0.3415796380162235
is
1.0
the
0.998753227689618
caridoid
0.1670819474549392
escape
0.04502281785309081
reaction
0.09934735764104985
were
1.0
the
0.998753227689618
recitations
0.22102032008890105
of
1.0
the
0.998753227689618
ancient
0.11524915890880694
greeks
1.0
accompanied
0.03778527033272849
by
0.9949494734783132
lyre
1.0
playing
0.3257104197513566
does
1.0
a
0.0014124300834635894
classical
0.07146561721068945
lyre
1.0
have
0.11337070701689356
a
0.0014124300834635894
hollow
0.9826089148201084
body
0.9872384323782462
is
1.0
the
0.998753227689618
lyre
1.0
a
0.0014124300834635894
stringed
1.0
musical
0.0762778695982399
instrument
0.9987528842734676
which
0.989229291609159
constellation
0.2885856497933874
is
1.0
said
0.04409430311538187
to
1.0
resemble
0.9893653603152052
a
0.0014124300834635894
lyre
1.0
shape
0.042995265449381925
how
0.996570431752036
many
0.18088980513486008
raised
0.36203962456126304
arms
0.03993544232573176
are
0.20007627195421962
extending
1.0

0.9964915808020913
kinematics
0.35552754511128215
was
0.30523138894637036
michael
0.03124175862310763
faraday
0.00782828020503501
active
0.025921382793525738
in
1.0
the
0.998753227689618
area
0.06695866318096244
now
1.0
called
0.07590788486807809
environmental
0.10040010711608494
science
1.0
did
1.0
michael
0.03124175862310763
faraday
0.00782828020503501
discover
1.0
benzene
0.9939582645417697
who
0.9956326880614242
was
0.30523138894637036
michael
0.03124175862310763
faraday
0.00782828020503501
mentor
0.9983062569688781
who
0.9956326880614242
was
0.30523138894637036
faraday's
0.014461894615385273
wife
1.0
did
1.0
michael
0.03124175862310763
faraday
0.00782828020503501
advise
0.3614860796628149
the
0.998753227689618
british
0.9967382613184327
government
0.994739282453898
on
1.0
the
0.998753227689618
production
1.0
of
1.0
chemical
0.0614799922676631
weapons
0.11228516118901766
for
1.0
use
1.0
in
1.0
the
0.998753227689618
crimean
0.11766562347834919
war
0.017953326796015023
did
1.0
farada

magazine
0.12064791396382257
who
0.9956326880614242
became
0.0606049151472734
adversaries
0.03728058403470713
with
0.9964915808020913
nikola
0.019963659862726746
tesla
0.060404791918005296
where
0.9961788709863583
did
1.0
nikola
0.019963659862726746
tesla
0.060404791918005296
move
0.9874769873872442
to
1.0
in
1.0
1880
1.0
was
0.30523138894637036
nikola
0.019963659862726746
tesla's
0.1525623885313333
mother
0.997945297763546
illiterate
0.1544993913189796
how
0.996570431752036
many
0.18088980513486008
siblings
0.9982658391691858
did
1.0
nikola
0.019963659862726746
tesla
0.060404791918005296
have
0.11337070701689356
was
0.30523138894637036
nikola
0.019963659862726746
tesla
0.060404791918005296
a
0.0014124300834635894
vegetarian
0.033458357921607895
is
1.0
there
0.9987789553923376
a
0.0014124300834635894
monument
0.9941237374750427
to
1.0
tesla
0.060404791918005296
at
0.059520466820536355
niagara
0.0024708167824638583
falls
0.058633640095275164
new
1.0
york
1.0
was
0.30523138894637036
tesl

0.24551932440761115
the
0.998753227689618
artists
0.38994789232525173
rights
0.9932883890602011
society
1.0
is
1.0
pablo
0.008718315023598433
picasso
0.20332580628697217
one
1.0
of
1.0
the
0.998753227689618
most
0.9969013579844116
recognized
1.0
figures
1.0
in
1.0
20th
0.9884968783961164
century
1.0
art
0.009786477880557198
is
1.0
pablo
0.008718315023598433
picasso
0.20332580628697217
best
0.9951765808190722
known
1.0
for
1.0
co
1.0
founding
1.0
the
0.998753227689618
cubist
0.9982559012034091
movement
0.9879682786614497
and
0.23089371888793853
for
1.0
the
0.998753227689618
wide
1.0
variety
0.016943509898674325
of
1.0
styles
1.0
embodied
0.9902606806360543
in
1.0
pablo
0.008718315023598433
picasso
0.20332580628697217
work
1.0
are
0.20007627195421962
among
0.08777352620273682
pablo
0.008718315023598433
picasso
0.20332580628697217
most
0.9969013579844116
famous
0.05818928559876779
works
1.0
the
0.998753227689618
proto
1.0
cubist
0.9982559012034091
les
1.0
demoiselles
0.9989307370332343
d'

0.13197839621281993
in
1.0
1977
1.0
how
0.996570431752036
does
1.0
poverty
1.0
in
1.0
san
0.13450843637292464
francisco
0.39637092441649757
compare
0.20541296665558761
to
1.0
the
0.998753227689618
nation
0.13588551212762057
wide
1.0
average
0.00745259440238244
was
0.30523138894637036
mission
0.9989711938239876
bay
0.005605962985514301
campus
0.0537235862363582
opened
1.0
in
1.0
2003
1.0
was
0.30523138894637036
the
0.998753227689618
university
1.0
of
1.0
san
0.13450843637292464
fransisco
0.41363065217432693
founded
1.0
in
1.0
1855
1.0
is
1.0
golden
1.0
gate
0.06391730966367948
park
0.011847278287329566
the
0.998753227689618
largest
0.08906289224343966
city
1.0
park
0.011847278287329566
what
0.3415796380162235
is
1.0
northern
0.9987772319693097
california's
0.06609273135948845
most
0.9969013579844116
widely
1.0
circulated
0.0946441787045934
newspaper
0.06825305641016932
what
0.3415796380162235
makes
0.051013481172676234
san
0.13450843637292464
francisco
0.39637092441649757
among
0.087773

0.9952093712684251
fully
1.0
chromatic
0.08503084317299481
were
1.0
trumpet
0.9952093712684251
players
0.058633640095275164
heavily
0.04206477934280317
guarded
0.050537650386353716
are
0.20007627195421962
pocket
1.0
trumpets
0.9966329823188754
compact
0.03329131766692328
b
0.9734346785819072
trumpets
0.9966329823188754
was
0.30523138894637036
dizzy
1.0
gillespie
1.0
a
0.0014124300834635894
famous
0.05818928559876779
trumpeter
0.9956619011781762
in
1.0
1998
1.0
what
0.3415796380162235
shape
0.042995265449381925
is
1.0
a
0.0014124300834635894
trumpet
0.9952093712684251
bent
0.9932883890602011
into
1.0
what
0.3415796380162235
trumpet
0.9952093712684251
was
0.30523138894637036
the
0.998753227689618
first
1.0
to
1.0
be
0.9929001192059979
allowed
0.036567370392522
in
1.0
the
0.998753227689618
christian
0.39579203784068384
church
0.9867173392909536
the
0.998753227689618
trumpet
0.9952093712684251
can
0.13173441783815665
be
0.9929001192059979
confused
1.0
with
0.9964915808020913
what
0.3415796

been
0.9950092451282517
borrowed
0.9933586696454768
from
0.9967665720470967
chinese
0.9981277784797836
is
1.0
vietnam
0.017571446560694204
independent
1.0
from
0.9967665720470967
france
0.11451094364388048
was
0.30523138894637036
vietnamese
0.09729267163520217
formally
0.16896617669517133
written
1.0
using
1.0
the
0.998753227689618
chinese
0.9981277784797836
writing
1.0
system
0.99743482839163
does
1.0
vietnamese
0.09729267163520217
have

0.998753227689618
circle
1.0
of
1.0
the
0.998753227689618
british
0.9967382613184327
australian
0.02096420768880125
artist
0.3645128810741525
john
0.9907975027168932
peter
1.0
russell
1.0
were
1.0
the
0.998753227689618
letters
1.0
first
1.0
annotated
0.05325947829772304
in
1.0
1913
1.0
by
0.9949494734783132
theo
0.9987536851368213
s
1.0
widow
1.0
johanna
0.030017607944620273
van
0.10550573853502454
gogh
0.9891547529454406
bonger
0.9951765808190722
was
0.30523138894637036
kee
1.0
seven
1.0
years
0.08906289224343966
older
1.0
than
0.13197839621281993
va

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = list('|'.join(text.split()).split('|'))

vectorizer = CountVectorizer()
print( vectorizer.fit_transform(corpus).todense() )
print( vectorizer.vocabulary_ )

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


{'was': 3965, 'abraham': 181, 'lincoln': 2207, 'the': 3671, 'sixteenth': 3390, 'president': 2912, 'of': 2601, 'united': 3838, 'states': 3489, 'did': 1135, 'sign': 3359, 'national': 2494, 'banking': 463, 'act': 208, '1863': 63, 'mother': 2446, 'die': 1136, 'pneumonia': 2828, 'how': 1864, 'many': 2308, 'long': 2242, 'formal': 1560, 'education': 1258, 'when': 3999, 'begin': 507, 'his': 1831, 'political': 2841, 'career': 681, 'what': 3997, 'legal': 2170, 'tender': 3650, '1862': 62, 'establish': 1353, 'who': 4009, 'suggested': 3554, 'grow': 1735, 'beard': 491, 'gettysburg': 1672, 'address': 218, 'argue': 372, 'that': 3670, 'america': 295, 'born': 576, 'beat': 493, 'john': 2045, 'breckinridge': 600, 'in': 1909, '1860': 61, 'election': 1277, 'first': 1518, 'start': 3484, '1832': 53, 'ever': 1378, 'represent': 3108, 'alton': 288, 'sangamon': 3236, 'railroad': 3012, 'which': 4002, 'county': 973, 'serve': 3316, 'as': 392, 'assassinated': 399, 'win': 4026, 'general': 1650, 'charge': 750, 'at': 40

# Алгоритм Шинглов — поиск нечетких дубликатов текста


In [16]:
def genshingle(source):
    import binascii
    shingleLen = 3 #длина шингла - 3--5--7
    out = [] 
    for i in range(len(source)-(shingleLen-1)):
        out.append (binascii.crc32(' '.join( [x for x in source[i:i+shingleLen]] ).encode('utf-8')))

    return out

def compaire(source1,source2):
    same = 0
    for i in range(len(source1)):
        if source1[i] in source2:
            same = same + 1

    return same*2/float(len(source1) + len(source2))*100

def test():
    text1 = exp.iloc[0]["Theme column"] # Текст 1 для сравнения - abraham lincoln sixteenth president united states
    text2 = 'lincoln sixteenth president' # Текст 2 для сравнения - обработанный
    text3 = 'was abraham linсoln the 16\'th president of Ameriсa' # Текст 3 для сравнения -не очищенный
    text4 = 'barak obama' # Текст 4 - совсем другой

    cmp1 = genshingle(text1)
    cmp2 = genshingle(text2)
    cmp3 = genshingle(text3)
    cmp4 = genshingle(text4)

    print ('\n'+text1)
    print (text2)


    print ('\n pretty near '+str(compaire(cmp1,cmp2)))
    
    print ('\n'+text1)
    print (text3)
    print ('\n Not pretty near '+str(compaire(cmp1,cmp3)))
    
    print ('\n'+text1)
    print (text4)
    print ('\n Not near almost '+str(compaire(cmp1,cmp4)))

# Start program
test()


 abraham lincoln sixteenth president united states
lincoln sixteenth president

 pretty near 68.4931506849315

 abraham lincoln sixteenth president united states
was abraham linсoln the 16'th president of Ameriсa

 Not pretty near 50.0

 abraham lincoln sixteenth president united states
barak obama

 Not near almost 0.0


# TF, IDF и TF-IDF

In [17]:
def compute_tf(text):
    #Считаем частотность всех терминов во входном массиве с помощью 
    #метода Counter библиотеки collections, но уже в очищенной Theme column
    tf_text = collections.Counter(text)
    for i in tf_text:
        tf_text[i] = tf_text[i]/float(len(text))
    return tf_text
def compute_idf(word, corpus):
#на вход берется слово, для которого считаем IDF
#и корпус документов в виде списка списков слов
        #количество документов, где встречается искомый термин
        #считается как генератор списков
        return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i]))
def compute_idf_another(word, corpus):
    data = [Counter(i) for i in corpus if word in i]
    final_counter = Counter()
    for i in data:
        final_counter += i
    most_common_word = final_counter.most_common(1)[0][1]
    return math.log10(1 + (most_common_word/float(sum([1 for i in corpus if word in i]))))

def compute_tfidf(corpus):

    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list

def compute_tfidf_another(corpus):

    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf_another(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list


In [18]:
text = exp["Theme column"]
text_separated_by_word = list(map(lambda x: x.split(),'|'.join(text).split('|')))
tf=compute_tf(text)
#print(tf)
 

In [19]:
print(compute_idf('president',text_separated_by_word))


1.8656960599160706


In [20]:
tfidf=compute_tfidf(text_separated_by_word)

print(tfidf)

[{'abraham': 0.45679288721796174, 'lincoln': 0.3333991056355877, 'sixteenth': 0.5571362191059555, 'president': 0.3109493433193451, 'united': 0.35206139887624316, 'states': 0.356449555329968}, {'lincoln': 0.3333991056355877, 'sign': 0.4776160099860117, 'national': 0.3479241349220711, 'banking': 0.5571362191059555, 'act': 0.4406412183832856, '1863': 0.5571362191059555}, {'abraham': 0.5481514646615542, 'lincoln': 0.40007892676270534, 'mother': 0.46856346292714657, 'die': 0.41750896190648534, 'pneumonia': 0.6685634629271466}, {"lincoln's": 0.9552320199720234, 'formal': 0.9552320199720234, 'education': 0.8812824367665713}, {'lincoln': 0.5000986584533816, 'begin': 0.6244298186553691, 'political': 0.597143701299102, 'career': 0.7164240149790176}, {'legal': 0.5481514646615542, 'tender': 0.6685634629271466, 'act': 0.5287694620599428, '1862': 0.6685634629271466, 'establish': 0.5731392119832142}, {'suggested': 0.8357043286589333, 'lincoln': 0.5000986584533816, 'grow': 0.7164240149790176, 'beard':




In [21]:

tfidf_another=compute_tfidf_another(text_separated_by_word)

 
print(tfidf_another)

[{'abraham': 0.050171665943996864, 'lincoln': 0.050171665943996864, 'sixteenth': 0.050171665943996864, 'president': 0.052545072463098565, 'united': 0.050171665943996864, 'states': 0.050171665943996864}, {'lincoln': 0.050171665943996864, 'sign': 0.050171665943996864, 'national': 0.050171665943996864, 'banking': 0.050171665943996864, 'act': 0.050171665943996864, '1863': 0.050171665943996864}, {'abraham': 0.06020599913279624, 'lincoln': 0.06020599913279624, 'mother': 0.06020599913279624, 'die': 0.06020599913279624, 'pneumonia': 0.06020599913279624}, {"lincoln's": 0.10034333188799373, 'formal': 0.10034333188799373, 'education': 0.10034333188799373}, {'lincoln': 0.0752574989159953, 'begin': 0.0752574989159953, 'political': 0.0752574989159953, 'career': 0.0752574989159953}, {'legal': 0.06020599913279624, 'tender': 0.06020599913279624, 'act': 0.06020599913279624, '1862': 0.06020599913279624, 'establish': 0.06020599913279624}, {'suggested': 0.0752574989159953, 'lincoln': 0.0752574989159953, 'g




# Косинусное расстояние

In [22]:
#немного переделенные методы для tf-idf 
import operator


def tokenize(doc):
    words = [word.replace(',', '').lower() for word in doc.split()]
    return words


def build_terms(corpus):
    terms = {}
    current_index = 0
    for doc in corpus:
        for word in tokenize(doc):
            if word not in terms:
                terms[word] = current_index
                current_index += 1
    return terms


def tf(document, terms):
    words = tokenize(document)
    total_words = len(words)
    doc_counter = Counter(words)
    for word in doc_counter:
        # Можно и не делить, а оставить как есть, с частотой
        doc_counter[word] /= total_words
    tfs = [0 for _ in range(len(terms))]
    for term, index in terms.items():
        tfs[index] = doc_counter[term]
    return tfs


def _count_docs_with_word(word, docs):
    counter = 1
    for doc in docs:
        if word in doc:
            counter += 1
    return counter


def idf(documents, terms):
    idfs = [0 for _ in range(len(terms))]
    total_docs = len(documents)
    for word, index in terms.items():
        docs_with_word = _count_docs_with_word(word, documents)
        idf = 1 + math.log10(total_docs / docs_with_word)
        idfs[index] = idf
    return idfs


def _merge_td_idf(tf, idf, terms):
    return [tf[i] * idf[i] for i in range(len(terms))]


def build_tfidf(corpus, document, terms):
    doc_tf = tf(document, terms)
    doc_idf = idf(corpus, terms)
    return _merge_td_idf(doc_tf, doc_idf, terms)


def cosine_similarity(vec1, vec2):
    def dot_product2(v1, v2):
        return sum(map(operator.mul, v1, v2))

    def vector_cos5(v1, v2):
        prod = dot_product2(v1, v2)
        len1 = math.sqrt(dot_product2(v1, v1))
        len2 = math.sqrt(dot_product2(v2, v2))
        return prod / (len1 * len2)
         

    return vector_cos5(vec1, vec2)







In [23]:
tf_idf_total = []
corpus = (tuple(text))[:100]
terms = build_terms(corpus)

for document in corpus:
    tf_idf_total.append(build_tfidf(corpus, document, terms))

#for doc_rating in tf_idf_total:
    #print(doc_rating)
print(terms.keys())
query = 'president'
print("QUERY:",query )
query_tfidf = build_tfidf(corpus, query, terms)
for index, document in enumerate(tf_idf_total):
    print("Similarity with DOC", index, "=", cosine_similarity(query_tfidf, document))

dict_keys(['abraham', 'lincoln', 'sixteenth', 'president', 'united', 'states', 'sign', 'national', 'banking', 'act', '1863', 'mother', 'die', 'pneumonia', "lincoln's", 'formal', 'education', 'begin', 'political', 'career', 'legal', 'tender', '1862', 'establish', 'suggested', 'grow', 'beard', 'gettysburg', 'address', 'argue', 'america', 'born', 'beat', 'john', 'c.', 'breckinridge', '1860', 'election', 'start', '1832', 'represent', 'alton', 'sangamon', 'railroad', 'county', 'serve', 'assassinated', 'win', 'general', 'charge', 'battle', 'antietam', 'issue', 'emancipation', 'proclamation', 'scholars', 'rank', 'presidents', '18', 'months', 'schooling', 'chosen', 'presidential', 'candidate', 'old', '1816', 'photgraph', 'taken', 'trail', 'use', "farmers'", 'almanac', 'live', 'frontier', "wife's", 'family', 'support', 'slavery', 'noted', 'amedeo', 'avogadro', 'contributions', 'theory', 'molarity', 'molecular', 'weight', 'graduated', 'ecclesiastical', 'law', 'early', 'age', '20', 'began', 'prac

# Расстояние Ливенштейна

In [24]:
exmpl = exp.iloc[0]["Theme column"]
print(exmpl)
print(fuzz.token_sort_ratio(exmpl, 'lincoln sixteenth president'))
print(fuzz.token_set_ratio(exmpl, 'lincoln sixteenth president '))


 abraham lincoln sixteenth president united states
71
100


# Использование моделей для непосредственного поиска

In [25]:
input_example = 'Did the election of 1880 was won by Lincoln?'

#clear input
clear_input_exmpl=''
new_row = ''

#Stop words

for word in  input_example.split():
   
    if word not in my_stop_words:
        new_row += ' '+word
        clear_input_exmpl=new_row.strip() .lower().replace('?','')

print(clear_input_exmpl) 


did election 1880 won lincoln


## Левенштейн

In [26]:
levin_counter=0
levin_question=''
for question in text:
    if(fuzz.token_set_ratio(question, clear_input_exmpl)>=levin_counter):
        levin_counter=fuzz.token_set_ratio(question, clear_input_exmpl)
#       levin_counter=fuzz.token_sort_ratio(question, clear_input_exmpl)
        levin_question= question

        
print('it is most similar to:')
print(levin_question)
print('Levenshtein distance is:')
print(levin_counter)
print('Quesion And Answer are:')
exp.loc[exp['Theme column'] == levin_question, ['Question','Answer']]

it is most similar to:
 lincoln win election 1860
Levenshtein distance is:
85
Quesion And Answer are:


Unnamed: 0,Question,Answer
30,Did Lincoln win the election of 1860?,Yes


#### Поиск возможных ответов с пониженным расстоянием Левенштейна

In [27]:
possible_delta= 2
all_levenstain_distance = list((map(lambda x: fuzz.token_set_ratio(x, clear_input_exmpl),exp['Theme column'])))
                             #  >levin_counter-possible_delta,exp['Theme column']))

lev_possible_answers = list(map(lambda x: x>= levin_counter-possible_delta,all_levenstain_distance))
lev_possible_answers_with_distance = list(zip(exp['Theme column'][lev_possible_answers],
                                          exp['Answer'][lev_possible_answers],
                                          pd.Series(all_levenstain_distance)[lev_possible_answers]))
print('\nAlso possible questions are:')
lev_possible_answers_with_distance


Also possible questions are:


[(' lincoln win election 1860', 'Yes', 85)]

## Шинглы

In [28]:
shingle_counter=0
shingle_question=''
main_cmp = genshingle(clear_input_exmpl)
for question in text:
    tmp_cmp = genshingle(question)
    similatrity=compaire(main_cmp,tmp_cmp)
    if(similatrity>shingle_counter):
            shingle_counter=similatrity
            shingle_question=question
print('it is most similar to:')
print(shingle_question)
print('Shingle similarity procent is:')
print(shingle_counter)
print('Quesion And Answer are:')
exp.loc[exp['Theme column'] == shingle_question, ['Question','Answer']]

it is most similar to:
 lincoln win election 1860
Shingle similarity procent is:
66.66666666666666
Quesion And Answer are:


Unnamed: 0,Question,Answer
30,Did Lincoln win the election of 1860?,Yes


#### Поиск возможных ответов с пониженным показетелем схожести Шинглов

In [29]:
possible_delta= 0.05
all_shingle_distance = list((map(lambda x: compaire(main_cmp,genshingle(x)),exp['Theme column'])))
                           

Sh_possible_answers = list(map(lambda x: x>= shingle_counter-possible_delta,all_shingle_distance))
Sh_possible_answers_with_distance = list(zip(exp['Theme column'][Sh_possible_answers],
                                          exp['Answer'][Sh_possible_answers],
                                          pd.Series(all_shingle_distance)[Sh_possible_answers]))
print('\nAlso possible questions are:')
Sh_possible_answers_with_distance


Also possible questions are:


[(' lincoln win election 1860', 'Yes', 66.66666666666666)]

## TF-IDF

In [30]:
print('See is it a low coefficient in these words?:')
print('\n for Levenstain')
print(pd.Series(tfidf)[lev_possible_answers].values[0])
print(pd.Series(tfidf_another)[lev_possible_answers].values[0])
###
print('\n for Shingles')
print(pd.Series(tfidf)[Sh_possible_answers].values[0])
print(pd.Series(tfidf_another)[Sh_possible_answers].values[0])

See is it a low coefficient in these words?:

 for Levenstain
{'lincoln': 0.5000986584533816, 'win': 0.565909017147027, 'election': 0.6411665160630223, '1860': 0.6851893308269427}
{'lincoln': 0.0752574989159953, 'win': 0.0752574989159953, 'election': 0.0752574989159953, '1860': 0.0752574989159953}

 for Shingles
{'lincoln': 0.5000986584533816, 'win': 0.565909017147027, 'election': 0.6411665160630223, '1860': 0.6851893308269427}
{'lincoln': 0.0752574989159953, 'win': 0.0752574989159953, 'election': 0.0752574989159953, '1860': 0.0752574989159953}


In [31]:
clear_input_exmpl


'did election 1880 won lincoln'

# Проверка данных на тестовой выборке

## Очистка и обработка данных

In [32]:
test_answers_1 = pd.read_csv('test_questions.csv', sep=';', encoding='latin-1')

test_answers = pd.concat([test_answers_1])

test_exp=test_answers.copy()
test_exp['Theme column'] = ''
#remove words
for index, row in test_exp.iterrows():
    row['ArticleTitle'] = row['ArticleTitle'].replace('_', ' ')
    row['Answer'] = row['Answer'].lower()    
    title_words = list(map(lambda x: x.lower(),row['ArticleTitle'].split()))
    
    
    doc = nlp(row['Question'])
    infinitives = [token.lemma_ for token in doc]
      
    infinitives = list(map(lambda x: x.lower(), infinitives))
    has_title_in_question = any(map(lambda x: x in list(map(lambda x: x.lower(), infinitives)), title_words))
    
    if (not has_title_in_question):      
        s = row['Question'].lower()  
        row['Theme column'] = re.sub(r'( he )|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip()
#move all to Tc
for index, row in test_exp.iterrows():
    row['Theme column'] = row['Theme column'].lower() if row['Theme column'] != '' else row['Question'].lower() 
    row['Theme column'] =   row['Theme column'].replace('?','').strip() 
#Stop words
for index, row in test_exp.iterrows():
    new_row = ''
    for word in  row['Theme column'].split():
        if word not in my_stop_words:
            new_row += ' '+word
    row['Theme column']=new_row    
test_exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Did John C. Breckinridge was beated by Lincoln...,yes,john c. breckinridge beated lincoln 1860 elec...
1,Abraham Lincoln,Did Lincoln's political career was started in ...,yes,lincoln's political career started 1832
2,Abraham Lincoln,Did the election of 1880 was won by Lincoln?,yes,election 1880 won lincoln
3,Abraham Lincoln,For what reason the Emancipation Proclamation...,to free slaves,reason emancipation proclamation issued lincoln
4,Abraham Lincoln,Is Lincoln the first President of the United S...,no,lincoln president united states
5,Abraham Lincoln,Was pneumonia a cause of death of Lincoln's mo...,no,pneumonia cause death lincoln's mother
6,Abraham Lincoln,Was the National Banking Act of 1863 signed by...,yes,national banking act 1863 signed lincoln
7,Abraham Lincoln,When Lincoln's political career was started?,1832,lincoln's political career started
8,Alessandro Volta,Did Austria rule Lombard before 1796?,yes,austria rule lombard 1796
9,Alessandro Volta,Did electricity was Volta's passion?,yes,electricity volta's passion


# Для каждого способа составим вероятный вопрос
### В столбцах содержатся вероятные вопросы

In [33]:
test_exp['Levin_question'] = '' #предикт вопроса по левенштейну
test_exp['Levin_answer'] = ''
test_exp['Shingles_question'] = '' #предикт вопроса по шинглам
test_exp['Shingles_answer'] = ''

for index, row in test_exp.iterrows():
    
    #levin
    levin_counter=0
    levin_question=''
    for question in text:
        if(fuzz.token_set_ratio(question, row['Theme column'])>=levin_counter):
            levin_counter=fuzz.token_set_ratio(question, row['Theme column'])
            levin_question= question
    row['Levin_question'] = str(exp.loc[exp['Theme column'] == levin_question, ['Question']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    row['Levin_answer'] = str(exp.loc[exp['Theme column'] == levin_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    
    # shingle
    shingle_counter=0
    shingle_question=''
    main_cmp = genshingle(row['Theme column'])
    for question in text:
        tmp_cmp = genshingle(question)
        similatrity=compaire(main_cmp,tmp_cmp)
        if(similatrity>shingle_counter):
            shingle_counter=similatrity
            shingle_question=question
    #print(shingle_question)      
    row['Shingles_question'] = str(exp.loc[exp['Theme column'] == shingle_question, ['Question']].values[0]).replace('[\'','').replace('\']','').lower()
    row['Shingles_answer'] = str(exp.loc[exp['Theme column'] == shingle_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').lower()


### Поскольку cos-ое расстояние и список всех векторов требует неадекватно много времени:
## Сравним алгоритмы со стандартным-реализованным решением


In [34]:
import difflib

def similarity(s1, s2):
    normalized1 = s1.lower()
    normalized2 = s2.lower()
    matcher = difflib.SequenceMatcher(None, normalized1, normalized2)
    return matcher.ratio()


In [35]:
test_exp['difflib_question'] = '' #предикт вопроса по difflib
test_exp['difflib_answer'] = ''
difflib_counter=0
difflib_question=''
for index, row in test_exp.iterrows():
    for question in text:
        if(similarity(question, row['Theme column'])>=difflib_counter):
            difflib_counter=similarity(question, row['Theme column'])
            difflib_question= question
    row['difflib_question'] = str(exp.loc[exp['Theme column'] == difflib_question, ['Question']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    row['difflib_answer'] = str(exp.loc[exp['Theme column'] == difflib_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()


In [36]:
test_exp.head(50)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column,Levin_question,Levin_answer,Shingles_question,Shingles_answer,difflib_question,difflib_answer
0,Abraham Lincoln,Did John C. Breckinridge was beated by Lincoln...,yes,john c. breckinridge beated lincoln 1860 elec...,did lincoln beat john c. breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...,yes
1,Abraham Lincoln,Did Lincoln's political career was started in ...,yes,lincoln's political career started 1832,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes
2,Abraham Lincoln,Did the election of 1880 was won by Lincoln?,yes,election 1880 won lincoln,did lincoln win the election of 1860?,yes,did lincoln win the election of 1860?,yes,did lincoln start his political career in 1832?,yes
3,Abraham Lincoln,For what reason the Emancipation Proclamation...,to free slaves,reason emancipation proclamation issued lincoln,why did lincoln issue the emancipation proclam...,to free slaves,why did lincoln issue the emancipation proclam...,to free slaves,did lincoln start his political career in 1832?,yes
4,Abraham Lincoln,Is Lincoln the first President of the United S...,no,lincoln president united states,was abraham lincoln the first president of the...,no,was abraham lincoln the first president of the...,no,was abraham lincoln the first president of the...,no
5,Abraham Lincoln,Was pneumonia a cause of death of Lincoln's mo...,no,pneumonia cause death lincoln's mother,did his mother die of pneumonia?,no,did his mother die of pneumonia?,no,was abraham lincoln the first president of the...,no
6,Abraham Lincoln,Was the National Banking Act of 1863 signed by...,yes,national banking act 1863 signed lincoln,did lincoln sign the national banking act of 1...,yes,did lincoln sign the national banking act of 1...,yes,was abraham lincoln the first president of the...,no
7,Abraham Lincoln,When Lincoln's political career was started?,1832,lincoln's political career started,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes,was abraham lincoln the first president of the...,no
8,Alessandro Volta,Did Austria rule Lombard before 1796?,yes,austria rule lombard 1796,"before 1796, was lombardy ruled by austria?",yes,"before 1796, was lombardy ruled by austria?",yes,was abraham lincoln the first president of the...,no
9,Alessandro Volta,Did electricity was Volta's passion?,yes,electricity volta's passion,is it true that his passion been always the st...,yes,did volta have a passion for the study of elec...,yes,was abraham lincoln the first president of the...,no


In [37]:
# 0 если ответы не сошлись, 1 если сошлись
test_exp['Levin_count'] = ''
test_exp['Shingle_count'] = ''
test_exp['difflib_count'] = ''
for index, row in test_exp.iterrows():
    
    if(row['Levin_answer']==row['Answer']):
        row['Levin_count']=1
    else:
        row['Levin_count']=0

    if(row['Shingles_answer']==row['Answer']):
        row['Shingle_count']=1
    else:
        row['Shingle_count']=0
    
    if(row['difflib_answer']==row['Answer']):
        row['difflib_count']=1
    else:
        row['difflib_count']=0


In [38]:
test_exp.head(50)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column,Levin_question,Levin_answer,Shingles_question,Shingles_answer,difflib_question,difflib_answer,Levin_count,Shingle_count,difflib_count
0,Abraham Lincoln,Did John C. Breckinridge was beated by Lincoln...,yes,john c. breckinridge beated lincoln 1860 elec...,did lincoln beat john c. breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...,yes,1,1,1
1,Abraham Lincoln,Did Lincoln's political career was started in ...,yes,lincoln's political career started 1832,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes,1,1,1
2,Abraham Lincoln,Did the election of 1880 was won by Lincoln?,yes,election 1880 won lincoln,did lincoln win the election of 1860?,yes,did lincoln win the election of 1860?,yes,did lincoln start his political career in 1832?,yes,1,1,1
3,Abraham Lincoln,For what reason the Emancipation Proclamation...,to free slaves,reason emancipation proclamation issued lincoln,why did lincoln issue the emancipation proclam...,to free slaves,why did lincoln issue the emancipation proclam...,to free slaves,did lincoln start his political career in 1832?,yes,1,1,0
4,Abraham Lincoln,Is Lincoln the first President of the United S...,no,lincoln president united states,was abraham lincoln the first president of the...,no,was abraham lincoln the first president of the...,no,was abraham lincoln the first president of the...,no,1,1,1
5,Abraham Lincoln,Was pneumonia a cause of death of Lincoln's mo...,no,pneumonia cause death lincoln's mother,did his mother die of pneumonia?,no,did his mother die of pneumonia?,no,was abraham lincoln the first president of the...,no,1,1,1
6,Abraham Lincoln,Was the National Banking Act of 1863 signed by...,yes,national banking act 1863 signed lincoln,did lincoln sign the national banking act of 1...,yes,did lincoln sign the national banking act of 1...,yes,was abraham lincoln the first president of the...,no,1,1,0
7,Abraham Lincoln,When Lincoln's political career was started?,1832,lincoln's political career started,did lincoln start his political career in 1832?,yes,did lincoln start his political career in 1832?,yes,was abraham lincoln the first president of the...,no,0,0,0
8,Alessandro Volta,Did Austria rule Lombard before 1796?,yes,austria rule lombard 1796,"before 1796, was lombardy ruled by austria?",yes,"before 1796, was lombardy ruled by austria?",yes,was abraham lincoln the first president of the...,no,1,1,0
9,Alessandro Volta,Did electricity was Volta's passion?,yes,electricity volta's passion,is it true that his passion been always the st...,yes,did volta have a passion for the study of elec...,yes,was abraham lincoln the first president of the...,no,1,1,0


In [39]:
print('Левенштейн точность')
test_exp['Levin_count'].sum()/test_exp['Levin_count'].count()

Левенштейн точность


0.6666666666666666

In [40]:
print('Шинглы точность')
test_exp['Shingle_count'].sum()/test_exp['Shingle_count'].count()

Шинглы точность


0.6976744186046512

In [41]:
print('difflib точность')
test_exp['difflib_count'].sum()/test_exp['difflib_count'].count()

difflib точность


0.11627906976744186

In [42]:

trace1 = go.Bar(
    x=['Шинглы', 'Левенштейн', 'difflib'],
    y=[test_exp['Levin_count'].sum(), test_exp['Shingle_count'].sum(), test_exp['difflib_count'].sum()],
    name='Успех'
)
trace2 = go.Bar(
    x=['Шинглы', 'Левенштейн', 'difflib'],
    y=[test_exp['Levin_count'].count()-test_exp['Levin_count'].sum(),test_exp['Shingle_count'].count()- test_exp['Shingle_count'].sum(), test_exp['difflib_count'].count()-test_exp['difflib_count'].sum()],
    name='Ошибка'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~MaximGilman/0 or inside your plot.ly account where it is named 'grouped-bar'



Consider using IPython.display.IFrame instead



In [43]:
test_exp.to_csv('result.csv', sep=';')