In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install fuzzywuzzy[speedup]
!pip install plotly

In [27]:
import pandas as pd
import re
#import pymorphy2
import math
import spacy
from collections import Counter
import numpy as np
import scipy.spatial
import collections
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import plotly.plotly as py
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
import plotly.graph_objs as go
plotly.tools.set_credentials_file(username='MaximGilman', api_key='vlKaZaRyFpNaDPYD0hiv')


# Чтение данных
Сет распределен по нескольким файлам, поэтому необходимо их объединить.
Сет содержит данные о сложности вопроса по мнению спрашивающего и отвечающего и номер статьи, из которой взят ответ - нам это не нужно

In [33]:
answers1 = pd.read_csv('S08_question_answer_pairs.txt', sep='\t')
answers2 = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')
answers3 = pd.read_csv('S10_question_answer_pairs.txt', sep='\t', encoding='latin-1')

answers = pd.concat([answers1, answers2, answers3])
answers.drop(labels=['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile'], axis=1, inplace=True)
answers.head(20)

Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.


In [34]:
print('Наличие null:', answers.isnull().values.any())
print('Наличие NaN:', answers.isna().values.any())

Наличие null: True
Наличие NaN: True


# Первичная обработка данных
## Обработка дубликатов и пустых значений

1. Сет содержит вопросы без ответа
2. Сет содержит повторяющиеся вопросы 

Дубликаты неоходимо удалить - они не несут дополнительной смысловой нагрузки, но увеличивают объем для обработки.
Пары с пустыми значениями (без ответа) не могут принеси пользы для модели + нельзя выполнить замену для пустых начени, поскольку необходимо знать правилный ответ. В результате остается только удалить такие вопросы.

Сначала удаляем вопросы без ответа, затем повторяющиеся вопросы

In [35]:
print('Размер начального сета', answers.size)

#предполагается, что нам необходимо заполнение всех полей
not_null_answers = answers.dropna(axis=0, how='any')
print('Размер сета без пустых значений', not_null_answers.size)

# при удалении дупликатов оставляем только первое вхождение
no_duplicates_answers = not_null_answers.drop_duplicates(subset='Question', keep='first')
print('Размер сета без дубликатов', no_duplicates_answers.size)
no_duplicates_answers.head(20)


Размер начального сета 11991
Размер сета без пустых значений 10263
Размер сета без дубликатов 6606


Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
10,Abraham_Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren..."
12,Abraham_Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell
14,Abraham_Lincoln,When did the Gettysburg address argue that Ame...,1776
16,Abraham_Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes
18,Abraham_Lincoln,Was Abraham Lincoln the first President of the...,No


In [36]:
print('Наличие null:', no_duplicates_answers.isnull().values.any())
print('Наличие NaN:', no_duplicates_answers.isna().values.any())

Наличие null: False
Наличие NaN: False


## Обработка притяжательных местоимений

1. В сете содержатся указания на одушевленный предмет статьи -his/her, he/she 
#####    (напр. Was his (Alessandro_Volta) 1800 paper written in French ?)
2. В сете содержатся указания на неодушевленный предмет статьи -this
#####    (напр. 	What connected the Akans to this (Ghana) Empire?)

Производится замена местоимений на тему, однако только в тех случаях, когда тема не содержится в вопросе и это местоимение является явным на неё указателем: 
##### (напр. "When did Lincoln begin his political career?").
В данном случае замена не производится.

In [37]:
nlp = spacy.load('en_core_web_sm')

#no_duplicates_answers['Question'] =no_duplicates_answers['Question'].replace(regex=[r' his | her | this '], 
#                                                                           value=' замена ')
exp = no_duplicates_answers.copy()
exp['Theme column'] = ''
for index, row in exp.iterrows():
    row['ArticleTitle'] = row['ArticleTitle'].replace('_', ' ')
        
    title_words = list(map(lambda x: x.lower(),row['ArticleTitle'].split()))
    
    #question_words = re.findall(r"[a-zA-Z]+-[a-zA-Z]+|[a-zA-Z]+\'[a-zA-Z]+|[a-zA-Z]+", row['Question'])
    
    doc = nlp(row['Question'])
    infinitives = [token.lemma_ for token in doc]
    '''
    infinitives = []
    morph = pymorphy2.MorphAnalyzer()
    for word in question_words:
        word = word.lower()
        infinitives.append(morph.parse(word)[0].normal_form)        
    '''    
    
    infinitives = list(map(lambda x: x.lower(), infinitives))
    has_title_in_question = any(map(lambda x: x in list(map(lambda x: x.lower(), infinitives)), title_words))
    
    if (not has_title_in_question):      
        '''
        print(title_words)
        print(infinitives)
        print(row['ArticleTitle'], row['Question'])
        s = row['Question'].lower()        
        print(re.sub(r'( he ?)|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip())
        ''' 
        s = row['Question'].lower()  
        row['Theme column'] = re.sub(r'( he )|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip()
        
exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,
4,Abraham Lincoln,Did his mother die of pneumonia?,no,did Abraham Lincoln mother die of pneumonia?
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",what did the legal tender act of 1862 establish?
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,when did the gettysburg address argue that ame...
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,


### Перенесем все неизмененные значения из столбца Question в Theme column. С последней и будем в дальнейшем работать

In [38]:
for index, row in exp.iterrows():
    #print(row['Theme column'])
    row['Theme column'] = row['Theme column'].lower() if row['Theme column'] != '' else row['Question'].lower() 
    row['Theme column'] =   row['Theme column'].replace('?','').strip() 
exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,was abraham lincoln the sixteenth president of...
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,did lincoln sign the national banking act of 1863
4,Abraham Lincoln,Did his mother die of pneumonia?,no,did abraham lincoln mother die of pneumonia
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,how many long was lincoln's formal education
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,when did lincoln begin his political career
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",what did the legal tender act of 1862 establish
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,who suggested lincoln grow a beard
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,when did the gettysburg address argue that ame...
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,did lincoln beat john c. breckinridge in the 1...
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,was abraham lincoln the first president of the...


# Перейдем к Токенизации для последующего TF-IDF и WB


In [39]:
text = ' '.join(exp['Theme column'])
tokens = re.findall(r"[\w']+", text.lower())
print('\nTokens:\n\n', tokens[:500])



Tokens:

 ['was', 'abraham', 'lincoln', 'the', 'sixteenth', 'president', 'of', 'the', 'united', 'states', 'did', 'lincoln', 'sign', 'the', 'national', 'banking', 'act', 'of', '1863', 'did', 'abraham', 'lincoln', 'mother', 'die', 'of', 'pneumonia', 'how', 'many', 'long', 'was', "lincoln's", 'formal', 'education', 'when', 'did', 'lincoln', 'begin', 'his', 'political', 'career', 'what', 'did', 'the', 'legal', 'tender', 'act', 'of', '1862', 'establish', 'who', 'suggested', 'lincoln', 'grow', 'a', 'beard', 'when', 'did', 'the', 'gettysburg', 'address', 'argue', 'that', 'america', 'was', 'born', 'did', 'lincoln', 'beat', 'john', 'c', 'breckinridge', 'in', 'the', '1860', 'election', 'was', 'abraham', 'lincoln', 'the', 'first', 'president', 'of', 'the', 'united', 'states', 'did', 'lincoln', 'start', 'his', 'political', 'career', 'in', '1832', 'did', 'lincoln', 'ever', 'represent', 'alton', 'sangamon', 'railroad', 'which', 'county', 'was', 'lincoln', 'born', 'in', 'when', 'did', 'lincoln', 'fi

### Счетчик слов:


In [40]:
term_freq_dict = Counter(tokens)
term_freq_tuple = [(key, value) for key, value in term_freq_dict.items()]
sorted_count_words =sorted(term_freq_tuple, key=lambda x: x[1], reverse=True)

In [41]:
words = list(map(lambda x: x[0], sorted_count_words)) 
count = list(map(lambda x: x[1], sorted_count_words)) 
dict_count = dict(zip(words, count)) 

py.iplot([go.Bar(x=list(dict_count.keys()), y=list(dict_count.values()))])


### В данном случае датасет имеет большое количество слов, поэтому график нуждается в увеличении. 
### При увеличении графика обнаружено, что самыми распространенными словами являются стандартные стоп-слова. Однако в эту же выборку попали слова, являющиеся важными для формулировки вопроса. Например, when и where. Если мы будем считать их стоп-словами и будем удалять, то вопросы "Когда родился Линкольн?" и "Где родился Линкольн?" станут идентичными. То есть, необходимо внимательнее присмотреться к потенциальным стоп-словам

In [42]:
Q1, Q2, Q3 = np.percentile(list(term_freq_dict.values()), [25, 50, 75])
IQR = Q3 - Q1
lower_inner_fence = Q1 - (1.5 * IQR)
lower_outer_fence = Q1 - (3 * IQR)
upper_inner_fence = Q3 + (1.5 * IQR)
upper_outer_fence = Q3 + (3 * IQR)
print(f'''upper_outer_fence:  {upper_outer_fence}, 
upper_inner_fence:  {upper_inner_fence},
lower_inner_fence:  {lower_inner_fence},
lower_outer_fence"  {lower_outer_fence}''')

upper_outer_fence:  9.0, 
upper_inner_fence:  6.0,
lower_inner_fence:  -2.0,
lower_outer_fence"  -5.0


## Теперь немного подправим список стоп-слов (но только не значащие). 
### В нашем случае (в английском языке) — это напр. артикли.
### Было решено считать за стоп слова  все стандарнтые + слова с частотой при токенизации >50.
#### Но с некоторыми исключениями: мы не удаляем уточняющие слова - what/where/e.t.c. (в финальной версии модели удаляем и их)
#### Более того, можно оставлять в списке только основную форму слова (напр. глагола become), поскольку данные в сете будут Лемматизированны.



#### Произведем изменение списка стоп-слов

In [43]:
my_stop_words = nlp.Defaults.stop_words
print('\nИзначальный набор\n',len(my_stop_words))
#добавление отловленных слов из списка токенизированных, кроме when/where, language 
my_stop_words|= {'the','of','is','a', 'was','to','are','and','do','for','does','have','as',
                 'an','are','many','he','she','it','on','his','her','by','with','that','this','most','at','there','short','long',
                '\'s','.','?','!',',','&'}
#удаление лишних слов.  Удаляем только одну форму - become (см выше)
my_stop_words-= {'amount', 'another', 'anything', 'become'}
'what',
'when',
'where',
'whereafter',
'which',
'who',
'whom',
'whose',
'why',


my_stop_words


Изначальный набор
 326


{'!',
 '&',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 ',',
 '.',
 '?',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'any',
 'anyhow',
 'anyone',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'fu

### Удаляем стоп-слова из сета

In [44]:
for index, row in exp.iterrows():
    new_row = ''
    for word in  row['Theme column'].split():
        if word not in my_stop_words:
            new_row += ' '+word
    row['Theme column']=new_row
exp.head(50)




Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,abraham lincoln sixteenth president united st...
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,lincoln sign national banking act 1863
4,Abraham Lincoln,Did his mother die of pneumonia?,no,abraham lincoln mother die pneumonia
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,lincoln's formal education
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,lincoln begin political career
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",legal tender act 1862 establish
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,suggested lincoln grow beard
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,gettysburg address argue america born
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,lincoln beat john c. breckinridge 1860 election
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,abraham lincoln president united states


# МЕШОК СЛОВ

In [45]:
full_tokens = set(' '.join(tokens).split())
vectors = [[sentence.count(token) for token in tokens]
           for sentence in tokens]
print([(i, word) for i, word in enumerate(full_tokens)])
#vectors

KeyboardInterrupt: 

In [None]:
test_sentence = vectors[1] #choose any
for i, sentence in enumerate(tokens):
    print(sentence)
    print(scipy.spatial.distance.cosine(test_sentence, vectors[i]))
    

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = list('|'.join(text.split()).split('|'))

vectorizer = CountVectorizer()
print( vectorizer.fit_transform(corpus).todense() )
print( vectorizer.vocabulary_ )

# Алгоритм Шинглов — поиск нечетких дубликатов текста


In [46]:
def genshingle(source):
    import binascii
    shingleLen = 3 #длина шингла - 3--5--7
    out = [] 
    for i in range(len(source)-(shingleLen-1)):
        out.append (binascii.crc32(' '.join( [x for x in source[i:i+shingleLen]] ).encode('utf-8')))

    return out

def compaire(source1,source2):
    same = 0
    for i in range(len(source1)):
        if source1[i] in source2:
            same = same + 1

    return same*2/float(len(source1) + len(source2))*100

def test():
    text1 = exp.iloc[0]["Theme column"] # Текст 1 для сравнения - abraham lincoln sixteenth president united states
    text2 = 'lincoln sixteenth president' # Текст 2 для сравнения - обработанный
    text3 = 'was abraham linсoln the 16\'th president of Ameriсa' # Текст 3 для сравнения -не очищенный
    text4 = 'barak obama' # Текст 4 - совсем другой

    cmp1 = genshingle(text1)
    cmp2 = genshingle(text2)
    cmp3 = genshingle(text3)
    cmp4 = genshingle(text4)

    print ('\n'+text1)
    print (text2)


    print ('\n pretty near '+str(compaire(cmp1,cmp2)))
    
    print ('\n'+text1)
    print (text3)
    print ('\n Not pretty near '+str(compaire(cmp1,cmp3)))
    
    print ('\n'+text1)
    print (text4)
    print ('\n Not near almost '+str(compaire(cmp1,cmp4)))

# Start program
test()


 abraham lincoln sixteenth president united states
lincoln sixteenth president

 pretty near 68.4931506849315

 abraham lincoln sixteenth president united states
was abraham linсoln the 16'th president of Ameriсa

 Not pretty near 50.0

 abraham lincoln sixteenth president united states
barak obama

 Not near almost 0.0


# TF, IDF и TF-IDF

In [47]:
def compute_tf(text):
    #Считаем частотность всех терминов во входном массиве с помощью 
    #метода Counter библиотеки collections, но уже в очищенной Theme column
    tf_text = collections.Counter(text)
    for i in tf_text:
        tf_text[i] = tf_text[i]/float(len(text))
    return tf_text
def compute_idf(word, corpus):
#на вход берется слово, для которого считаем IDF
#и корпус документов в виде списка списков слов
        #количество документов, где встречается искомый термин
        #считается как генератор списков
        return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i]))
def compute_idf_another(word, corpus):
    data = [Counter(i) for i in corpus if word in i]
    final_counter = Counter()
    for i in data:
        final_counter += i
    most_common_word = final_counter.most_common(1)[0][1]
    return math.log10(1 + (most_common_word/float(sum([1 for i in corpus if word in i]))))

def compute_tfidf(corpus):

    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list

def compute_tfidf_another(corpus):

    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf_another(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list


In [48]:
text = exp["Theme column"]
text_separated_by_word = list(map(lambda x: x.split(),'|'.join(text).split('|')))
tf=compute_tf(text)
#print(tf)
 

In [49]:
print(compute_idf('president',text_separated_by_word))


1.8656960599160706


In [50]:
tfidf=compute_tfidf(text_separated_by_word)

print(tfidf)

[{'abraham': 0.45679288721796174, 'lincoln': 0.3333991056355877, 'sixteenth': 0.5571362191059555, 'president': 0.3109493433193451, 'united': 0.35206139887624316, 'states': 0.356449555329968}, {'lincoln': 0.3333991056355877, 'sign': 0.4776160099860117, 'national': 0.3479241349220711, 'banking': 0.5571362191059555, 'act': 0.4406412183832856, '1863': 0.5571362191059555}, {'abraham': 0.5481514646615542, 'lincoln': 0.40007892676270534, 'mother': 0.46856346292714657, 'die': 0.41750896190648534, 'pneumonia': 0.6685634629271466}, {"lincoln's": 0.9552320199720234, 'formal': 0.9552320199720234, 'education': 0.8812824367665713}, {'lincoln': 0.5000986584533816, 'begin': 0.6244298186553691, 'political': 0.597143701299102, 'career': 0.7164240149790176}, {'legal': 0.5481514646615542, 'tender': 0.6685634629271466, 'act': 0.5287694620599428, '1862': 0.6685634629271466, 'establish': 0.5731392119832142}, {'suggested': 0.8357043286589333, 'lincoln': 0.5000986584533816, 'grow': 0.7164240149790176, 'beard':




In [51]:

tfidf_another=compute_tfidf_another(text_separated_by_word)

 
print(tfidf_another)

[{'abraham': 0.050171665943996864, 'lincoln': 0.050171665943996864, 'sixteenth': 0.050171665943996864, 'president': 0.052545072463098565, 'united': 0.050171665943996864, 'states': 0.050171665943996864}, {'lincoln': 0.050171665943996864, 'sign': 0.050171665943996864, 'national': 0.050171665943996864, 'banking': 0.050171665943996864, 'act': 0.050171665943996864, '1863': 0.050171665943996864}, {'abraham': 0.06020599913279624, 'lincoln': 0.06020599913279624, 'mother': 0.06020599913279624, 'die': 0.06020599913279624, 'pneumonia': 0.06020599913279624}, {"lincoln's": 0.10034333188799373, 'formal': 0.10034333188799373, 'education': 0.10034333188799373}, {'lincoln': 0.0752574989159953, 'begin': 0.0752574989159953, 'political': 0.0752574989159953, 'career': 0.0752574989159953}, {'legal': 0.06020599913279624, 'tender': 0.06020599913279624, 'act': 0.06020599913279624, '1862': 0.06020599913279624, 'establish': 0.06020599913279624}, {'suggested': 0.0752574989159953, 'lincoln': 0.0752574989159953, 'g




# Косинусное расстояние

In [52]:
#немного переделенные методы для tf-idf 
import operator


def tokenize(doc):
    words = [word.replace(',', '').lower() for word in doc.split()]
    return words


def build_terms(corpus):
    terms = {}
    current_index = 0
    for doc in corpus:
        for word in tokenize(doc):
            if word not in terms:
                terms[word] = current_index
                current_index += 1
    return terms


def tf(document, terms):
    words = tokenize(document)
    total_words = len(words)
    doc_counter = Counter(words)
    for word in doc_counter:
        # Можно и не делить, а оставить как есть, с частотой
        doc_counter[word] /= total_words
    tfs = [0 for _ in range(len(terms))]
    for term, index in terms.items():
        tfs[index] = doc_counter[term]
    return tfs


def _count_docs_with_word(word, docs):
    counter = 1
    for doc in docs:
        if word in doc:
            counter += 1
    return counter


def idf(documents, terms):
    idfs = [0 for _ in range(len(terms))]
    total_docs = len(documents)
    for word, index in terms.items():
        docs_with_word = _count_docs_with_word(word, documents)
        idf = 1 + math.log10(total_docs / docs_with_word)
        idfs[index] = idf
    return idfs


def _merge_td_idf(tf, idf, terms):
    return [tf[i] * idf[i] for i in range(len(terms))]


def build_tfidf(corpus, document, terms):
    doc_tf = tf(document, terms)
    doc_idf = idf(corpus, terms)
    return _merge_td_idf(doc_tf, doc_idf, terms)


def cosine_similarity(vec1, vec2):
    def dot_product2(v1, v2):
        return sum(map(operator.mul, v1, v2))

    def vector_cos5(v1, v2):
        prod = dot_product2(v1, v2)
        len1 = math.sqrt(dot_product2(v1, v1))
        len2 = math.sqrt(dot_product2(v2, v2))
        return prod / (len1 * len2)
         

    return vector_cos5(vec1, vec2)







In [53]:
tf_idf_total = []
corpus = (tuple(text))[:100]
terms = build_terms(corpus)

for document in corpus:
    tf_idf_total.append(build_tfidf(corpus, document, terms))

#for doc_rating in tf_idf_total:
    #print(doc_rating)
print(terms.keys())
query = 'president'
print("QUERY:",query )
query_tfidf = build_tfidf(corpus, query, terms)
for index, document in enumerate(tf_idf_total):
    print("Similarity with DOC", index, "=", cosine_similarity(query_tfidf, document))

dict_keys(['abraham', 'lincoln', 'sixteenth', 'president', 'united', 'states', 'sign', 'national', 'banking', 'act', '1863', 'mother', 'die', 'pneumonia', "lincoln's", 'formal', 'education', 'begin', 'political', 'career', 'legal', 'tender', '1862', 'establish', 'suggested', 'grow', 'beard', 'gettysburg', 'address', 'argue', 'america', 'born', 'beat', 'john', 'c.', 'breckinridge', '1860', 'election', 'start', '1832', 'represent', 'alton', 'sangamon', 'railroad', 'county', 'serve', 'assassinated', 'win', 'general', 'charge', 'battle', 'antietam', 'issue', 'emancipation', 'proclamation', 'scholars', 'rank', 'presidents', '18', 'months', 'schooling', 'chosen', 'presidential', 'candidate', 'old', '1816', 'photgraph', 'taken', 'trail', 'use', "farmers'", 'almanac', 'live', 'frontier', "wife's", 'family', 'support', 'slavery', 'noted', 'amedeo', 'avogadro', 'contributions', 'theory', 'molarity', 'molecular', 'weight', 'graduated', 'ecclesiastical', 'law', 'early', 'age', '20', 'began', 'prac

# Расстояние Ливенштейна

In [54]:
exmpl = exp.iloc[0]["Theme column"]
print(exmpl)
print(fuzz.token_sort_ratio(exmpl, 'lincoln sixteenth president'))
print(fuzz.token_set_ratio(exmpl, 'lincoln sixteenth president '))


 abraham lincoln sixteenth president united states
71
100


# Использование моделей для непосредственного поиска

In [55]:
input_example = 'Did the election of 1880 was won by Lincoln?'

#clear input
clear_input_exmpl=''
new_row = ''

#Stop words

for word in  input_example.split():
   
    if word not in my_stop_words:
        new_row += ' '+word
        clear_input_exmpl=new_row.strip() .lower().replace('?','')

print(clear_input_exmpl) 


did election 1880 won lincoln


## Левенштейн

In [56]:
levin_counter=0
levin_question=''
for question in text:
    if(fuzz.token_set_ratio(question, clear_input_exmpl)>=levin_counter):
        levin_counter=fuzz.token_set_ratio(question, clear_input_exmpl)
#       levin_counter=fuzz.token_sort_ratio(question, clear_input_exmpl)
        levin_question= question

        
print('it is most similar to:')
print(levin_question)
print('Levenshtein distance is:')
print(levin_counter)
print('Quesion And Answer are:')
exp.loc[exp['Theme column'] == levin_question, ['Question','Answer']]

it is most similar to:
 lincoln win election 1860
Levenshtein distance is:
85
Quesion And Answer are:


Unnamed: 0,Question,Answer
30,Did Lincoln win the election of 1860?,Yes


#### Поиск возможных ответов с пониженным расстоянием Левинштейна

In [57]:
possible_delta= 2
all_levenstain_distance = list((map(lambda x: fuzz.token_set_ratio(x, clear_input_exmpl),exp['Theme column'])))
                             #  >levin_counter-possible_delta,exp['Theme column']))

lev_possible_answers = list(map(lambda x: x>= levin_counter-possible_delta,all_levenstain_distance))
lev_possible_answers_with_distance = list(zip(exp['Theme column'][lev_possible_answers],
                                          exp['Answer'][lev_possible_answers],
                                          pd.Series(all_levenstain_distance)[lev_possible_answers]))
print('\nAlso possible questions are:')
lev_possible_answers_with_distance


Also possible questions are:


[(' lincoln win election 1860', 'Yes', 85)]

## Шинглы

In [58]:
shingle_counter=0
shingle_question=''
main_cmp = genshingle(clear_input_exmpl)
for question in text:
    tmp_cmp = genshingle(question)
    similatrity=compaire(main_cmp,tmp_cmp)
    if(similatrity>shingle_counter):
            shingle_counter=similatrity
            shingle_question=question
print('it is most similar to:')
print(shingle_question)
print('Shingle similarity procent is:')
print(shingle_counter)
print('Quesion And Answer are:')
exp.loc[exp['Theme column'] == shingle_question, ['Question','Answer']]

it is most similar to:
 lincoln win election 1860
Shingle similarity procent is:
66.66666666666666
Quesion And Answer are:


Unnamed: 0,Question,Answer
30,Did Lincoln win the election of 1860?,Yes


#### Поиск возможных ответов с пониженным показетелем схожести Шинглов

In [59]:
possible_delta= 0.05
all_shingle_distance = list((map(lambda x: compaire(main_cmp,genshingle(x)),exp['Theme column'])))
                           

Sh_possible_answers = list(map(lambda x: x>= shingle_counter-possible_delta,all_shingle_distance))
Sh_possible_answers_with_distance = list(zip(exp['Theme column'][Sh_possible_answers],
                                          exp['Answer'][Sh_possible_answers],
                                          pd.Series(all_shingle_distance)[Sh_possible_answers]))
print('\nAlso possible questions are:')
Sh_possible_answers_with_distance


Also possible questions are:


[(' lincoln win election 1860', 'Yes', 66.66666666666666)]

## TF-IDF

In [60]:
print('See is it a low coefficient in these words?:')
print('\n for Levenstain')
print(pd.Series(tfidf)[lev_possible_answers].values[0])
print(pd.Series(tfidf_another)[lev_possible_answers].values[0])
###
print('\n for Shingles')
print(pd.Series(tfidf)[Sh_possible_answers].values[0])
print(pd.Series(tfidf_another)[Sh_possible_answers].values[0])

See is it a low coefficient in these words?:

 for Levenstain
{'lincoln': 0.5000986584533816, 'win': 0.565909017147027, 'election': 0.6411665160630223, '1860': 0.6851893308269427}
{'lincoln': 0.0752574989159953, 'win': 0.0752574989159953, 'election': 0.0752574989159953, '1860': 0.0752574989159953}

 for Shingles
{'lincoln': 0.5000986584533816, 'win': 0.565909017147027, 'election': 0.6411665160630223, '1860': 0.6851893308269427}
{'lincoln': 0.0752574989159953, 'win': 0.0752574989159953, 'election': 0.0752574989159953, '1860': 0.0752574989159953}


In [61]:
clear_input_exmpl


'did election 1880 won lincoln'

# Проверка данных на тестовой выборке

## Очистка и обработка данных

In [62]:
test_answers_1 = pd.read_csv('test_questions.csv', sep=';', encoding='latin-1')

test_answers = pd.concat([test_answers_1])

test_exp=test_answers.copy()
test_exp['Theme column'] = ''
#remove words
for index, row in test_exp.iterrows():
    row['ArticleTitle'] = row['ArticleTitle'].replace('_', ' ')
    row['Answer'] = row['Answer'].lower()    
    title_words = list(map(lambda x: x.lower(),row['ArticleTitle'].split()))
    
    
    doc = nlp(row['Question'])
    infinitives = [token.lemma_ for token in doc]
      
    infinitives = list(map(lambda x: x.lower(), infinitives))
    has_title_in_question = any(map(lambda x: x in list(map(lambda x: x.lower(), infinitives)), title_words))
    
    if (not has_title_in_question):      
        s = row['Question'].lower()  
        row['Theme column'] = re.sub(r'( he )|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip()
#move all to Tc
for index, row in test_exp.iterrows():
    row['Theme column'] = row['Theme column'].lower() if row['Theme column'] != '' else row['Question'].lower() 
    row['Theme column'] =   row['Theme column'].replace('?','').strip() 
#Stop words
for index, row in test_exp.iterrows():
    new_row = ''
    for word in  row['Theme column'].split():
        if word not in my_stop_words:
            new_row += ' '+word
    row['Theme column']=new_row    
test_exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Did John C. Breckinridge was beated by Lincoln...,yes,john c. breckinridge beated lincoln 1860 elec...
1,Abraham Lincoln,Did Lincoln's political career was started in ...,yes,lincoln's political career started 1832
2,Abraham Lincoln,Did the election of 1880 was won by Lincoln?,yes,election 1880 won lincoln
3,Abraham Lincoln,For what reason the Emancipation Proclamation...,to free slaves,reason emancipation proclamation issued lincoln
4,Abraham Lincoln,Is Lincoln the first President of the United S...,no,lincoln president united states
5,Abraham Lincoln,Was pneumonia a cause of death of Lincoln's mo...,no,pneumonia cause death lincoln's mother
6,Abraham Lincoln,Was the National Banking Act of 1863 signed by...,yes,national banking act 1863 signed lincoln
7,Abraham Lincoln,When Lincoln's political career was started?,1832,lincoln's political career started
8,Alessandro Volta,Did Austria rule Lombard before 1796?,yes,austria rule lombard 1796
9,Alessandro Volta,Did electricity was Volta's passion?,yes,electricity volta's passion


# Для каждого способа составим вероятный вопрос
### В столбцах содержатся вероятные вопросы

In [63]:
test_exp['Levin_question'] = '' #предикт вопроса по левинштейну
test_exp['Levin_answer'] = ''
test_exp['Shingles_question'] = '' #предикт вопроса по шинглам
test_exp['Shingles_answer'] = ''

for index, row in test_exp.iterrows():
    
    #levin
    levin_counter=0
    levin_question=''
    for question in text:
        if(fuzz.token_set_ratio(question, row['Theme column'])>=levin_counter):
            levin_counter=fuzz.token_set_ratio(question, row['Theme column'])
            levin_question= question
    row['Levin_question'] = str(exp.loc[exp['Theme column'] == levin_question, ['Question']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    row['Levin_answer'] = str(exp.loc[exp['Theme column'] == levin_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    
    # shingle
    shingle_counter=0
    shingle_question=''
    main_cmp = genshingle(row['Theme column'])
    for question in text:
        tmp_cmp = genshingle(question)
        similatrity=compaire(main_cmp,tmp_cmp)
        if(similatrity>shingle_counter):
            shingle_counter=similatrity
            shingle_question=question
    #print(shingle_question)      
    row['Shingles_question'] = str(exp.loc[exp['Theme column'] == shingle_question, ['Question']].values[0]).replace('[\'','').replace('\']','').lower()
    row['Shingles_answer'] = str(exp.loc[exp['Theme column'] == shingle_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').lower()


### Поскольку cos-ое расстояние и список всех векторов требует неадекватно много времени:
## Сравним алгоритмы со стандартным-реализованным решением


In [64]:
import difflib

def similarity(s1, s2):
    normalized1 = s1.lower()
    normalized2 = s2.lower()
    matcher = difflib.SequenceMatcher(None, normalized1, normalized2)
    return matcher.ratio()


In [None]:
test_exp['difflib_question'] = '' #предикт вопроса по difflib
test_exp['difflib_answer'] = ''
difflib_counter=0
difflib_question=''
for index, row in test_exp.iterrows():
    for question in text:
        if(similarity(question, row['Theme column'])>=difflib_counter):
            difflib_counter=similarity(question, row['Theme column'])
            difflib_question= question
    row['difflib_question'] = str(exp.loc[exp['Theme column'] == difflib_question, ['Question']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()
    row['difflib_answer'] = str(exp.loc[exp['Theme column'] == difflib_question, ['Answer']].values[0]).replace('[\'','').replace('\']','').replace('[\"','').replace('\"]','').lower()


In [None]:
test_exp.head(50)

In [None]:
# 0 если ответы не сошлись, 1 если сошлись
test_exp['Levin_count'] = ''
test_exp['Shingle_count'] = ''
test_exp['difflib_count'] = ''
for index, row in test_exp.iterrows():
    
    if(row['Levin_answer']==row['Answer']):
        row['Levin_count']=1
    else:
        row['Levin_count']=0

    if(row['Shingles_answer']==row['Answer']):
        row['Shingle_count']=1
    else:
        row['Shingle_count']=0
    
    if(row['difflib_answer']==row['Answer']):
        row['difflib_count']=1
    else:
        row['difflib_count']=0


In [None]:
test_exp.head(50)

In [None]:
print('Левинштейн точность')
test_exp['Levin_count'].sum()/test_exp['Levin_count'].count()

In [None]:
print('Шинглы точность')
test_exp['Shingle_count'].sum()/test_exp['Shingle_count'].count()

In [None]:
print('difflib точность')
test_exp['difflib_count'].sum()/test_exp['difflib_count'].count()

In [32]:

trace1 = go.Bar(
    x=['Шинглы', 'Левинштейн', 'difflib'],
    y=[test_exp['Levin_count'].sum(), 14, 23],
    name='Успех'
)
trace2 = go.Bar(
    x=['Шинглы', 'Левинштейн', 'difflib'],
    y=[test_exp['Levin_count'].count()-test_exp['Levin_count'].sum(), 18, 29],
    name='Ошибка'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

NameError: name 'test_exp' is not defined

In [None]:
test_exp.to_csv('result.csv', sep=';')