In [2]:
import pandas as pd
import re
#import pymorphy2
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


# Чтение данных
Сет распределен по нескольким файлам, поэтому необходимо их объединить.
Сет содержит данные о сложности вопроса по мнению спрашивающего и отвечающего и номер статьи, из которой взят ответ - нам это не нужно

In [3]:
answers1 = pd.read_csv('S08_question_answer_pairs.txt', sep='\t')
answers2 = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')
answers3 = pd.read_csv('S10_question_answer_pairs.txt', sep='\t', encoding='latin-1')

answers = pd.concat([answers1, answers2, answers3])
answers.drop(labels=['DifficultyFromQuestioner', 'DifficultyFromAnswerer', 'ArticleFile'], axis=1, inplace=True)
answers.head(20)

Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.


In [4]:
print('Наличие null:', answers.isnull().values.any())
print('Наличие NaN:', answers.isna().values.any())

Наличие null: True
Наличие NaN: True


# Первичная обработка данных
## Обработка дубликатов и пустых значений

1. Сет содержит вопросы без ответа
2. Сет содержит повторяющиеся вопросы 

Дубликаты неоходимо удалить - они не несут дополнительной смысловой нагрузки, но увеличивают объем для обработки.
Пары с пустыми значениями (без ответа) не могут принеси пользы для модели + нельзя выполнить замену для пустых начени, поскольку необходимо знать правилный ответ. В результате остается только удалить такие вопросы.

Сначала удаляем вопросы без ответа, затем повторяющиеся вопросы

In [5]:
print('Размер начального сета', answers.size)

#предполагается, что нам необходимо заполнение всех полей
not_null_answers = answers.dropna(axis=0, how='any')
print('Размер сета без пустых значений', not_null_answers.size)

# при удалении дупликатов оставляем только первое вхождение
no_duplicates_answers = not_null_answers.drop_duplicates(subset='Question', keep='first')
print('Размер сета без дубликатов', no_duplicates_answers.size)
no_duplicates_answers.head(20)


Размер начального сета 11991
Размер сета без пустых значений 10263
Размер сета без дубликатов 6606


Unnamed: 0,ArticleTitle,Question,Answer
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes
4,Abraham_Lincoln,Did his mother die of pneumonia?,no
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832
10,Abraham_Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren..."
12,Abraham_Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell
14,Abraham_Lincoln,When did the Gettysburg address argue that Ame...,1776
16,Abraham_Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes
18,Abraham_Lincoln,Was Abraham Lincoln the first President of the...,No


In [5]:
print('Наличие null:', no_duplicates_answers.isnull().values.any())
print('Наличие NaN:', no_duplicates_answers.isna().values.any())

Наличие null: False
Наличие NaN: False


## Обработка притяжательных местоимений

1. В сете содержатся указания на одушевленный предмет статьи -his/her, he/she 
#####    (напр. Was his (Alessandro_Volta) 1800 paper written in French ?)
2. В сете содержатся указания на неодушевленный предмет статьи -this
#####    (напр. 	What connected the Akans to this (Ghana) Empire?)

Производится замена местоимений на тему, однако только в тех случаях, когда тема не содержится в вопросе и это местоимение является явным на неё указателем: 
##### (напр. "When did Lincoln begin his political career?").
В данном случае замена не производится.

In [46]:
nlp = spacy.load('en_core_web_sm')

#no_duplicates_answers['Question'] =no_duplicates_answers['Question'].replace(regex=[r' his | her | this '], 
#                                                                           value=' замена ')
exp = no_duplicates_answers.copy()
exp['Theme column'] = ''
for index, row in exp.iterrows():
    row['ArticleTitle'] = row['ArticleTitle'].replace('_', ' ')
        
    title_words = list(map(lambda x: x.lower(),row['ArticleTitle'].split()))
    
    #question_words = re.findall(r"[a-zA-Z]+-[a-zA-Z]+|[a-zA-Z]+\'[a-zA-Z]+|[a-zA-Z]+", row['Question'])
    
    doc = nlp(row['Question'])
    infinitives = [token.lemma_ for token in doc]
    '''
    infinitives = []
    morph = pymorphy2.MorphAnalyzer()
    for word in question_words:
        word = word.lower()
        infinitives.append(morph.parse(word)[0].normal_form)        
    '''    
    
    infinitives = list(map(lambda x: x.lower(), infinitives))
    has_title_in_question = any(map(lambda x: x in list(map(lambda x: x.lower(), infinitives)), title_words))
    
    if (not has_title_in_question):      
        '''
        print(title_words)
        print(infinitives)
        print(row['ArticleTitle'], row['Question'])
        s = row['Question'].lower()        
        print(re.sub(r'( he ?)|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip())
        ''' 
        s = row['Question'].lower()  
        row['Theme column'] = re.sub(r'( he )|( his )|( her )|( its )]', ' '+ row['ArticleTitle'] + ' ', ' ' + s + ' ').strip()
        
exp.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,Theme column
0,Abraham Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,
2,Abraham Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,
4,Abraham Lincoln,Did his mother die of pneumonia?,no,did Abraham Lincoln mother die of pneumonia?
6,Abraham Lincoln,How many long was Lincoln's formal education?,18 months,
8,Abraham Lincoln,When did Lincoln begin his political career?,1832,
10,Abraham Lincoln,What did The Legal Tender Act of 1862 establish?,"the United States Note, the first paper curren...",what did the legal tender act of 1862 establish?
12,Abraham Lincoln,Who suggested Lincoln grow a beard?,11-year-old Grace Bedell,
14,Abraham Lincoln,When did the Gettysburg address argue that Ame...,1776,when did the gettysburg address argue that ame...
16,Abraham Lincoln,Did Lincoln beat John C. Breckinridge in the 1...,yes,
18,Abraham Lincoln,Was Abraham Lincoln the first President of the...,No,


In [48]:
for index, row in exp.iterrows():
    print(row['Theme column'])



did Abraham Lincoln mother die of pneumonia?


what did the legal tender act of 1862 establish?

when did the gettysburg address argue that america was born?








who was the general in charge at the battle of antietam?










who is most noted for Amedeo Avogadro contributions to the theory of molarity and molecular weight?
who graduated in ecclesiastical law at the early age of 20 and began to practice?
when did Amedeo Avogadro publish another memoria?
when did Amedeo Avogadro become a professor?
is it true that Amedeo Avogadro became a professor in 1820?


is Amedeo Avogadro most noted for Amedeo Avogadro contributions to the theory of molarity and molecular weight?


did the scientific community not reserve great attention to Amedeo Avogadro theory ?
can the title of this famous 1811 paper be roughly translated into english?
what happened in 1833?
who determined the dependence of the boiling of water with atmospheric pressure?
what is named after him?
when did Anders Celsiu

did the election of 1800 not become a bitter and volatile battle , with each side expressing extraordinary fear of the other party and its policies ?
what happened in 1764?


was james cook the first to record the name "kangooroo?"

where do joeys complete postnatal development?


what is responsible for converting the hydrogen byproduct of fermentation into acetate?





what are vehicles that frequent isolated roads often fitted with?




are kangaroos shy?
what method of locomotion do kangaroos use?

what is a roo?

what method is used by kangaroos to travel?
who asked a nearby local what the creatures were called?





have kangaroos dazzled by headlights or startled by engine noise been known to leap in front of cars ?
























what resembles that of the similarly-sized cougar in the americas?
what was one of the many species described in linnaeus's 18th-century work, systema naturae?
what sort of cats are solitary?


what centred in sierra?
felis pardus was what?
h

is it the second smallest independent country in south america , larger only than suriname and the french overseas department of french guiana?
is it a constitutional democracy , where the president fulfills the roles of both head of state and head of government?


did the u.s. join the league of nations?
where was the league of nations created?




what was more damaging than moving students into colleges?









what was scots-irish and scottish?
what defended slavery, owned slaves and s08_set up a sunday school for them?



what lived in columbia?

did Woodrow Wilson not cast Woodrow Wilson ballot for john m. palmer , the presidential candidate of the national democratic party , or gold democrats , a short-lived party that supported a gold standard , low tariffs , and limited government ?


what happened in 1917?






is it a disadvantage for something to be unsafe to handle?
was lombardy under napoleon's rule in 1800?
was the italian 10.000 lira banknote created before the euro?

does the united states have a base near glasgow?




what is the si unit measuring magnetic flux density or magnetic induction?


who was the victor of the "war of currents"?








what are carleton university's athletic teams called?



was there a cholera outbreak in 1832?





where were immigrants from in the 1800s?





















is a third  prefix the object  prefix?

are vowels never reduced , regardless of stress ?




what is the irish legend of the children of lir about?

which album was the song "the bonny swans" from?









what is the name of an adult female?





















do terrestrial tortoises have short feet?

where are the only surviving giant tortoises?

are tortoises land based?


are testudines the crown group of the superorder chelonia?


approximately how many species of testudines are alive today?

what was the largest ever chelonian?


are violinists and fiddlers the same thing?










were ancient mallets made of copper?

what is the earl

does the charge reside on the interior of a charged conductor?


when did filippo strozzi sell it to francis i?
when was Michelangelo house demolished?
was michelangelos mother francesca di neri del miniato di siena?
give an example of the most renowned works of the renaissance.









what is the largest primarily french-speaking city in the western world?






how many civilians died in the 1998 u.s. embassy bombing?
how many trades can the nse make per day?

the district is bordered to the southwest by uhuru park and where?
when did the catholic university of eastern africa which obtain its letter of interim authority?
is central park adjacent to uhuru park?
give an example of the highest growth rates of any city in africa.
give an example of the most prominent cities in africa politically and financially.




















consequently, what is not considered a "serious painter" by some contemporary artists, who often regard Norman Rockwell work as bourgeois and kitsch?
who s