In [86]:
import zipfile
import nltk

In [87]:
with zipfile.ZipFile("dialogues.zip", "r") as zd:
    with zd.open('dialogues.txt') as dialogues:
        content = dialogues.read()

In [88]:
blocks = content.decode("utf8").split("\n\n")

In [89]:
blocks[0]

'- Пока, толстуха!\n- Пока, малышка!'

In [90]:
dataset = []
questions = set()

In [91]:
def clear_query(text):
    alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя0123456789- "
    result = ''
    for c in text.lower():
        if c in alphabet:
            result += c
    return result

In [92]:
for block in blocks:
    replicas = block.split("\n")[:2]
    if len(replicas) == 2:
        question = clear_query(replicas[0][2:])
        answer = replicas[1][2:]
        if question and answer and question not in questions:
            questions.add(question)
            dataset.append((question, answer))

In [93]:
print(len(dataset))

604680


In [94]:
dataset[0]

('пока толстуха', 'Пока, малышка!')

In [95]:
search_dataset = {}
for question, answer in dataset:
    words = question.split(" ")
    for word in words:
        if word in search_dataset:
            search_dataset[word].append((question, answer))
        else:
            search_dataset[word] = [(question, answer)]

In [96]:
print(len(search_dataset))

206918


In [97]:
search_dataset = {word: word_dataset 
                  for word, word_dataset 
                  in search_dataset.items() if len(word_dataset) < 5000
                 }

In [136]:
def get_response_generatively(text):
    text = clear_query(text)
    if not text:
        return None
    words = text.split(" ")
    
    words_dataset = set()
    for word in words:
        
        if word in search_dataset:
            words_dataset |= set(search_dataset[word])
            # print(word, len(search_dataset[word]))
            
    scores = []
            
    for question, answer in words_dataset:
        if abs(len(text) - len(question)) / len(question) >= 0.4:
            continue
        distance = nltk.edit_distance(text, question)
        score = distance / len(question)
        if score < 0.4:
            scores.append([score, question, answer])
    if scores:
        return min(scores, key=lambda s: s[0])[2]

In [137]:
get_response_generatively("Привет")

'Привет!'

In [138]:
get_response_generatively("Пока")

'Что насчет нее?'

In [139]:
get_response_generatively("Сколько тебе лет?")

'Какая разница...'

In [140]:
get_response_generatively("Ты мальчик или девочка?")

'Кобыла.'

In [141]:
get_response_generatively("скажи свое имя")

'Луиза.'