In [1]:
import nltk
import zipfile
import random
import logging
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from final_config import FINAL_CONFIG
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
BOT_CONFIG = FINAL_CONFIG

In [2]:
dataset = []

for intent, intent_data in BOT_CONFIG["intents"].items():
    for example in intent_data["examples"]:
        dataset.append((example, intent))
        
corpus = [text for text, intent in dataset]
y = [intent for text, intent in dataset]

In [3]:
vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(corpus)

In [4]:
clf = SVC(probability=True)
clf.fit(X, y)

SVC(probability=True)

In [5]:
clf.score(X, y)

0.6032608695652174

In [6]:
def get_intent(text):
    proba_list = clf.predict_proba(vectorizer.transform([text]))[0]
    max_proba = max(proba_list)
    print(text, clf.predict(vectorizer.transform([text])), max_proba)
    if max_proba > 0.1:
        index = proba_list.argmax()
        return clf.classes_[index]

In [7]:
def get_response_by_intent(intent):
    return random.choice(BOT_CONFIG["intents"][intent]["responses"])    

In [8]:
with zipfile.ZipFile("dialogues.zip", "r") as zd:
    with zd.open('dialogues.txt') as dialogues:
        content = dialogues.read()

In [9]:
blocks = content.decode("utf8").split("\n\n")

In [10]:
dataset = []
questions = set()

In [11]:
def clear_query(text):
    alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя0123456789- "
    result = ''
    for c in text.lower():
        if c in alphabet:
            result += c
    return result

In [12]:
for block in blocks:
    replicas = block.split("\n")[:2]
    if len(replicas) == 2:
        question = clear_query(replicas[0][2:])
        answer = replicas[1][2:]
        if question and answer and question not in questions:
            questions.add(question)
            dataset.append((question, answer))

In [13]:
search_dataset = {}
for question, answer in dataset:
    words = question.split(" ")
    for word in words:
        if word in search_dataset:
            search_dataset[word].append((question, answer))
        else:
            search_dataset[word] = [(question, answer)]

In [14]:
search_dataset = {word: word_dataset 
                  for word, word_dataset 
                  in search_dataset.items() if len(word_dataset) < 5000
                 }

In [15]:
def get_response_generatively(text):
    text = clear_query(text)
    if not text:
        return None
    words = text.split(" ")
    
    words_dataset = set()
    for word in words:
        
        if word in search_dataset:
            words_dataset |= set(search_dataset[word])
            # print(word, len(search_dataset[word]))
            
    scores = []
            
    for question, answer in words_dataset:
        if abs(len(text) - len(question)) / len(question) >= 0.4:
            continue
        distance = nltk.edit_distance(text, question)
        score = distance / len(question)
        if score < 0.4:
            scores.append([score, question, answer])
    if scores:
        return min(scores, key=lambda s: s[0])[2]

In [16]:
def get_failure_phrase():
    return random.choice(BOT_CONFIG["failure_phrases"])

In [17]:
get_response_generatively("привет")

'Привет!'

In [18]:
stats = {"intent": 0, "generative": 0, "fails": 0}

In [19]:
def bot(request: str):
    # NLU
    intent = get_intent(request)
    
    # Generate answer
    if intent:
        stats["intent"] += 1
        return get_response_by_intent(intent)
    
    response = get_response_generatively(request)
    if response:
        stats["generative"] += 1
        return response
        
    stats["fails"] += 1
    return get_failure_phrase()

In [20]:
bot("сколько тебе лет")

сколько тебе лет ['country'] 0.047191427126573766


'Какая разница...'

In [21]:
stats

{'intent': 0, 'generative': 1, 'fails': 0}

In [22]:
bot("ты кто")

ты кто ['programm'] 0.10655026265008989


'Замечательно, а как ты?'

In [23]:
bot("как вас зовут")

как вас зовут ['emotion'] 0.048279541399502295


'Сестра Тереза.'

In [24]:
def start(update, context):
    """Send a message when the command /start is issued."""
    update.message.reply_text('Hi!')


def help_command(update, context):
    """Send a message when the command /help is issued."""
    update.message.reply_text('Help!')


def echo(update, context):
    """Echo the user message."""
    answer = bot(update.message.text)
    update.message.reply_text(answer)
    print()
    print(stats)


def main():
    updater = Updater("1221938593:AAHLhn-M7YmwrqmAb0CM53gXsBjgWa9VxC8", use_context=True)

    dp = updater.dispatcher


    dp.add_handler(CommandHandler("start", start))
    dp.add_handler(CommandHandler("help", help_command))


    dp.add_handler(MessageHandler(Filters.text & ~Filters.command, echo))

    updater.start_polling()

    updater.idle()

In [25]:
main()

Hi ['hello'] 0.23755943985979053

{'intent': 2, 'generative': 2, 'fails': 0}
Так ['country'] 0.11690790038052518

{'intent': 3, 'generative': 2, 'fails': 0}
123 ['hello'] 0.174130692069454

{'intent': 4, 'generative': 2, 'fails': 0}
Ты пидор ['emotion'] 0.0819318676050925

{'intent': 4, 'generative': 2, 'fails': 1}
Ты пидор ['emotion'] 0.0819318676050925

{'intent': 4, 'generative': 2, 'fails': 2}
Ты пидор ['emotion'] 0.0819318676050925

{'intent': 4, 'generative': 2, 'fails': 3}
Ты пидор ['emotion'] 0.0819318676050925

{'intent': 4, 'generative': 2, 'fails': 4}
Скачать читы кс го ['country'] 0.09662825901592761

{'intent': 4, 'generative': 2, 'fails': 5}
Что такое баунтичитс ['country'] 0.05978427942997066

{'intent': 4, 'generative': 3, 'fails': 5}
Как наваливать на каршеринге ['emotion'] 0.0759599413429209

{'intent': 4, 'generative': 3, 'fails': 6}
Где купить некруху ['country'] 0.06455189988187737

{'intent': 4, 'generative': 3, 'fails': 7}
Хочу бмв ['country'] 0.3536356642462521
