In [12]:
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import string
import annoy
from gensim.models import Word2Vec, FastText

import os
from telegram.ext  import Updater, CommandHandler, MessageHandler, Filters, CallbackContext
from telegram import Update

import re

import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Создаем режим болталки

In [14]:
# Создаем функции препроцессинга текста

morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_short_words(text):
    return ' '.join([w for w in text.split() if len(w)>2])

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_urls(text):
    return re.sub(r'http\S+|www.\S+', '', text)

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_non_alpha(text):
    return re.sub(r'\W+', ' ', text)

def remove_repeated_characters(text):
    return re.sub(r'(.)\1+', r'\1', text)

def preprocess_txt(line):
    line = remove_numbers(line)
    line = remove_short_words(line)
    line = remove_html_tags(line)
    line = remove_urls(line)
    line = remove_extra_spaces(line)
    line = remove_emoji(line)
    line = remove_non_alpha(line)
    line = remove_repeated_characters(line)

    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]

    return spls

Процесс обработки материалов

In [15]:
# assert False

#Small preprocess of the answers

question = None
written = False

with open("../Less_3_word2vec/prepared_answers.txt", "w") as fout:
    with open("../Less_3_word2vec/Otvety.txt", "r") as fin:
        for line in tqdm(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

0it [00:00, ?it/s]

In [16]:
assert True

# Preprocess for models fitting

sentences = []

c = 0

with open("../Less_3_word2vec/Otvety.txt", "r") as fin:
    for line in tqdm_notebook(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)
        c += 1
        if c > 500000:
            break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

In [17]:
sentences = [i for i in sentences if len(i) > 2]
modelFT = FastText(sentences=sentences, size=100, min_count=1, window=5)
modelFT.save("ft_model")

In [18]:
modelFT = FastText.load("ft_model")
ft_index = annoy.AnnoyIndex(100 ,'angular')

index_map = {}
counter = 0

with open("../Less_3_word2vec/prepared_answers.txt", "r") as f:
    for line in tqdm_notebook(f):
        n_ft = 0
        spls = line.split("\t")
        index_map[counter] = spls[1]
        question = preprocess_txt(spls[0])
        vector_ft = np.zeros(100)
        for word in question:
            if word in modelFT.wv:
                vector_ft += modelFT.wv[word]
                n_ft += 1
        if n_ft > 0:
            vector_ft = vector_ft / n_ft
        ft_index.add_item(counter, vector_ft)
            
        counter += 1

        if counter > 500000:
            break

ft_index.build(10)
ft_index.save('speaker.ann')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(f):


0it [00:00, ?it/s]

True

In [19]:
ft_index = annoy.AnnoyIndex(100, 'angular')
ft_index.load('speaker.ann') 

True

In [20]:
ft_index.get_nns_by_vector(np.zeros(100), 2)

[503, 3359]

In [21]:
def embed_txt(txt, idfs, midf):
    n_ft = 0
    vector_ft = np.zeros(100)
    for word in txt:
        if word in modelFT.wv:
            vector_ft += modelFT.wv[word] * 1 # idfs.get(word, midf)
            n_ft += 1 # idfs.get(word, midf)
    return vector_ft / n_ft

### Создаем режим перевода текста с английского на русский

In [22]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "Helsinki-NLP/opus-mt-en-ru"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Downloading pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [23]:
def translate_english_to_russian(text):
    # Encode the text
    inputs = tokenizer.encode(text, return_tensors="pt")

    # Generate the translated text
    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)

    # Decode the outputs into a readable string
    translated_text = tokenizer.decode(outputs[0])

    return translated_text

### Создаем режим прогноза погоды

In [24]:
# функция для запроса прогноза погоды

import requests 
import json

key = '9ead4b09f8ab46fb9ce105955220407' # Токен https://www.weatherapi.com/

def get_weather_forecast(city):
    url = f'http://api.weatherapi.com/v1/current.json?key={key}&q={city}&lang=ru'
    requests.get(url)
    response = requests.get(url)
    json_data = json.loads(response.text)

    city = json_data['location']['name']
    time = json_data['location']['localtime'].split(' ')[1]
    temp = json_data['current']['temp_c']
    temp_fl = json_data['current']['feelslike_c']
    condition = json_data['current']['condition']['text']
    wind = json_data['current']['wind_mph']
    uv = json_data['current']['uv']

    msg = f'В городе {city} сейчас {time}. О погоде: {condition}, температура {temp} градусов, ощущается как {temp_fl} градусов, скорость ветра {wind} м/ч, уровень ультрафиолета {uv}'

    return msg

In [31]:
# !python -m spacy download ru_core_news_sm

In [30]:
# Определение города

import spacy

nlp = spacy.load("ru_core_news_sm")

def recognize_location(text): 
    locs = []
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'LOC':
            locs.append(ent.text)
    locs = [morpher.parse(i.lower())[0].normal_form for i in locs]
    return locs

In [33]:
# Погода в запрашиваемом городе

def forecast(text):
    city = recognize_location(text)
    try:
        forecast = get_weather_forecast(city)
    except:
        forecast = "Не понятен запрос"
    
    return forecast

### Процедура определения типа запроса 

In [34]:
# Проверяем язык текста

import re 

def english_check(text):
    return bool(re.search(r'[a-zA-Z]', text))

In [35]:
import re

# Регулярное выражение для определения запроса о погоде
weather_pattern = re.compile(r'\b(погод[ауеы]|солнечн[аоеымй]|осадк[аиуеы]|дожд[аиуеымй]|снег[ауеым]|ливн[аеуиы]|зонт[ауеым])\b', re.IGNORECASE)

# Регулярное выражение для определения запроса на перевод
translation_pattern = re.compile(r'\b(перевод[ауеымч]|английск[ауеымий])\b', re.IGNORECASE)

In [44]:
# Функция для проверки есть ли совпадение со списком ключевых слов

def pattern_search(text, pattern):
    for word in text:
        if pattern.search(str(text)):
            return True
            break
    return False

### Работа самого бота

In [48]:
updater = Updater(token='YOUR_TOKEN') # Токен API к Telegram
dispatcher = updater.dispatcher

def startCommand(update: Update, context: CallbackContext):
    update.message.reply_text('Доброго времени суток!')
# Определеляем тип запроса пользования и ответа на него

def textMessage(update: Update, context: CallbackContext):
    input_txt = preprocess_txt(update.message.text)
    
    # Перевод 
    if english_check(update.message.text) == True:
        update.message.reply_text(translate_english_to_russian(update.message.text))
    elif pattern_search(input_txt, translation_pattern) == True:
        update.message.reply_text("Какое предложение вам перевести?") 
        
    # Прогноз погоды    
    elif pattern_search(input_txt, weather_pattern) == True:        
        update.message.reply_text(forecast(update.message.text))
            
    # Болталка
    else: 
        vect_ft = embed_txt(input_txt, {}, 1)
        ft_index_val, distances = ft_index.get_nns_by_vector(vect_ft, 1, include_distances=True)
        if distances[0] > 0.35:
            print(distances[0])
            update.message.reply_text("Не понимаю тебя")
        else:
            update.message.reply_text(index_map[ft_index_val[0]])
            
    return


start_command_handler = CommandHandler('start', startCommand)
text_message_handler = MessageHandler(Filters.text, textMessage)
dispatcher.add_handler(start_command_handler)
dispatcher.add_handler(text_message_handler)
updater.start_polling(clean=True)
updater.idle()