In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import random
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import time
from dateutil import parser
from transformers import AutoModel
from transformers import AutoTokenizer
import torch
import pickle
import telebot
from telebot import types
import time

[nltk_data] Downloading package punkt to /home/linux/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_soup(link: str):
    contents = requests.get(link).text
    soup = BeautifulSoup(contents, 'lxml')
    return soup

In [3]:
def get_google_research_publications():
    publications = requests.get(
        "https://research.google/static/data/publications-40ebbba8cb2c591bd21b031bcd13e1d58f8a60e2055cda8775f8600d031f884a.json")
    publications = publications.json()["publications"]
    meta_publications = []
    for publication in publications:
        venue_text = BeautifulSoup(publication["venue_html"], 'lxml').text
        try:
            publication_data = {
                "bibtex": publication["bibtex"],
                "title": publication["title"],
                "venue_text": venue_text,
                "time_release": str(publication["year"]),
                "tag_pks": publication["tag_pks"],
                "abstract": publication["abstract"],
                "link": f"https://research.google/pubs/{publication['filename_html'].split('.')[0]}"
            }
            meta_publications.append(publication_data)
        except Exception as e:
            pass
    return meta_publications

In [4]:
def get_deepmind_research_publications():
    publications = requests.get("https://www.deepmind.com/publications/rss.xml")
    publications = BeautifulSoup(publications.content, 'lxml')
    publications = publications.find_all("item")
    
    meta_publications = []
    for publication in publications:
        publication_data = {
            "title": publication.find("title").text,
            "link": publication.find("guid").text,
            "time_release": publication.find("pubdate").text,
            "description": publication.find("description").text,
        }
        meta_publications.append(publication_data)
    return meta_publications

In [5]:
def get_yandex_research_publications():
    publications = requests.get(
        "https://research.yandex.com/api/publications-filtered?pagination[start]=0&pagination[limit]=1000")
    publications_json = publications.json()["data"]

    meta_publications = []
    for publication in publications_json:
        authors, organizations = [], set()
        for author in publication["authors"]:
            authors.append(author["name"])
            for key in author:
                if key.startswith("captionString"):
                    if author[key]:
                        organizations.add(author[key])

        organizations = Counter(organizations).most_common()
        organizations = [i[0] for i in organizations]
        research_areas = [area["title"] for area in publication["researchAreas"]]

        publication_data = {
            "title": publication["title"],
            "abstract": publication["abstract"],
            "time_release": publication["publicationDate"],
            "slug": publication["slug"],
            "downloadUrl": publication["downloadUrl"],
            "researchAreas": research_areas,
            "authors": authors,
            "organizations": organizations,
            "venue": publication["venue"],
            "link": f"https://research.yandex.com/publications/{publication['slug']}"
        }
        meta_publications.append(publication_data)

    return meta_publications


def get_yandex_research_posts():
    posts = requests.get(
        "https://research.yandex.com/api/posts-filtered?pagination[start]=0&pagination[limit]=1000")
    posts_json = posts.json()["data"]

    meta_posts = []
    for publication in posts_json:
        research_areas = [area["title"] for area in publication["researchAreas"]]

        publication_data = {
            "title": publication["title"],
            "time_release": publication["publicationDate"],
            "slug": publication["slug"],
            "researchAreas": research_areas,
            "type": publication["type"],
            "seoDescription": publication["seoDescription"],
            "link": f"https://research.yandex.com/blog/{publication['slug']}"
        }
        meta_posts.append(publication_data)

    return meta_posts


In [6]:
# ------------- huggingface

def get_article_link_hf(soup):
    prettify_html = soup.prettify()
    start, stop = re.search(r'href=".*"', prettify_html).span()
    return f"https://huggingface.co{prettify_html[start:stop][6:-1]}"


def soup_to_embed_hf(soup):
    results = []
    for element in soup.find("div", {"class": "grid grid-cols-1 gap-12 pt-12 lg:grid-cols-2"}).find_all("a"):
        article_link = get_article_link_hf(element)
        try:
            temp = {"time_release": element.find_all("span")[-1].text,
                    "title": element.h2.text.strip(),
                    "article_link": article_link,
                    "images_links": [f'https://huggingface.co{element.img["src"]}']}
            results.append(temp)
        except: pass
    return results

# ------------- google ai

def get_images_links_google_ai(soup):
    results = []
    for i in soup.find_all("a"):
        try:
            if "https://blogger.googleusercontent.com/img" in i["href"]:
                if i["href"].rsplit(".", 1)[1] in ["jpg", "png"]:
                    results.append(i["href"])
        except: pass
    return results


def get_description_google_ai(soup) -> str:
    elements = [i.text.strip() for i in soup.find_all("p")]
    element = sorted(elements, key=lambda x: len(x))
    text = sent_tokenize(element[-2], language="english")[:2]
    return "\n".join(text)


def soup_to_embed_google_ai(soup):
    results = []
    for element in soup.find_all("div", {"class": "post"}):
        link = element.a["href"]
        element_soup = get_soup(link)
        temp = {"time_release": element.span.text.strip(),
                "title": element.a.text.strip(),
                "description": get_description_google_ai(element_soup),
                "images_links": get_images_links_google_ai(element_soup),
                "article_link": link,
                "labels": [j.text.strip() for j in element.find_all("a", {"class": "label"})]}
        results.append(temp)
    return results

# ------------- pytorch

def soup_to_embed_pytorch(soup):
    results = []
    for element in soup.find("div", {"class": "row blog-vertical"}).find_all("div", {"class": "col-md-4"}):
        temp = {"time_release": element.p.text,
                "title": element.a.text,
                "description": element.find_all("p")[1].text.strip(),
                "article_link": f'https://pytorch.org{element.a["href"]}'}
        results.append(temp)
    return results

# ------------- open ai

def get_text_page_open_ai(link):
    contents = requests.get(link).text
    soup = BeautifulSoup(contents, 'lxml')
    return [i.text for i in soup.find_all("p") if i.text]


def soup_to_embed_open_ai(soup):
    results = []
    for element in soup.find_all("div", {"class": "post-card-full medium-xsmall-copy"}):
        try:
            article_link = f'https://openai.com{element.a["href"]}'
            temp = {"time_release": element.time.text,
                    "title": element.h5.text,
                    "description": get_text_page_open_ai(article_link)[0],
                    "article_link": article_link,
                    "tag": element.ul.a.text if element.ul else None}
            results.append(temp)
        except: pass
    return results

# ------------- nvidia-research

def get_nvidia_research_publications(soup):
    data = []
    for element in soup.find_all("div", {"class": "views-row"}):
        try:
            info = {
                "link": f"https://research.nvidia.com{element.a['href']}",
                "authors": element.find("div", {"class": "field-content"}).text,
                "published-in-link": element.find("div", {"class": "views-field views-field-field-published-in"}).a["href"],
                "published-in-conf": element.find("div", {"class": "views-field views-field-field-published-in"}).a.text,
                "time_release": element.a['href'].split("_")[0].split("/")[-1] + "-01",
                "title": element.a.text
            }
            data.append(info)
        except: pass
    return data

In [7]:
def prepare_data(data):
    idx = 0
    for _ in range(len(data)):
        try:
            parser.parse(data[idx]["time_release"])
            data[idx]["time_release_new"] = parser.parse(data[idx]["time_release"])
            data[idx]["time_release_new"] = time.mktime(data[idx]["time_release_new"].timetuple())
            idx += 1
        except:
            del data[idx]
    data = sorted(data, key=lambda x: x["time_release_new"], reverse=True)
    return data


def get_blog_posts():
    yandex_research_posts = get_yandex_research_posts()
    yandex_research_posts = prepare_data(yandex_research_posts)

    openai_posts_soup = get_soup("https://openai.com/blog/")
    openai_posts = soup_to_embed_open_ai(openai_posts_soup)
    openai_posts = prepare_data(openai_posts)

    hf_posts_soup = get_soup("https://huggingface.co/blog/")
    hf_posts = soup_to_embed_hf(hf_posts_soup)
    hf_posts = prepare_data(hf_posts)

    pytorch_posts_soup = get_soup("https://pytorch.org/blog/")
    pytorch_posts = soup_to_embed_pytorch(pytorch_posts_soup)
    pytorch_posts = prepare_data(pytorch_posts)

    return yandex_research_posts, openai_posts, hf_posts, pytorch_posts


def get_research_posts():
    nvidia_research_posts_soup = get_soup("https://research.nvidia.com/research-area/machine-learning-artificial-intelligence")
    nvidia_research_posts = get_nvidia_research_publications(nvidia_research_posts_soup)
    nvidia_research_posts = prepare_data(nvidia_research_posts)

    google_ai_posts_soup = get_soup("https://responsible-ai-developers.googleblog.com/")
    google_ai_posts = soup_to_embed_google_ai(google_ai_posts_soup)
    google_ai_posts = prepare_data(google_ai_posts)

    yandex_research_posts = get_yandex_research_posts()
    yandex_research_posts = prepare_data(yandex_research_posts)

    deepmind_research_publications = get_deepmind_research_publications()
    deepmind_research_publications = prepare_data(deepmind_research_publications)

    google_research_publications = get_google_research_publications()
    google_research_publications = prepare_data(google_research_publications)

    return nvidia_research_posts, google_ai_posts, yandex_research_posts, deepmind_research_publications, google_research_publications

In [8]:
def make_nvidia_post(nvidia_research_posts):
    nvidia_research_posts = nvidia_research_posts[:5]
    text = "*Latest NVIDIA Research publications*:\n\n"
    for post in nvidia_research_posts:
        paragraph = [f"*{post['title']}*",
                     f"Сonference/Venue: {post['published-in-conf']}",
                     f"[Published URL]({post['published-in-link']}) | [Read now]({post['link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_google_ai_post(google_ai_research_posts):
    google_ai_research_posts = google_ai_research_posts[:3]
    text = "*Latest Google AI Research publications*:\n\n"
    for post in google_ai_research_posts:
        paragraph = [f"*{post['title']}*",
                     post["description"],
                     " | ".join(post["labels"]),
                     f"[Read now]({post['article_link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_yandex_research_post(yandex_research_research_posts):
    yandex_research_research_posts = [i for i in yandex_research_research_posts if i["type"] != "announcement"][:3]
    text = "*Latest Yandex Research publications*:\n\n"
    for post in yandex_research_research_posts:
        paragraph = [f"*{post['title']}*",
                     post["seoDescription"],
                     post["type"] + " " + " | ".join(post["researchAreas"]),
                     f"[Read now]({post['link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_deepmind_research_post(deepmind_research_research_posts):
    deepmind_research_research_posts = deepmind_research_research_posts[:3]
    text = "*Latest DeepMind Research publications*:\n\n"
    for post in deepmind_research_research_posts:
        paragraph = [f"*{post['title']}*",
                     post["description"],
                     f"[Read now]({post['link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_google_research_post(google_research_research_posts):
    google_research_research_posts = google_research_research_posts[:3]
    text = "*Latest Google Research publications*:\n\n"
    for post in google_research_research_posts:
        paragraph = [f"*{post['title']}*",
                     post["abstract"],
                     f"Сonference/Venue: {post['venue_text']}",
                     " | ".join(post["tag_pks"]),
                     f"[Read now]({post['link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()

In [9]:
def make_yandex_research_blog_post(yandex_research_research_blog):
    yandex_research_research_blog = [i for i in yandex_research_research_blog if i["type"] == "announcement"][:3]
    text = "*Latest Yandex Research Blog*:\n\n"
    for post in yandex_research_research_blog:
        paragraph = [f"*{post['title']}*",
                     post["seoDescription"],
                     post["type"] + " " + " | ".join(post["researchAreas"]),
                     f"[Read now]({post['link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_openai_blog_post(openai_research_blog):
    openai_research_blog = openai_research_blog[:3]
    text = "*Latest Open AI Research Blog*:\n\n"
    for post in openai_research_blog:
        paragraph = [f"*{post['title']}*",
                     post["description"],
                     post["tag"],
                     f"[Read now]({post['article_link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_hf_blog_post(hf_research_blog):
    hf_research_blog = hf_research_blog[:5]
    text = "*Latest HF Blog*:\n\n"
    for post in hf_research_blog:
        paragraph = [f"*{post['title']}*",
                     f"[Read now]({post['article_link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()


def make_pytorch_blog_post(pytorch_research_blog):
    pytorch_research_blog = pytorch_research_blog[:5]
    text = "*Latest PyTorch Blog*:\n\n"
    for post in pytorch_research_blog:
        paragraph = [f"*{post['title']}*",
                     post["description"],
                     f"[Read now]({post['article_link']})"]
        text += "\n".join(paragraph) + "\n\n"
    return text.rstrip()

In [10]:
def get_embedding(text: str):
    tokens = tokenizer(text, truncation=True, return_tensors='pt')
    with torch.no_grad():
        emb = model(**tokens).pooler_output
    return emb

In [11]:
def make_search_answer(objects):
    objects = objects[:5]
    text = "*Sematic Search Results*:\n\n"
    for post in objects:
        paragraph = f"[{post['title']}]"
        paragraph += f"({post['article_link']})" if "article_link" in post else f"({post['link']})"
        text += paragraph + "\n\n"
    return text.rstrip()

In [12]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = AutoModel.from_pretrained("prajjwal1/bert-tiny")
model = model.eval()

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
blog_posts = get_blog_posts()
research_posts = get_research_posts()

  venue_text = BeautifulSoup(publication["venue_html"], 'lxml').text


In [14]:
with open("blog_posts.pkl", "wb") as f:
    pickle.dump(blog_posts, f)
    
with open("research_posts.pkl", "wb") as f:
    pickle.dump(research_posts, f)

In [15]:
dataset = blog_posts + research_posts
dataset = [j for i in dataset for j in i]
dataset = [(i, get_embedding(i["title"])) for i in dataset]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
bot = telebot.TeleBot("token")

blogs_dict = {
    "Yandex Research Blog": make_yandex_research_blog_post(blog_posts[0]),
    "Open AI Blog": make_openai_blog_post(blog_posts[1]),
    "Hugging Face Blog": make_hf_blog_post(blog_posts[2]),
    "PyTorch Blog": make_pytorch_blog_post(blog_posts[3])
}

publication_dict = {
    "NVIDIA AI Publications": make_nvidia_post(research_posts[0]),
    "Google AI Publications": make_google_ai_post(research_posts[1]),
    "Yandex Research Publications": make_yandex_research_post(research_posts[2]),
    "DeepMind Research Publications": make_deepmind_research_post(research_posts[3]),
    "Google Research Publications": make_google_research_post(research_posts[4])
}


@bot.message_handler(commands=['start'])
def start(message) -> None:
    UsrInfo = bot.get_chat_member(message.from_user.id, message.from_user.id).user
    
    markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
    button_create = types.KeyboardButton("👋 Собрать свежую информацию")
    markup.add(button_create)
    
    bot.send_message(message.chat.id, f'💻 Привет, {UsrInfo.first_name}! Я *бот-агрегатор* профильных материалов в *Computer Science* & *Data Science*. '
                                      'Могу актуализировать новые публикации и готовящиеся анонсы мероприятий 🚀\n\n'
                                      'Давай проверим мой функционал 🤟',
                     reply_markup=markup, parse_mode='Markdown')
    

@bot.message_handler(commands=['search'])
def search(message) -> None:
    message.text = message.text[9:]
    if message.text:
        emb = get_embedding(message.text)
        response = [(i[0], torch.nn.functional.cosine_similarity(i[1], emb).cpu().detach().numpy()[0]) for i in dataset]
        results = sorted(response, key=lambda x: x[1], reverse=True)
        results = [i[0] for i in results]
        answer = make_search_answer(results)
        print(answer)

        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btn = types.KeyboardButton("Назад")
        markup.add(btn)

        bot.send_message(message.chat.id, answer, reply_markup=markup, parse_mode='Markdown')
    else:
        bot.send_message(message.chat.id, "Запрос пустой")
        message.text = "Поиск"
        func(message)

@bot.message_handler(content_types=['text'])
def func(message):
    if message.text == "👋 Собрать свежую информацию":
        bot.send_message(message.chat.id, "Собираю информацию. Более 7 тысяч материалов...|")
        time.sleep(0.1)
        bot.send_message(message.chat.id, "Привожу в читабельный вид...")
        time.sleep(0.1)
        
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btn1 = types.KeyboardButton("Блоги")
        btn2 = types.KeyboardButton("Публикации")
        btn3 = types.KeyboardButton("Поиск")
        markup.add(btn1, btn2, btn3)
        
        bot.send_message(message.chat.id, "Готово! Нужно выбрать категорию\n"
                                          "*Блоги* - анонсы мероприятий, полезные статьи\n"
                                          "*Публикации* - публикации в журналах и на конференциях\n"
                                          "*Поиск* - по контексту ищем совпадение в базе",
                         parse_mode='Markdown', reply_markup=markup)
    
    # =======
    elif message.text == "Блоги":
        bot.send_message(message.chat.id, "Супер 🔥")
        
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btns = [types.KeyboardButton(i) for i in blogs_dict.keys()]
        btns += [types.KeyboardButton("Назад")]
        markup.add(*btns)
        
        bot.send_message(message.chat.id, "Вот, что мне удалось собрать:", reply_markup=markup)
        
    elif message.text.endswith("Blog"):
        try:
            bot.send_message(message.chat.id, blogs_dict[message.text], parse_mode='Markdown')
        except:
            bot.send_message(message.chat.id, blogs_dict[message.text])
    
    # =======
    elif message.text == "Публикации":
        bot.send_message(message.chat.id, "Супер 🔥")
        
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btns = [types.KeyboardButton(i) for i in publication_dict.keys()]
        btns += [types.KeyboardButton("Назад")]
        markup.add(*btns)
        
        bot.send_message(message.chat.id, "Вот, что мне удалось собрать:", reply_markup=markup)
        
    elif message.text.endswith("Publications"):
        try:
            bot.send_message(message.chat.id, publication_dict[message.text], parse_mode='Markdown')
        except:
            bot.send_message(message.chat.id, publication_dict[message.text])
            
     # =======
    
    elif message.text == "Назад":
        message.text = "👋 Собрать свежую информацию"
        func(message)
        
    # -------
    
    elif message.text == "Поиск":
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btn = types.KeyboardButton("/search transformers in text")
        markup.add(btn)
        
        bot.send_message(message.chat.id, "Воспользуйся командой для поиска: /search {текст сюда}", reply_markup=markup)
    
    else:
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        btn = types.KeyboardButton("👋 Собрать свежую информацию")
        markup.add(btn)
        
        bot.send_message(message.chat.id, "Я тебя не понял, поэтому давай начнем сначала", reply_markup=markup)
        
    
bot.polling(none_stop=True)