In [2]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
mystem = Mystem()
model_ft = gensim.models.KeyedVectors.load('../vectorizers/araneum_none_fasttextcbow_300_5_2018.model')
bert_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
bert_model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

def vec_normalization(vec):
    return vec / np.linalg.norm(vec)

def make_ft_embedding(line:str):
    emb_list = []
    for word in line.split():
        emb_list .append(model_ft[word])
    emb_list = vec_normalization(np.array(emb_list))
    return emb_list.mean(axis=0)


def preprocess_line(text: str) -> str:
    # убираем пунктуацию, оставляем только дефисы
    no_punct = re.sub('[,\?\!\."\:]|[a-zA-Z]+', '', ''.join(text))
    # токенизируем и лемматизируем текст, приводим к нижнему регистру
    lem_words = mystem.lemmatize(text.lower())
    ans = ' '.join([w for w in lem_words if w.isalpha()])
    if ans == " ":
        return 'пустой текст'
    elif ans == '':
        return 'пустой текст'
    else:
        return re.sub('\n', '', ans)

def make_df_from_corpus(path_to_json:str):
    with open(path_to_json, 'r') as f:
        qa_corpus = list(f)[:11000]
    questions = []
    answers = []
    for qa in qa_corpus:
      qa = json.loads(qa)
      if qa['answers'] != []:
        max_value = -10 ** 6
        max_text = ''
        for answer in qa['answers']:
          if answer['author_rating']['value'] != '':
            cur_value = int(answer['author_rating']['value'])
            if cur_value >= max_value:
              max_text = answer['text']
        if max_text != '':
          answers.append(max_text)
          questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    df['ans_lemmas'] = df['answers'].apply(preprocess_line)
    return df

def save_embed_corpus(df):
    df['ans_embeds'] = df['ans_lemmas'].apply(make_ft_embedding)
    ans_embeds = df['ans_lemmas'].apply(make_ft_embedding).to_numpy()
    split_df = pd.DataFrame(df['ans_embeds'].tolist(),
                        columns=[f'word{i}' for i in range(300)])
    split_df['doc_name'] = df['answers']
    split_df.to_csv('../corpora/corpus_ft.csv')

        
def save_bert_corpus(texts, model, tokenizer):
    vectors = []
    for text in texts:
      t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
      with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
      embeddings = model_output.last_hidden_state[:, 0, :]
      embeddings = torch.nn.functional.normalize(embeddings)
      vectors.append(embeddings[0].cpu().numpy())
    BERT_corpus = sparse.csr_matrix(vectors)
    sparse.save_npz('../corpora/BERT.npz', BERT_corpus)
    return sparse.csr_matrix(vectors)


def get_query_bert(query):
    cls_embeddings = get_bert_corpus(query, b_model, b_tokenizer)
    return sparse.csr_matrix(cls_embeddings)



if __name__ == "__main__":
    df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')
    df = df.head(10000)
    save_embed_corpus(df)
    save_bert_corpus(df['answers'], bert_model, bert_tokenizer)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
import pickle
import scipy
from scipy import sparse
import torch
from transformers import AutoTokenizer, AutoModel
mystem = Mystem()
model_ft = gensim.models.KeyedVectors.load('../vectorizers/araneum_none_fasttextcbow_300_5_2018.model')
bert_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
bert_model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
df = pd.read_csv('../data/questions_about_love.csv')

def vec_normalization(vec):
    return vec / np.linalg.norm(vec)

def make_ft_embedding(line:str):
    emb_list = []
    for word in line.split():
        emb_list .append(model_ft[word])
    emb_list = vec_normalization(np.array(emb_list))
    return emb_list.mean(axis=0)


def preprocess_line(text: str) -> str:
    # убираем пунктуацию, оставляем только дефисы
    no_punct = re.sub('[,\?\!\."\:]|[a-zA-Z]+', '', ''.join(text))
    # токенизируем и лемматизируем текст, приводим к нижнему регистру
    lem_words = mystem.lemmatize(text.lower())
    ans = ' '.join([w for w in lem_words if w.isalpha()])
    if ans == " ":
        return 'пустой текст'
    elif ans == '':
        return 'пустой текст'
    else:
        return re.sub('\n', '', ans)


def cs_FastText(query, corpus_ft, cols):
    query = preprocess_line(query)
    scores = np.dot(corpus_ft[cols].values, make_ft_embedding(query).T)
    argx = np.argsort(scores)[::-1]
    return corpus_ft['doc_name'][argx.ravel()]


def cosine_similarity_matrix_query(sparse_matrix, query):
    return np.dot(sparse_matrix, query.T).toarray()


def make_bert_embedding(query):
    embeddings = embed_bert_cls(query, bert_model, bert_tokenizer)
    return sparse.csr_matrix(embeddings)


def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].numpy()

def cs_BERT(query, sparse_matrix):
    query = make_bert_embedding(query)
    scores = cosine_similarity_matrix_query(sparse_matrix, query)
    argx = np.argsort(scores, axis=0)[::-1]
    return df['answers'][argx.ravel()]


def main(query:str, top_n:int):
    path_to_corpus = '../corpora/corpus_ft.csv'
    corpus_ft = pd.read_csv(path_to_corpus)
    cols = [col for col in corpus_ft.columns if 'word' in col]
    tf_answers = cs_FastText(query, corpus_ft, cols)[:top_n].to_numpy()
    sparse_matrix = sparse.load_npz('../corpora/BERT.npz')
    bert_answers = cs_BERT(query, sparse_matrix)[:top_n].to_numpy()
    return tf_answers, bert_answers

if __name__ == "__main__":
    query = input('Введите фразу, которую хотите найти в корпусе: ')
    top_n = int(input('Напишите число n, топ-n результатов выдачи моделей вы хотите видеть: '))
    tf_answers, bert_answers = main(query, top_n)
    tf_answers = ',\n'.join(tf_answers)
    bert_answers = ',\n'.join(bert_answers)
    print(f'Вот топ-{top_n} ответов, найденных с помощью FastText: \n', tf_answers, '\n\n')
    print(f'Вот топ-{top_n} ответов, найденных с помощью BERT: \n', bert_answers)

Введите фразу, которую хотите найти в корпусе:  гей
Напишите число n, топ-n результатов выдачи моделей вы хотите видеть:  10


Вот топ-10 ответов, найденных с помощью FastText: 
 Ты гей?,
Гопника,
нигера),
мачо,
Шлюхи!,
маньяк,
Мусульманка,
Блонда,
социопат,
Может он гей??! 


Вот топ-10 ответов, найденных с помощью BERT: 
 фу,
хз,
хз,
М,
Ж,
мачо,
ум,
лес,
м,
ага


In [26]:
tf_answers, bert_answers = main('не любит', 5)

In [27]:
tf_answers

array(['Любят.', 'нравишься', 'нравишься', 'Нравишься.', 'Нравишься'],
      dtype=object)

In [28]:
bert_answers

array(['Не любит.', 'не с любыми', 'не дружелюбие',
       'не уважает и не любит', 'такое не бывает'], dtype=object)

In [18]:
# import json
# import gensim
# from gensim.models.wrappers import FastText
# import numpy as np
# import pandas as pd
# import re
# import nltk
# from nltk.tokenize import word_tokenize
# from pymystem3 import Mystem
# from nltk.corpus import stopwords
# from sklearn.metrics.pairwise import cosine_similarity
import pickle
import scipy
from scipy import sparse
import torch
from transformers import AutoTokenizer, AutoModel

# nltk.download("stopwords")
# russian_stopwords = stopwords.words("russian")
# mystem = Mystem()

bert_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
bert_model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
# model_ft = gensim.models.KeyedVectors.load('../vectorizers/araneum_none_fasttextcbow_300_5_2018.model')
# df = pd.read_csv('../data/questions_about_love.csv')


def cosine_similarity_matrix_query(sparse_matrix, query):
    return np.dot(sparse_matrix, query.T).toarray()


def make_bert_embedding(query):
    embeddings = embed_bert_cls(query, bert_model, bert_tokenizer)
    return sparse.csr_matrix(embeddings)


def cs_BERT(query, sparse_matrix):
    query = make_bert_embedding(query)
    scores = cosine_similarity_matrix_query(sparse_matrix, query)
    argx = np.argsort(scores, axis=0)[::-1]
    return df['answers'][argx.ravel()]

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].numpy()



Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
