In [11]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
# Fetch the NLTK stop-word package and keep the Russian list for filtering lemmas.
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
# Single Mystem instance; preprocess_line uses it for lemmatization.
mystem = Mystem()
# Word vectors loaded from a local file; the path is relative to the working directory.
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')


def make_ft_embedding(line:str, model_ft):
    """Return the mean word vector of a whitespace-tokenized line.

    Args:
        line: Text whose whitespace-separated tokens are looked up one by one.
        model_ft: Object supporting ``model_ft[word]`` lookups that return
            1-D vectors (e.g. the gensim keyed vectors loaded in this notebook).

    Returns:
        A 1-D numpy array holding the element-wise mean of the vectors found.
        If no token produced a vector (empty line, or every lookup raised
        ``KeyError``) a zero vector is returned, so every corpus row keeps the
        same width.
    """
    vectors = []
    for word in line.split():
        try:
            vectors.append(model_ft[word])
        except KeyError:
            # No vector for this token: skip it rather than abort the whole corpus run.
            continue
    if not vectors:
        # The mean of an empty array is a scalar NaN, which would break the
        # fixed-width table built from these vectors. Fall back to 300, the
        # column count used when the corpus is saved.
        return np.zeros(getattr(model_ft, 'vector_size', 300), dtype=np.float32)
    return np.array(vectors).mean(axis=0)


def preprocess_line(text: str) -> str:
    """Turn raw text into a space-separated string of lowercase lemmas.

    The listed punctuation marks and runs of Latin letters are blanked out,
    the remainder is lowercased and lemmatized with the module-level
    ``mystem``, and lemmas that are Russian stop words or are not purely
    alphabetic are dropped.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces ('' when none survive).
    """
    # Blank out punctuation and Latin words; hyphens are not matched by the
    # pattern. Substituting a space keeps neighbouring words from being glued together.
    cleaned = re.sub(r'[,?!.":]|[a-zA-Z]+', ' ', text)
    # Lemmatize the cleaned text (previously the cleaned string was computed
    # but the raw text was lemmatized instead).
    lemmas = mystem.lemmatize(cleaned.lower())
    # NOTE(review): str.isalpha() is False for hyphenated tokens, so they are
    # dropped here even though the regex above leaves hyphens in place.
    kept = [w for w in lemmas if w not in russian_stopwords and w.isalpha()]
    # Every kept token is alphabetic, so no newline can remain in the result.
    return ' '.join(kept)

def make_df_from_corpus(path_to_json:str, limit=50):
    """Build a question/answer DataFrame from a JSON-lines corpus.

    For each record the text of the highest-rated answer is kept (on equal
    ratings the later answer wins). Answers whose rating value is '' are
    ignored; records with no rated answer, or whose best answer text is '',
    are skipped.

    Args:
        path_to_json: Path to a file with one JSON object per line. Each
            object is read via its 'question' and 'answers' keys; each answer
            via 'text' and 'author_rating' -> 'value'.
        limit: Maximum number of lines to read from the start of the file
            (``None`` reads the whole file).

    Returns:
        DataFrame with columns 'questions', 'answers' and 'ans_lemmas'
        (the answers passed through ``preprocess_line``).
    """
    from itertools import islice

    questions = []
    answers = []
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(path_to_json, 'r', encoding='utf-8') as f:
        # islice stops after `limit` lines instead of loading the whole file first.
        for raw in islice(f, limit):
            if not raw.strip():
                continue  # tolerate blank lines in the dump
            qa = json.loads(raw)
            best_value = None
            best_text = ''
            for answer in qa['answers']:
                rating = answer['author_rating']['value']
                if rating == '':
                    continue
                cur_value = int(rating)
                if best_value is None or cur_value >= best_value:
                    # Remember the running maximum; without this update the
                    # last rated answer always won regardless of its rating.
                    best_value = cur_value
                    best_text = answer['text']
            if best_text != '':
                answers.append(best_text)
                questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    df['ans_lemmas'] = df['answers'].apply(preprocess_line)
    return df

def save_embed_corpus(df, text_column, out_path='../corpora/corpus_ft.csv'):
    """Embed the lemmatized answers and write the vectors to a CSV file.

    Args:
        df: DataFrame with an 'ans_lemmas' column and the column named by
            ``text_column``. An 'ans_embeds' column is added to it in place.
        text_column: Column whose values are stored as 'doc_name' next to
            each vector.
        out_path: Destination CSV path.

    Returns:
        None. The CSV holds one 'word{i}' column per vector component plus
        'doc_name'.
    """
    embeds = df['ans_lemmas'].apply(lambda x: make_ft_embedding(x, model_ft))
    # Kept on the caller's frame, as before.
    df['ans_embeds'] = embeds
    # Take the width from the data; 300 is only the fallback for an empty frame.
    width = len(embeds.iloc[0]) if len(embeds) else 300
    split_df = pd.DataFrame(embeds.tolist(),
                            columns=[f'word{i}' for i in range(width)])
    # Assign by position: split_df has a fresh 0..n-1 index, so label
    # alignment would yield NaN whenever df carries a different index.
    split_df['doc_name'] = df[text_column].to_numpy()
    split_df.to_csv(out_path)

# if __name__ == "__main__":
#     df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')
#     save_embed_corpus(df, text_column='answers')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Build the question/answer frame (including lemmatized answers) from the JSONL dump.
df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')

In [13]:
# Embed the lemmatized answers and write the vectors to the corpus CSV.
save_embed_corpus(df, text_column='answers')

In [None]:
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Word vectors used to embed the query; must be the same model the corpus CSV was built with.
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

query = input('Введите фразу, которую хотите найти в корпусе: ')

path_to_corpus = '../corpora/corpus_ft.csv'
df = pd.read_csv(path_to_corpus)

# Vector components are stored in columns named word0, word1, ...
cols = [col for col in df.columns if col.startswith('word')]

# Corpus rows are means of per-word vectors of lowercased text, so embed the
# query the same way instead of looking the whole phrase up as one token.
# NOTE(review): the corpus text was also lemmatized and stop-word filtered;
# this cell does not import the lemmatizer, so that step is not repeated here.
query_tokens = query.lower().split()
if not query_tokens:
    raise ValueError('query must contain at least one word')
query_vec = np.mean([model_ft[token] for token in query_tokens], axis=0)

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
scores = cosine_similarity(query_vec.reshape(1, -1), df[cols].to_numpy())[0]
# Document names ordered from most to least similar.
argx = np.argsort(scores)[::-1]
df['doc_name'].to_numpy()[argx]

In [6]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder for the "cointegrated/rubert-tiny" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` as the normalized hidden state at sequence position 0.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable encoder exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors for ``model``.

    Returns:
        1-D numpy array: the first row of the normalized position-0 vectors.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every input tensor to the device the model lives on.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)
    with torch.no_grad():
        output = model(**on_device)
    # Position 0 of the last layer, scaled by torch's default normalize().
    first_token = output.last_hidden_state[:, 0, :]
    unit = torch.nn.functional.normalize(first_token)
    return unit[0].cpu().numpy()


def make_df_from_corpus(path_to_json:str, limit=50):
    """Build a question/answer DataFrame from a JSON-lines corpus.

    For each record the text of the highest-rated answer is kept (on equal
    ratings the later answer wins). Answers whose rating value is '' are
    ignored; records with no rated answer, or whose best answer text is '',
    are skipped.

    Args:
        path_to_json: Path to a file with one JSON object per line. Each
            object is read via its 'question' and 'answers' keys; each answer
            via 'text' and 'author_rating' -> 'value'.
        limit: Maximum number of lines to read from the start of the file
            (``None`` reads the whole file).

    Returns:
        DataFrame with columns 'questions' and 'answers'.
    """
    from itertools import islice

    questions = []
    answers = []
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(path_to_json, 'r', encoding='utf-8') as f:
        # islice stops after `limit` lines instead of loading the whole file first.
        for raw in islice(f, limit):
            if not raw.strip():
                continue  # tolerate blank lines in the dump
            qa = json.loads(raw)
            best_value = None
            best_text = ''
            for answer in qa['answers']:
                rating = answer['author_rating']['value']
                if rating == '':
                    continue
                cur_value = int(rating)
                if best_value is None or cur_value >= best_value:
                    # Remember the running maximum; without this update the
                    # last rated answer always won regardless of its rating.
                    best_value = cur_value
                    best_text = answer['text']
            if best_text != '':
                answers.append(best_text)
                questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    return df

def save_embed_corpus(df, text_column, out_path='../corpora/corpus_bert.csv'):
    """Embed a text column with the BERT encoder and write the vectors to CSV.

    Args:
        df: DataFrame containing ``text_column``. An 'ans_embeds' column is
            added to it in place.
        text_column: Column that is embedded and also stored as 'doc_name'.
        out_path: Destination CSV path.

    Returns:
        None. The CSV holds one 'word{i}' column per vector component plus
        'doc_name'.
    """
    embeds = df[text_column].apply(lambda x: embed_bert_cls(x, model, tokenizer))
    # Kept on the caller's frame, as before.
    df['ans_embeds'] = embeds
    # Positional access: a label lookup of 0 fails on an empty frame or on
    # any frame whose index does not contain 0.
    width = embeds.iloc[0].shape[0] if len(embeds) else 0
    split_df = pd.DataFrame(embeds.tolist(),
                            columns=[f'word{i}' for i in range(width)])
    # Assign by position: split_df has a fresh 0..n-1 index, so label
    # alignment would yield NaN whenever df carries a different index.
    split_df['doc_name'] = df[text_column].to_numpy()
    split_df.to_csv(out_path)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# NOTE(review): `df` is not created in this cell; it must already exist in the kernel.
# The vectors are stored under 'ans_embed', whereas save_embed_corpus writes 'ans_embeds'.
df['ans_embed'] = df['answers'].apply(lambda x: embed_bert_cls(x, model, tokenizer))

KeyboardInterrupt: 

In [None]:
# Embed the 'answers' column and write the corpus CSV.
# NOTE(review): which save_embed_corpus runs depends on the last cell that defined it — confirm.
save_embed_corpus(df, text_column='answers')

In [14]:
pwd

'/Users/pikachu/info_search/hw4'

In [16]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
# Fetch the NLTK stop-word package and keep the Russian list for filtering lemmas.
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
# Single Mystem instance; preprocess_line uses it for lemmatization.
mystem = Mystem()
# Word vectors loaded from a local file; the path is relative to the working directory.
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

def make_ft_embedding(line:str):
    """Return the mean word vector of a whitespace-tokenized line.

    Vectors are looked up in the module-level ``model_ft``.

    Args:
        line: Text whose whitespace-separated tokens are looked up one by one.

    Returns:
        A 1-D numpy array holding the element-wise mean of the vectors found.
        If no token produced a vector (empty line, or every lookup raised
        ``KeyError``) a zero vector is returned, so every corpus row keeps the
        same width.
    """
    vectors = []
    for word in line.split():
        try:
            vectors.append(model_ft[word])
        except KeyError:
            # No vector for this token: skip it rather than abort the whole corpus run.
            continue
    if not vectors:
        # The mean of an empty array is a scalar NaN, which would break the
        # fixed-width table built from these vectors. Fall back to 300, the
        # column count used when the corpus is saved.
        return np.zeros(getattr(model_ft, 'vector_size', 300), dtype=np.float32)
    return np.array(vectors).mean(axis=0)


def preprocess_line(text: str) -> str:
    """Turn raw text into a space-separated string of lowercase lemmas.

    The listed punctuation marks and runs of Latin letters are blanked out,
    the remainder is lowercased and lemmatized with the module-level
    ``mystem``, and lemmas that are Russian stop words or are not purely
    alphabetic are dropped.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces ('' when none survive).
    """
    # Blank out punctuation and Latin words; hyphens are not matched by the
    # pattern. Substituting a space keeps neighbouring words from being glued together.
    cleaned = re.sub(r'[,?!.":]|[a-zA-Z]+', ' ', text)
    # Lemmatize the cleaned text (previously the cleaned string was computed
    # but the raw text was lemmatized instead).
    lemmas = mystem.lemmatize(cleaned.lower())
    # NOTE(review): str.isalpha() is False for hyphenated tokens, so they are
    # dropped here even though the regex above leaves hyphens in place.
    kept = [w for w in lemmas if w not in russian_stopwords and w.isalpha()]
    # Every kept token is alphabetic, so no newline can remain in the result.
    return ' '.join(kept)

def make_df_from_corpus(path_to_json:str, limit=10000):
    """Build a question/answer DataFrame from a JSON-lines corpus.

    For each record the text of the highest-rated answer is kept (on equal
    ratings the later answer wins). Answers whose rating value is '' are
    ignored; records with no rated answer, or whose best answer text is '',
    are skipped.

    Args:
        path_to_json: Path to a file with one JSON object per line. Each
            object is read via its 'question' and 'answers' keys; each answer
            via 'text' and 'author_rating' -> 'value'.
        limit: Maximum number of lines to read from the start of the file
            (``None`` reads the whole file).

    Returns:
        DataFrame with columns 'questions', 'answers' and 'ans_lemmas'
        (the answers passed through ``preprocess_line``).
    """
    from itertools import islice

    questions = []
    answers = []
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(path_to_json, 'r', encoding='utf-8') as f:
        # islice stops after `limit` lines instead of loading the whole file first.
        for raw in islice(f, limit):
            if not raw.strip():
                continue  # tolerate blank lines in the dump
            qa = json.loads(raw)
            best_value = None
            best_text = ''
            for answer in qa['answers']:
                rating = answer['author_rating']['value']
                if rating == '':
                    continue
                cur_value = int(rating)
                if best_value is None or cur_value >= best_value:
                    # Remember the running maximum; without this update the
                    # last rated answer always won regardless of its rating.
                    best_value = cur_value
                    best_text = answer['text']
            if best_text != '':
                answers.append(best_text)
                questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    df['ans_lemmas'] = df['answers'].apply(preprocess_line)
    return df

def save_embed_corpus(df, text_column, out_path='../corpora/corpus_ft.csv'):
    """Embed the lemmatized answers and write the vectors to a CSV file.

    Args:
        df: DataFrame with an 'ans_lemmas' column and the column named by
            ``text_column``. An 'ans_embeds' column is added to it in place.
        text_column: Column whose values are stored as 'doc_name' next to
            each vector.
        out_path: Destination CSV path.

    Returns:
        None. The CSV holds one 'word{i}' column per vector component plus
        'doc_name'.
    """
    embeds = df['ans_lemmas'].apply(make_ft_embedding)
    # Kept on the caller's frame, as before.
    df['ans_embeds'] = embeds
    # Take the width from the data; 300 is only the fallback for an empty frame.
    width = len(embeds.iloc[0]) if len(embeds) else 300
    split_df = pd.DataFrame(embeds.tolist(),
                            columns=[f'word{i}' for i in range(width)])
    # Assign by position: split_df has a fresh 0..n-1 index, so label
    # alignment would yield NaN whenever df carries a different index.
    split_df['doc_name'] = df[text_column].to_numpy()
    split_df.to_csv(out_path)

# Script-style entry point: build the corpus frame, then embed it and write the CSV.
# NOTE(review): in a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')
    save_embed_corpus(df, text_column='answers')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [None]:
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Word vectors used to embed the query; must be the same model the corpus CSV was built with.
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

query = input('Введите фразу, которую хотите найти в корпусе: ')

path_to_corpus = '../corpora/corpus_ft.csv'
df = pd.read_csv(path_to_corpus)

# Vector components are stored in columns named word0, word1, ...
cols = [col for col in df.columns if col.startswith('word')]

# Corpus rows are means of per-word vectors of lowercased text, so embed the
# query the same way instead of looking the whole phrase up as one token.
# NOTE(review): the corpus text was also lemmatized and stop-word filtered;
# this cell does not import the lemmatizer, so that step is not repeated here.
query_tokens = query.lower().split()
if not query_tokens:
    raise ValueError('query must contain at least one word')
query_vec = np.mean([model_ft[token] for token in query_tokens], axis=0)

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
scores = cosine_similarity(query_vec.reshape(1, -1), df[cols].to_numpy())[0]
# Document names ordered from most to least similar.
argx = np.argsort(scores)[::-1]
df['doc_name'].to_numpy()[argx]