In [13]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
mystem = Mystem()
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

def make_ft_embedding(line:str, model_ft):
    emb_list = np.array()
    for word in line.split():
        emb_list.append(model_ft[word])
    return emb_list.mean()


def preprocess_line(text: str) -> str:
    # убираем пунктуацию, оставляем только дефисы
    no_punct = re.sub('[,\?\!\."\:]|[a-zA-Z]+', '', ''.join(text))
    # токенизируем и лемматизируем текст, приводим к нижнему регистру
    lem_words = mystem.lemmatize(text.lower())
    ans = ' '.join([w for w in lem_words if w not in russian_stopwords and w.isalpha()])
    return re.sub('\n', '', ans)

def make_df_from_corpus(path_to_json:str):
    with open(path_to_json, 'r') as f:
        qa_corpus = list(f)[:50000]
    questions = []
    answers = []
    for qa in qa_corpus:
      qa = json.loads(qa)
      if qa['answers'] != []:
        max_value = -10 ** 6
        max_text = ''
        for answer in qa['answers']:
          if answer['author_rating']['value'] != '':
            cur_value = int(answer['author_rating']['value'])
            if cur_value >= max_value:
              max_text = answer['text']
        if max_text != '':
          answers.append(max_text)
          questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    df['ans_lemmas'] = df['answers'].apply(preprocess_line)
    return df

def save_embed_corpus(df, text_column, model_ft):
    df['ans_embeds'] = df['ans_lemmas'].apply(lambda x: make_ft_embedding(x, model_ft))
    split_df = pd.DataFrame(df['ans_embeds'].tolist(),
                        columns=[f'word{i}' for i in range(300)])
    split_df['doc_name'] = df['answers']
    split_df.to_csv('../corpora/corpus_ft.csv')

if __name__ == "__main__":
    df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')
    save_embed_corpus(df, 'answers', model_ft)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TypeError: array() missing required argument 'object' (pos 1)

In [14]:
pwd

'/Users/pikachu/.Trash/info_search 22.13.24/hw4'

In [7]:
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

query = input('Введите фразу, которую хотите найти в корпусе: ')

path_to_corpus = 'corpus_ft.csv'
df = pd.read_csv(path_to_corpus)

cols = [col for col in df.columns if 'word' in col]
scores = cosine_similarity(model_ft[query].reshape((1,300)), df[cols].as_matrix())[0]
argx = np.argsort(scores)[::-1]
df['doc_name'].to_numpy()[argx.ravel()]

Введите фразу, которую хотите найти в корпусе:  скажи что сейчас модно с короткими ногтями


  


array(['скажи что сейчас модно с короткими ногтями',
       'С короткой, если нужен короткий...', 'сейчас можно', ...,
       'Сатурн виноват', 'А ты не виноватая, и он сам пришёл??? ))',
       'Подарите ему персональные стихи! Стихи на праздник'], dtype=object)

In [15]:
import json
import gensim
from gensim.models.wrappers import FastText
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from nltk.corpus import stopwords
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")
mystem = Mystem()
model_ft = gensim.models.KeyedVectors.load('araneum_none_fasttextcbow_300_5_2018.model')

def make_ft_embedding(line:str):
    emb_list = []
    for word in line.split():
        emb_list .append(model_ft[word])
    emb_list = np.array(emb_list)
    return emb_list.mean(axis=0)


def preprocess_line(text: str) -> str:
    # убираем пунктуацию, оставляем только дефисы
    no_punct = re.sub('[,\?\!\."\:]|[a-zA-Z]+', '', ''.join(text))
    # токенизируем и лемматизируем текст, приводим к нижнему регистру
    lem_words = mystem.lemmatize(text.lower())
    ans = ' '.join([w for w in lem_words if w not in russian_stopwords and w.isalpha()])
    return re.sub('\n', '', ans)

def make_df_from_corpus(path_to_json:str):
    with open(path_to_json, 'r') as f:
        qa_corpus = list(f)[:50]
    questions = []
    answers = []
    for qa in qa_corpus:
      qa = json.loads(qa)
      if qa['answers'] != []:
        max_value = -10 ** 6
        max_text = ''
        for answer in qa['answers']:
          if answer['author_rating']['value'] != '':
            cur_value = int(answer['author_rating']['value'])
            if cur_value >= max_value:
              max_text = answer['text']
        if max_text != '':
          answers.append(max_text)
          questions.append(qa['question'])
    df = pd.DataFrame({'questions': questions, 'answers': answers})
    df['ans_lemmas'] = df['answers'].apply(preprocess_line)
    return df

def save_embed_corpus(df, text_column):
    df['ans_embeds'] = df['ans_lemmas'].apply(make_ft_embedding)
    split_df = pd.DataFrame(df['ans_embeds'].tolist(),
                        columns=[f'word{i}' for i in range(300)])
    split_df['doc_name'] = df['answers']
    split_df.to_csv('../corpora/corpus_ft.csv')

if __name__ == "__main__":
    df = make_df_from_corpus(path_to_json='../data/questions_about_love.jsonl')
    save_embed_corpus(df, text_column='answers')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pikachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '../corpora/corpus_ft.csv'

In [16]:
pwd

'/Users/pikachu/.Trash/info_search 22.13.24/hw4'