In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from nltk import ngrams
from collections import Counter
from rapidfuzz import fuzz
import os
import fasttext
from pandarallel import pandarallel
from pymystem3 import Mystem
from string import punctuation
import nltk 
import multiprocessing
from kaznlp.morphology.analyzers import AnalyzerDD
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from fastbm25 import fastbm25
from transformers import AutoTokenizer, AutoModel
import torch
import spacy
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# Use half of the logical CPUs for the pandarallel workers, but never fewer than
# one: cpu_count() // 2 evaluates to 0 on a single-core machine.
cores = max(1, multiprocessing.cpu_count() // 2)
pandarallel.initialize(progress_bar=False, nb_workers=cores)

# Without this setting a conflict arises between the BERT tokenizers and pandarallel.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Questions for which a matching document has to be found.
test = pd.read_csv("data/epir_test.csv")

# Document collections, in the fixed order df1..df4.
source_names = ("articles", "life_situations", "news", "services")
df1, df2, df3, df4 = (
    pd.read_csv(f"data/epir_train/{source_name}.csv") for source_name in source_names
)

# Snapshots taken before any column is added, renamed or dropped.
df1_orig, df2_orig, df3_orig, df4_orig = (frame.copy() for frame in (df1, df2, df3, df4))

# Remember which collection every row came from.
for frame, source_name in zip((df1, df2, df3, df4), source_names):
    frame['file'] = source_name
### PREPROCESSING: give every collection the same title / body / url / file columns
common_columns = ['Unnamed: 0', 'id', 'sys_lang', 'title', 'body', 'url', 'file']

# articles: `content` becomes `body`
df1 = df1.rename(columns={'content': 'body'})

# life_situations: title = title_main + title_sub, `instruction` becomes `body`, `URL` becomes `url`
df2 = df2.assign(title=df2['title_main'].fillna("") + " " + df2['title_sub'].fillna(""))
df2 = df2.rename(columns={'instruction': 'body', 'URL': 'url'})
df2 = df2.loc[:, common_columns]

# news: `short_description` is not used
df3 = df3.drop(columns='short_description')

# services: `full_title` replaces `title`; body = description + result_description
df4 = df4.drop(columns="title").rename(columns={'full_title': 'title'})
df4 = df4.assign(body=df4['description'].fillna("") + " " + df4['result_description'].fillna(""))
df4 = df4.loc[:, common_columns]

### Merge the four collections into one corpus
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Searchable text = title + body, trimmed and lower-cased.
df['text'] = df['title'].fillna("") + " " + df['body'].fillna("")
df['text'] = df['text'].str.strip()
df['text'] = df['text'].str.lower()

# Boilerplate that carries no information about the document. The text is already
# lower-cased, so the long announcements are lower-cased before matching as well.
# The Russian placeholder is matched WITHOUT a trailing space: the text was
# stripped above, so a placeholder at the very end of a text has no space after it.
boilerplate_phrases = (
    'ақпарат өзектендіру сатысында.',
    'information is being updated.',
    'информация находится на стадии актуализации.',
    "Dear citizens of the Republic of Kazakhstan! In order to prevent the spread of coronavirus infection in the Republic of Kazakhstan, the provision of services on paper media is possible through online reservations in the service centers of the population for the period of quarantine.Public services can be obtained online, as well as some services are available in the mobile app EgovMbile, Telegram-bot eGovKzBot2 .0 and social networks Facebook and VK.".lower(),
    "Уважаемые граждане Республики Казахстан! Для предотвращения распространения коронавирусной инфекции в Республике Казахстан оказание услуг на бумажных носителях возможно через онлайн бронирование в ЦОНах на период карантина.Государственные услуги можно получить онлайн, а также некоторые услуги и сервисы доступны в мобильном приложении EgovMobile, Telegram-боте eGovKzBot2.0 и социальных сетях Facebook и VK.".lower(),
    "Құрметті Қазақстан Республикасының азаматтары! Қазақстан Республикасында коронавирустық инфекцияның таралуын болдырмау үшін, қағаз түрінде қызмет көрсету карантин уақытында ХҚКО-на онлайн брондау арқылы көрсетілетін болады.Мемлекеттік қызметтерді онлайн алуға болады, сонымен қатар кейбір қызметтер мен сервистер ЕgovМоbile мобильді қосымшасында, eGovKzBot2.0 Telegram-ботында және Facebook пен VK әлеуметтік жүйелерінде қолжетімді.".lower(),
)
for phrase in boilerplate_phrases:
    df['text'] = df['text'].str.replace(phrase, '', regex=False)

df['text'] = df['text'].str.strip()

# Drop empty documents only now, so that rows consisting purely of boilerplate
# are removed as well.
df = df[df['text'] != ''].copy()

# Drop rows whose text is nothing but a "p" followed by digits and dots.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
df = df[~df['text'].str.contains(r'^p[\d\.]+$', regex=True)].copy()

# Drop all other languages.
df = df[df['sys_lang'].isin(['ru', 'kk', 'qq', 'en'])].copy()

In [9]:
# Language detector
# TODO: detect Kazakh written in Latin script
# idea: transliterate Latin to Cyrillic and run both the Latin and the Cyrillic variant
# https://fasttext.cc/docs/en/language-identification.html
model = fasttext.load_model('lid.176.bin')
def detect_language(query):
    """Return the first of 'ru', 'kk', 'en' among the fastText predictions for `query`.

    The model is asked for up to 176 labels; they are scanned in the order the
    model returns them, and the first one whose language code is supported wins.

    Args:
        query: text to classify. Line breaks are replaced by spaces because
            fastText's `predict` processes one line at a time and raises
            ValueError when the text contains a newline character.

    Returns:
        'ru', 'kk' or 'en', or None when none of them is among the predictions.
    """
    labels, _probs = model.predict(query.replace("\n", " "), k=176)
    for label in labels:
        # The language code is the part after the last "__" of the label.
        lang = label.split("__")[-1]
        if lang in {'ru', 'kk', 'en'}:
            return lang
    return None




In [10]:
# Detected language of every test question, then the number of questions per language.
test['language'] = test['question'].apply(detect_language)
test['language'].value_counts()

In [12]:
### TODO: add processing for 'qq' Kazakh; until then those documents are left out
df = df.loc[df['sys_lang'].ne('qq')].copy()

In [43]:
### The spaCy English model loaded below (en_core_web_sm) has to be downloaded once:
# import sys
# !{sys.executable} -m spacy download en_core_web_sm

In [17]:
# English: spaCy pipeline for lemmas, NLTK list for stop words
nlp = spacy.load('en_core_web_sm')
en_stopwords = set(stopwords.words('english'))

# Russian: Mystem for lemmas, NLTK list for stop words
ru_stem = Mystem()
russian_stopwords = set(stopwords.words("russian"))

# Punctuation characters as a set for membership tests
# (rebinds the name imported from `string`).
punctuation = set(punctuation)

# Kazakh "lemmatizer": kaznlp morphological analyzer
analyzer = AnalyzerDD()
analyzer.load_model(os.path.join('kaznlp', 'morphology', 'mdl'))


def lemmatize_ru(sentence):
    """Lemmatize a Russian sentence with Mystem.

    Tokens are dropped when they are Russian stop words, a single space, or
    (after trimming whitespace) a punctuation character. The remaining tokens
    are joined with single spaces and the result is stripped.
    """
    kept = []
    for lemma in ru_stem.lemmatize(sentence):
        if lemma in russian_stopwords or lemma == " ":
            continue
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)
    return " ".join(kept).strip()

def lemmatize_kk(sentence):
    """Reduce every word of a Kazakh sentence to the head of its first analysis.

    The sentence is split with NLTK's `word_tokenize`; tokens that are a single
    punctuation character are skipped. For each remaining token the first entry
    of the second element returned by `analyzer.analyze` is taken and cut at the
    first underscore (presumably stem_TAG..., to be confirmed against kaznlp).
    """
    stems = []
    for word in word_tokenize(sentence):
        if word in punctuation:
            continue
        analyses = analyzer.analyze(word)[1]
        stems.append(analyses[0].split("_")[0])
    return " ".join(stems)

def lemmatize_en(sentence):
    """Lemmatize an English sentence with spaCy.

    Lemmas that are English stop words or single punctuation characters are
    dropped; the remaining lemmas are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(sentence):
        lemma = tok.lemma_
        if lemma in en_stopwords or lemma in punctuation:
            continue
        lemmas.append(lemma)
    return " ".join(lemmas)




[nltk_data] Downloading package stopwords to /home/maxat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Start from the plain text; every language slice is then replaced by its lemmatized form.
df['text_stemmed'] = df['text'].copy()

for lang_code, lemmatize_fn in (('ru', lemmatize_ru), ('kk', lemmatize_kk), ('en', lemmatize_en)):
    lang_mask = df['sys_lang'] == lang_code
    df.loc[lang_mask, 'text_stemmed'] = df.loc[lang_mask, 'text_stemmed'].parallel_apply(lemmatize_fn)

# Just in case: one more round of normalisation.
df['text_stemmed'] = df['text_stemmed'].str.lower().str.strip()


In [23]:
# Split the dataset by language
lang2df = {lang: {'df': dfTemp} for lang, dfTemp in df.groupby("sys_lang")}

# For every language: document ids, a fitted TF-IDF model with its matrix, and a BM25 index.
for lang in tqdm(lang2df):
    entry = lang2df[lang]
    corpus = entry['df']['text_stemmed'].tolist()
    entry['ids'] = entry['df']['Unnamed: 0'].tolist()
    entry['vectorizer'] = TfidfVectorizer()
    entry['X'] = entry['vectorizer'].fit_transform(corpus)
    tokenized_corpus = [sent.split() for sent in corpus]
    # Parameter values were noted as the defaults; they still need tuning.
    entry['bm25'] = fastbm25(tokenized_corpus, PARAM_K1=1.5, PARAM_B=0.75, EPSILON=0.25)
    

100%|██████████| 3/3 [01:34<00:00, 31.58s/it]


In [26]:
## Model for Russian
# Mean pooling - the attention mask is taken into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: sequence whose first element holds all token embeddings.
        attention_mask: mask with one dimension fewer than the embeddings; it is
            given a trailing axis and expanded to the embeddings' size.

    Returns:
        Masked sum of the embeddings divided by the mask sum. The divisor is
        clamped to at least 1e-10, so an all-zero mask yields zeros instead of
        a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-10)
    return masked_total / mask_total

# Tokenizer and encoder for Russian, both loaded from the same checkpoint.
ru_checkpoint = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(ru_checkpoint)
ru_bert = AutoModel.from_pretrained(ru_checkpoint)

def get_ru_embs(sentences):
    """Embed sentences with the Russian BERT model.

    The sentences are tokenized as one padded batch (truncated to 512 tokens),
    run through `ru_bert` without gradient tracking, and mean-pooled with the
    attention mask. Returns the pooled tensor.
    """
    batch = tokenizer(sentences, padding=True, max_length=512, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = ru_bert(**batch)
    return mean_pooling(outputs, batch['attention_mask'])

In [27]:
# Model for English
from sentence_transformers import SentenceTransformer, util
en_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
# Alternative checkpoint kept for reference: 'sentence-transformers/distiluse-base-multilingual-cased'
bert_model = SentenceTransformer(en_checkpoint, device='cpu')

In [34]:
# Model for Kazakh
# Mean pooling - the attention mask is taken into account for correct averaging.
# NOTE: same logic as the mean_pooling defined for the Russian model; this
# definition replaces that one when the cell runs.
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings along dimension 1.

    `model_output[0]` holds all token embeddings. The mask is expanded to their
    size, used to zero out masked positions before summing, and its sum (clamped
    to at least 1e-10) is the divisor.
    """
    embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * weights, dim=1)
    weight_sum = torch.clamp(weights.sum(dim=1), min=1e-10)
    return weighted_sum / weight_sum

# Tokenizer and encoder for Kazakh, both loaded from the same checkpoint.
kk_checkpoint = "kz-transformers/kaz-roberta-conversational"
tokenizer_kk = AutoTokenizer.from_pretrained(kk_checkpoint)
kk_bert = AutoModel.from_pretrained(kk_checkpoint)

def get_kk_embs(sentences):
    """Embed sentences with the Kazakh RoBERTa model.

    The sentences are tokenized as one padded batch (truncated to 512 tokens),
    run through `kk_bert` without gradient tracking, and mean-pooled with the
    attention mask. Returns the pooled tensor.
    """
    batch = tokenizer_kk(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        outputs = kk_bert(**batch)
    return mean_pooling(outputs, batch['attention_mask'])

Some weights of RobertaModel were not initialized from the model checkpoint at kz-transformers/kaz-roberta-conversational and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Inference

In [45]:
from sklearn.metrics.pairwise import linear_kernel

test = pd.read_csv("data/epir_test.csv")
test['index'] = None
K = 3                 # TF-IDF candidates retrieved (and then re-ranked) per question
service_coef = 1.2    # score multiplier for 'services' / 'life_situations' documents
bm25_weight = 0.4     # weight of the BM25 score when it is added to the TF-IDF score
fallback_lang = 'ru'  # used when the detected language has no index in lang2df
boosted_files = ['services', 'life_situations']
lemmatizers = {'ru': lemmatize_ru, 'en': lemmatize_en, 'kk': lemmatize_kk}

# Indexing the corpus by document id does not depend on the question, so it is
# done once here instead of on every iteration of the loop.
df_by_id = df.set_index("Unnamed: 0")

for row_ind in tqdm(test.index):
    query_orig = test.loc[row_ind, 'question'].lower()
    lang = detect_language(query_orig)
    if lang not in lang2df:
        # detect_language() can return None; without a fallback the lookups
        # below would fail with a KeyError.
        lang = fallback_lang
    lemmatize = lemmatizers.get(lang)
    query = lemmatize(query_orig) if lemmatize is not None else query_orig
    lang_index = lang2df[lang]

    # Stage 1: TF-IDF retrieval over the documents of the detected language.
    query_vec = lang_index['vectorizer'].transform([query])
    scores = linear_kernel(query_vec, lang_index['X'])[0]

    # Never request more candidates than there are documents; argpartition
    # raises when the requested position is out of bounds.
    k = min(K, len(scores))
    top_k_indices = np.argpartition(scores, -k)[-k:]
    # argpartition does not order the selected items: sort them best first.
    top_k_indices = top_k_indices[np.argsort(scores[top_k_indices])][::-1]
    top_k_ids = [lang_index['ids'][ind] for ind in top_k_indices]
    dfTemp = df_by_id.loc[top_k_ids].copy()

    # Stage 2: re-rank the candidates.
    if lang == 'en':
        # Dot product between the sentence-transformer embeddings of the raw
        # question and of the raw candidate texts.
        query_bert_vec = bert_model.encode([query_orig])
        bert_vecs = bert_model.encode(dfTemp['text'].values.tolist())
        dfTemp['total_score'] = linear_kernel(query_bert_vec, bert_vecs)[0]
    elif lang == 'kk':
        # Embeddings are L2-normalised first, so the dot product is a cosine similarity.
        query_bert_vec = get_kk_embs([query_orig])
        bert_vecs = get_kk_embs(dfTemp['text'].values.tolist())
        bert_vecs = bert_vecs / torch.norm(bert_vecs, dim=1).reshape(-1, 1)
        query_bert_vec = query_bert_vec / torch.norm(query_bert_vec, dim=1).reshape(-1, 1)
        dfTemp['total_score'] = linear_kernel(query_bert_vec, bert_vecs)[0]
    else:
        # Lexical branch (Russian): TF-IDF score plus down-weighted BM25 score.
        # NOTE: an embedding re-ranker built on get_ru_embs (same recipe as the
        # 'kk' branch) is currently disabled. Summing reciprocal ranks instead
        # of raw scores is another option.
        dfTemp['tf_idfscore'] = scores[top_k_indices]
        query_tokens = query.split()
        topK_corpus = [sent.split() for sent in dfTemp['text_stemmed'].values.tolist()]
        dfTemp['bm25_score'] = [lang_index['bm25'].similarity_bm25(query_tokens, sent) for sent in topK_corpus]
        dfTemp['bm25_score'] *= bm25_weight
        score_columns = ['tf_idfscore', 'bm25_score']
        dfTemp['total_score'] = dfTemp[score_columns].sum(axis=1)

    # Correction for services and life_situations: scale their scores by service_coef.
    boosted = dfTemp['file'].isin(boosted_files)
    dfTemp.loc[boosted, 'total_score'] = service_coef * dfTemp.loc[boosted, 'total_score']

    # Store the document ids, best first, as a comma-separated string.
    dfTemp = dfTemp.sort_values('total_score', ascending=False)
    results = dfTemp.index.tolist()
    test.loc[row_ind, 'index'] = ",".join(map(str, results))

100%|██████████| 124/124 [00:20<00:00,  6.15it/s]


In [47]:
# Turn the comma-separated id string of every question into a list of ints.
# Values that are not strings (e.g. lists from an earlier run of this cell)
# are kept as they are, so the cell can be re-run without an AttributeError.
test['index'] = test['index'].apply(
    lambda ids: [int(doc_id) for doc_id in ids.split(',')] if isinstance(ids, str) else ids
)

In [48]:
# Show up to 125 rows when a frame is displayed.
pd.options.display.max_rows = 125

In [49]:
# One row per (question, candidate document): explode the per-question id lists.
test_vars = test.set_index('id')['index'].explode().reset_index()
# explode() leaves the ids with object dtype; cast them to int so the merge key
# is a real integer column (presumably the dtype of df['Unnamed: 0'] - verify).
test_vars['index'] = test_vars['index'].astype(int)
# Attach the question text and the text / source collection of every candidate.
test_vars = test_vars.merge(test[['id', 'question']], how='left', on='id')
doc_info = df[['Unnamed: 0', 'text', 'file']].rename(columns={'Unnamed: 0': 'index'})
test_vars = test_vars.merge(doc_info, how='left', on='index')
# Candidates are listed best first, so the running count within a question is the rank.
test_vars['rank'] = test_vars.groupby('id').cumcount() + 1
test_vars

Unnamed: 0,id,index,question,text,file,rank
0,1,2049,Алғашқы медициналық-санитариялық көмек көрсете...,алғашқы медициналық-санитариялық көмек көрсете...,services,1
1,1,179493,Алғашқы медициналық-санитариялық көмек көрсете...,сабақ 1. алғашқы медициналық көмек,news,2
2,1,139729,Алғашқы медициналық-санитариялық көмек көрсете...,денсаулық сақтау министрлігінің басшысы қостан...,news,3
3,2,647,How can I make an online reservation for servi...,issuance of id cards to stateless persons and ...,services,1
4,2,392,How can I make an online reservation for servi...,obtaining a quarantine certificate for movemen...,services,2
...,...,...,...,...,...,...
367,123,204849,Can I change my name online in Kazakhstan?,"how to change surname, name or patronymic chan...",life_situations,2
368,123,158059,Can I change my name online in Kazakhstan?,changes in legislation,news,3
369,124,128617,Documents needed for a name change in Kazakhstan.,"about changing the name, patronymic, surname t...",news,1
370,124,204850,Documents needed for a name change in Kazakhstan.,"how to change surname, name or patronymic what...",life_situations,2


In [50]:
# Rank-1 candidate of every question.
test_vars.loc[test_vars['rank'].eq(1)]

Unnamed: 0,id,index,question,text,file,rank
0,1,2049,Алғашқы медициналық-санитариялық көмек көрсете...,алғашқы медициналық-санитариялық көмек көрсете...,services,1
3,2,647,How can I make an online reservation for servi...,issuance of id cards to stateless persons and ...,services,1
6,3,155669,Online services Egov,new service for the pregnant available on egov...,news,1
9,4,156093,Telegram bot Egov servces,telegram bot features we are already used to t...,news,1
12,5,179713,Where can I find the latest updates and inform...,what public services can be obtained online wi...,news,1
15,6,116,How can I obtain a certificate from an antitub...,issuance of certificate from antituberculous o...,services,1
18,7,116,Process for obtaining an abstract from an anti...,issuance of certificate from antituberculous o...,services,1
21,8,116,Are there specific eligibility criteria or req...,issuance of certificate from antituberculous o...,services,1
24,9,1562,Разрешение на экспорт или импорт определенных ...,выдача разрешения на экспорт и (или) импорт от...,services,1
27,10,1562,Процесс получения разрешения на экспорт и/или ...,выдача разрешения на экспорт и (или) импорт от...,services,1


In [51]:
# Submission: the rank-1 document id of every question, indexed by question id.
sub = test_vars[test_vars['rank'] == 1].copy()
sub['index'] = sub['index'].astype(int)
# set_index replaces the row index, so a preceding reset_index is not needed.
sub = sub.set_index('id')
# to_csv does not create missing directories.
os.makedirs("submissions", exist_ok=True)
sub[['index']].to_csv("submissions/sub44.csv")