# Import Packages and Installations

In [1]:
# Install required packages for Albert model
!pip install -q sentencepiece
!pip install -q transformers
!pip install -q tokenizers
!pip install -qU hazm
!pip install -qU clean-text[gpl]

!mkdir resources
!wget -q "https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip" -P resources
!unzip -qq resources/resources-0.5.zip -d resources

!pip install faiss-cpu

!rm -rf /content/4ccae468eb73bf6c4f4de3075ddb5336
!rm -rf /content/preproc
!rm preprocessing.py utils.py
!mkdir -p /content/preproc
!git clone https://gist.github.com/4ccae468eb73bf6c4f4de3075ddb5336.git /content/preproc/
!mv /content/preproc/* /content/
!rm -rf /content/preproc




[K     |████████████████████████████████| 1.2MB 8.6MB/s 
[K     |████████████████████████████████| 1.8MB 7.4MB/s 
[K     |████████████████████████████████| 3.2MB 34.7MB/s 
[K     |████████████████████████████████| 890kB 55.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 317kB 9.2MB/s 
[K     |████████████████████████████████| 235kB 15.2MB/s 
[K     |████████████████████████████████| 1.4MB 17.5MB/s 
[?25h  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 71kB 5.7MB/s 
[K     |████████████████████████████████| 133kB 12.3MB/s 
[K     |████████████████████████████████| 245kB 12.7MB/s 
[?25h  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/1e/a8/ed1601e6e94702ad691465bd1bead221dd2984f741bf384011b4dc59130e/faiss_

In [6]:
import numpy as np 
import pandas as pd
import re
from tqdm import tqdm 
import os
# import yake
from hazm import stopwords_list
from __future__ import unicode_literals
from hazm import *
import pickle
import requests
from termcolor import colored
from preprocessing import cleaning

import time

import hazm
import plotly.express as px
import plotly.graph_objects as go
from itertools import chain
# for the models
import tensorflow as tf
import matplotlib.pyplot as plt


# BERT base
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
# NOTE(review): repeats the `unicode_literals` import made earlier in this cell.
from __future__ import unicode_literals
# NOTE(review): this binds the name `FloatingPointError` to torch.nn.functional,
# shadowing the built-in exception of the same name for the rest of the
# notebook. It duplicates the `F` alias above and looks accidental - confirm
# and remove.
import torch.nn.functional as FloatingPointError

import faiss
# Albert
from transformers import AutoConfig, AutoTokenizer, AutoModel
from transformers import TFAlbertModel

# Model identifier passed to the `from_pretrained` calls in the "Model" section.
Albert_path = "m3hrdadfi/albert-fa-base-v2"


# Loading the dataset

In [3]:
# Mount Google Drive so the dataset files below are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Locations of the raw records, the per-record keywords and the cleaned titles.
data_address = '/content/drive/MyDrive/COVID-PSS.xls'
keys_address = '/content/drive/MyDrive/keywords_final_distilled_NE (1).pickle'
cleaned_titles_address = '/content/drive/MyDrive/title_cleaned_without_corona_2.pkl'


# NOTE(review): the file name ends in .xls but it is parsed with read_csv -
# confirm the file really is delimited text.
df = pd.read_csv(data_address)
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
list_t = pd.read_pickle(cleaned_titles_address)

# The keywords pickle is a mapping; only its values are kept, in iteration order.
keywords = pd.read_pickle(keys_address)
keywords = [v for k,v in keywords.items()]



# One keyword entry per dataframe row is required before attaching the column.
assert len(keywords) == len(df)
df['keywords'] = keywords
# In-place drop: re-running this line without re-reading `df` raises KeyError
# because the columns are already gone.
df.drop(columns=['img', 'link'], inplace=True)


Mounted at /content/drive


In [4]:
# Build the search documents: for every position in `list_t`, the title is
# followed (after a single space) by that record's keywords joined with '[SEP]'.
corpora = [
    ' '.join([list_t[pos], '[SEP]'.join(keywords[pos])])
    for pos in range(len(list_t))
]

# Helpers

In [25]:
#-------------------getting some description----------#

def tokenized_info(corpora, tokenizer, config):
    """
    Print token-count statistics for every record of a corpus.

    Parameters
    ----------
    corpora : sequence of str
        The documents to measure.
    tokenizer : object
        Must provide ``tokenize(text)`` returning a sequence of tokens.
    config : object
        Must provide ``max_position_embeddings``; records with more tokens
        than this are counted as too long.

    Returns
    -------
    None
        Everything is reported through ``print``. For an empty corpus only the
        record count and a short notice are printed.
    """

    print(f'Total number of records: {len(corpora)}')

    # max()/min() raise ValueError on an empty sequence and np.mean gives NaN,
    # so there is nothing meaningful to report for an empty corpus.
    if len(corpora) == 0:
        print('The corpus is empty; no token statistics to report.')
        return

    token_counts = [len(tokenizer.tokenize(text)) for text in corpora]
    longest = max(token_counts)
    shortest = min(token_counts)
    # The average is rounded up to a whole number of tokens.
    average = np.ceil(np.mean(token_counts))

    print(colored('The maximum length: ', 'red'), longest)
    print(colored('The minimum length: ', 'green'), shortest)
    print(colored('The average length: ', 'white'), average)

    allowed_len = config.max_position_embeddings
    not_allowed = sum(1 for count in token_counts if count > allowed_len)
    print('In total ', colored(not_allowed, 'blue'), f' records\nare longer than the max_len_seq wich is {allowed_len}')


    # --------------------- The Encoder------------------#

def create_input_batches(corpora, tokenizer, 
                         batch_size=128, max_len=512):
    """
    Tokenize the corpus in consecutive slices of ``batch_size`` documents.

    Every document is truncated / padded to exactly ``max_len`` tokens and
    special tokens are added.

    Parameters
    ----------
    corpora : sequence of str
        Documents to encode; sliced as ``corpora[start:start + batch_size]``.
    tokenizer : object
        Must provide ``batch_encode_plus`` (a Hugging Face tokenizer).
    batch_size : int
        Number of documents per batch.
    max_len : int
        Fixed sequence length after truncation and padding.

    Returns
    -------
    dict
        Maps ``'Batch_0'``, ``'Batch_1'``, ... to the tokenizer output of the
        corresponding slice. The semantic search only uses the input ids and
        attention masks from it.
    """

    all_inputs = {}

    for start in tqdm(range(0, len(corpora), batch_size),
                      position=0, leave=True):

        # `padding='max_length'` already fixes the padding strategy; the
        # deprecated `pad_to_max_length=True` flag was redundant next to it
        # and is therefore no longer passed.
        tokens = tokenizer.batch_encode_plus(
            corpora[start:start + batch_size],
            padding='max_length',
            truncation=True,
            max_length=max_len,
            add_special_tokens=True,
            )

        # `start` is always a multiple of batch_size, so integer division
        # gives the batch number without a float round trip.
        all_inputs['Batch_' + str(start // batch_size)] = tokens
    print('\nTotal number of batches: ', len(all_inputs))
    return all_inputs


# ------------------------ Mean pooling on GPU--------------

# Choose the compute device once for the helpers below: CUDA when it is
# available, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('\nworking on', device)
    
def get_embeddings(all_inputs, model):
    """
    Compute one mean-pooled vector per sentence for all tokenized batches.

    For each batch the input ids and attention masks are turned into tensors,
    moved to ``device`` and fed to the model; the first entry of the model's
    hidden-state tuple (``outputs[2][0]``) is then averaged over the token
    axis.

    Parameters
    ----------
    all_inputs : dict
        Maps ``'Batch_<k>'`` for k = 0 .. len(all_inputs) - 1 to an object
        exposing ``input_ids`` and ``attention_mask`` (the output of
        ``create_input_batches``).
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)``; ``outputs[2]`` must be
        the tuple of hidden states (the notebook loads the model with
        ``output_hidden_states=True``). The model is moved to ``device``.

    Returns
    -------
    list of torch.Tensor
        A flat list with one 1-D feature vector per sentence, in batch order.
        The tensors stay on ``device``.
    """

    model_gpu = model.to(device)
    sentence_means = []

    for batch_no in tqdm(range(len(all_inputs)), leave = True, position = 0):
        batch = all_inputs['Batch_' + str(batch_no)]
        input_ids_d = torch.tensor(batch.input_ids).to(device)
        masks_d = torch.tensor(batch.attention_mask).to(device)

        with torch.no_grad():
            outputs = model_gpu(input_ids_d, masks_d)
            # outputs[2] holds all hidden states; index 0 is the first of them.
            hidden_states = outputs[2][0]
            # One batched reduction over the token axis replaces the former
            # per-sentence Python loop of torch.mean calls.
            # NOTE(review): the average runs over every position, padding
            # included (inputs are padded to max_len). get_query_embeddings
            # pools the same way, so both must be changed together if
            # mask-aware pooling is wanted.
            batch_means = hidden_states.mean(dim=1)

        # unbind(0) yields one vector per sentence, keeping the flat-list contract.
        sentence_means.extend(batch_means.unbind(0))

    print('\nTotal number of sentences: ', len(sentence_means))
    return sentence_means


# --------------------------- query 🤔--------------------#


def get_query_embeddings(question, tokenizer,
                         model, max_len=512):
    """
    Embed a single query as the token-wise mean of the model's first hidden state.

    Parameters
    ----------
    question : str
        The query text.
    tokenizer : object
        Must provide ``encode_plus`` returning an object with ``input_ids``
        and ``attention_mask``.
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask)``; ``outputs[2]`` must be
        the tuple of hidden states.
    max_len : int
        Fixed sequence length the query is truncated / padded to.

    Returns
    -------
    torch.Tensor
        1-D feature vector, located on the device the model runs on.
    """

    # `padding='max_length'` already selects the padding strategy; the
    # deprecated, redundant `pad_to_max_length=True` flag is no longer passed.
    tokenized = tokenizer.encode_plus(
        question,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
        )
    tokens_tensor = torch.tensor([tokenized.input_ids])
    attention_masks = torch.tensor([tokenized.attention_mask])

    # Send the inputs to wherever the model's weights live. Previously they
    # were sent to the global `device` while the model itself was never moved
    # here, which only worked if get_embeddings had already moved the model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        tokens_tensor = tokens_tensor.to(first_param.device)
        attention_masks = attention_masks.to(first_param.device)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_masks)
        # outputs[2] holds all hidden states; index 0 is the first of them.
        hidden_states = outputs[2][0]

    # Batch of one: take its only element and average over the token axis
    # (padding positions included, matching get_embeddings).
    return torch.mean(hidden_states[0], 0)



#--------------------------FAISS cosine sim for one example--------------------#

def get_FAISS_cosine_results(sent_embs, question, tokenizer,
                     model, max_len=512, top_n = 10,
                      print_results=True ):
    """
    Rank the corpus against one question by cosine similarity using FAISS.

    Corpus and query vectors are L2-normalised and searched with an exact
    inner-product index, which makes the score the cosine similarity.

    Parameters
    ----------
    sent_embs : sequence of torch.Tensor
        One embedding per corpus document (output of ``get_embeddings``).
    question : str
        The query text; embedded with ``get_query_embeddings``.
    tokenizer, model
        Passed through to ``get_query_embeddings``.
    max_len : int
        Sequence length used when embedding the question.
    top_n : int
        Number of neighbours to retrieve.
    print_results : bool
        When true, each hit is printed together with the matching entry of the
        notebook-level ``corpora`` list.

    Returns
    -------
    list
        Corpus positions of the hits, best first. Returned regardless of
        ``print_results`` (previously the list stayed empty unless printing
        was enabled).
    """

    # Honour the caller's max_len (it used to be overridden by a literal 512).
    query_embeddings = get_query_embeddings(question, tokenizer,
                                            model, max_len=max_len)

    # FAISS expects float32 matrices.
    emb_cor = np.array([emb.cpu().numpy() for emb in sent_embs], dtype=np.float32)
    faiss.normalize_L2(emb_cor)

    # Take the dimensionality from the data instead of hard-coding 768.
    index = faiss.index_factory(emb_cor.shape[1], "Flat", faiss.METRIC_INNER_PRODUCT)
    index.add(emb_cor)

    emb_que = np.array([query_embeddings.cpu().numpy()], dtype=np.float32)
    faiss.normalize_L2(emb_que)

    scores, ids = index.search(emb_que, top_n)

    matches = []
    if print_results:
        print('Top results!')

    for rank, (score, idx) in enumerate(zip(scores[0], ids[0])):
        # FAISS fills missing neighbours with id -1 when top_n exceeds the
        # number of indexed vectors; those are not real hits.
        if idx < 0:
            continue
        if print_results:
            print(colored(f'{rank}: {idx}th', 'blue'),' corpus with score', colored( f'{score:.5f}:\n', 'blue'),  corpora[idx])
        matches.append(idx)

    return matches



#--------------------------FAISS L2 distance for one example--------------------#

def get_FAISS_L2_results(sent_embs, question, tokenizer,
                     model, max_len=512, top_n = 10,
                      print_results=True ):
    """
    Rank the corpus against one question by L2 distance using FAISS.

    The previous implementation built an inner-product index
    (``IndexFlatIP``) on un-normalised vectors, which is neither L2 nor
    cosine; an ``IndexFlatL2`` is used now so the function does what its name
    says. Scores are the distances reported by FAISS for that index (smaller
    means closer).

    Parameters
    ----------
    sent_embs : sequence of torch.Tensor
        One embedding per corpus document (output of ``get_embeddings``).
    question : str
        The query text; embedded with ``get_query_embeddings``.
    tokenizer, model
        Passed through to ``get_query_embeddings``.
    max_len : int
        Sequence length used when embedding the question.
    top_n : int
        Number of neighbours to retrieve.
    print_results : bool
        When true, each hit is printed together with the matching entry of the
        notebook-level ``corpora`` list.

    Returns
    -------
    list
        Corpus positions of the hits, nearest first. Returned regardless of
        ``print_results`` (previously the list stayed empty unless printing
        was enabled).

    Side effects
    ------------
    Writes the index to the file ``corona_corpora`` in the working directory.
    """

    # Honour the caller's max_len (it used to be overridden by a literal 512).
    query_embeddings = get_query_embeddings(question, tokenizer,
                                            model, max_len=max_len)

    # FAISS expects float32 matrices.
    emb_cor = np.array([emb.cpu().numpy() for emb in sent_embs], dtype=np.float32)
    emb_que = np.array([query_embeddings.cpu().numpy()], dtype=np.float32)

    # Dimensionality comes from the data instead of a hard-coded 768.
    index = faiss.IndexIDMap(faiss.IndexFlatL2(emb_cor.shape[1]))

    # Explicit int64 ids: the default integer dtype of np.array(range(...)) is
    # platform dependent, while FAISS ids are 64-bit.
    index.add_with_ids(emb_cor, np.arange(len(emb_cor), dtype=np.int64))

    # Persist the index and load it back, as before.
    faiss.write_index(index, 'corona_corpora')
    index = faiss.read_index('corona_corpora')

    scores, ids = index.search(emb_que, top_n)

    matches = []
    for rank, (score, idx) in enumerate(zip(scores[0], ids[0])):
        # FAISS fills missing neighbours with id -1 when top_n exceeds the
        # number of indexed vectors; those are not real hits.
        if idx < 0:
            continue
        if print_results:
            print(colored(f'{rank}: {idx}th', 'blue'),' corpus with score', colored( f'{score:.5f}:\n', 'blue'),  corpora[idx])
        matches.append(idx)

    return matches


    
# phrases that need to be removed from titles
# Each entry is used as a regular-expression pattern by `clean`, which applies
# them one after another in list order and replaces every match with a space.
# The final r'(\s+)' entry therefore collapses whitespace runs left behind.
# NOTE(review): because 'ویروس' is applied before 'ویروس جدید', the two-word
# phrase can never match any more - move longer phrases first if the whole
# phrase is meant to be removed (and re-clean the stored titles accordingly).
corona_phrases = ['کرونایی', 'کروناست' ,'کرونا', 'شیوع', 'بحران', 'ویروس',
                  'ویروس جدید', 'coronavirus', 'corona', 'کووید-19 ', 
                  'کووید', 'بیماری', 'بیمارانی', 'بیماران', '-۱۹', ' وی ', '19', '۱۹',
                  ' بیمار ', 'كرونا', 'كوويد', 'ويروس', r'(\s+)',]


# Shared hazm normalizer instance used by `clean`.
normalizer = hazm.Normalizer()

def clean(text):
    """
    Clean a title for the semantic models.

    Steps, in order:
      1. every pattern in ``corona_phrases`` is replaced by a single space;
      2. a single word character surrounded by spaces is replaced by a space;
      3. the text is passed through the hazm ``normalizer``.

    Parameters
    ----------
    text : str
        The raw title or question.

    Returns
    -------
    str
        The cleaned, normalised text.
    """
    for pattern in corona_phrases:
        text = re.sub(pattern, " ", text)

    # Raw string: in a plain literal '\w' is an invalid escape sequence, which
    # Python reports as a Deprecation/SyntaxWarning. The regex is unchanged.
    text = re.sub(r' +[\w] +', " ", text)
    text = normalizer.normalize(text)

    return text


#---------------------------------- get the batch results for this model-----------------------#

def get_resutls(questions, tokenizer, model, top_n, faiss_index=None, max_len=512):
    """
    Run the semantic search for a batch of questions.

    Each question is printed, cleaned with ``clean``, embedded with
    ``get_query_embeddings``, L2-normalised and searched in a FAISS index.

    Parameters
    ----------
    questions : iterable of str
        The raw questions.
    tokenizer, model
        Passed through to ``get_query_embeddings``.
    top_n : int
        Number of neighbours to retrieve per question.
    faiss_index : optional
        Index to search (anything with FAISS's ``search(x, k)`` method). When
        omitted, the notebook-level variable ``index`` is used, which keeps
        existing calls working; passing it explicitly avoids depending on a
        global that is only defined in a later cell.
    max_len : int
        Sequence length used when embedding each question.

    Returns
    -------
    list of dict
        One dict per question with the keys ``'question'`` (the original,
        uncleaned text), ``'index'`` (ids returned by the search) and
        ``'score'`` (the matching scores), in the order returned by FAISS.
    """

    # Fall back to the global `index` for callers that do not pass one.
    search_index = index if faiss_index is None else faiss_index

    results = []
    for i, question in enumerate(questions):

        print(colored(f'{i}: ', 'blue'), question)

        # we give the cleaned question to the semantic model
        question_cleaned = clean(question)

        question_emb = get_query_embeddings(question_cleaned, tokenizer,
                                            model, max_len=max_len)

        emb_que = np.array([question_emb.cpu().numpy()])
        faiss.normalize_L2(emb_que)

        scores, ids = search_index.search(emb_que, top_n)

        # saving all the results in a dictionary; only one query row was
        # searched, so row 0 holds its hits.
        results.append({'question':question,
                        'index':list(ids[0]),
                        'score':list(scores[0])})
    return results


working on cuda


# Model

In [8]:
# Load the configuration, tokenizer and weights of the checkpoint named by
# `Albert_path`. `output_hidden_states = True` is required by the embedding
# helpers, which read the hidden-state tuple via `outputs[2]`.
albert_config = AutoConfig.from_pretrained(Albert_path)
albert_tokenizer = AutoTokenizer.from_pretrained(Albert_path)
albert_model = AutoModel.from_pretrained(Albert_path, 
                                 output_hidden_states = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1882978.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=73062448.0, style=ProgressStyle(descrip…




In [9]:
# Switch the model to evaluation mode before computing embeddings.
albert_model.eval()
print('Model is set on the evaluation mode.')

Model is set on the evaluation mode.


In [10]:
# Report token-length statistics of the corpus and how many records exceed the
# model's `max_position_embeddings`.
tokenized_info(corpora, albert_tokenizer, albert_config)

Total number of records: 3536
[31mThe maximum length: [0m 1888
[32mThe minimum length: [0m 11
[37mThe average length: [0m 135.0
In total  [34m98[0m  records
are longer than the max_len_seq wich is 512


In [11]:
# Tokenize the whole corpus in batches of 128 documents, each truncated/padded
# to 512 tokens.
all_inputs_albert = create_input_batches(corpora, albert_tokenizer, 
                         batch_size=128, max_len=512)

100%|██████████| 28/28 [00:01<00:00, 16.41it/s]


Total number of batches:  28





In [12]:
# Embed every document; this also moves `albert_model` to `device`
# (get_embeddings calls `model.to(device)`).
mean_of_all_sentences_albert = get_embeddings(all_inputs_albert, model=albert_model)

100%|██████████| 28/28 [02:34<00:00,  5.53s/it]


Total number of sentences:  3536





# Single Question

In [None]:
# Example query, searched against the corpus embeddings by cosine similarity;
# the 50 best hits are printed and their corpus positions returned.
question = 'واکسن تا چه حد موثر است'

albert_indices = get_FAISS_cosine_results(sent_embs = mean_of_all_sentences_albert,
                         question= question,
                         tokenizer=albert_tokenizer,
                         model=albert_model,
                         max_len=512,
                         top_n = 50,
                         print_results=True )

# In Batches

In [26]:
# Load the sample queries.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
questions = pd.read_pickle('/content/drive/MyDrive/CoPer paper-Models/Sample Queries/Titles_with_Corona.pkl')


# Build the inner-product index over the L2-normalised corpus embeddings.
# `index` must exist as a notebook-level variable before get_resutls is called,
# because that function reads it as a global.
index = faiss.index_factory(768, "Flat", faiss.METRIC_INNER_PRODUCT)
emb_cor = np.array([i.cpu().numpy() for i in mean_of_all_sentences_albert])
faiss.normalize_L2(emb_cor)
index.add(emb_cor)


# Retrieve the 50 best hits for every sample query.
results = get_resutls(questions, tokenizer=albert_tokenizer, model=albert_model, top_n=50)

[34m0: [0m دردسر وکلا با کرونابحران کرونا دادگاه‌‌ها را تعطیل می‌ کند
[34m1: [0m ٢٢٨٢ ابتلا ۵۷ فوتی جدید کرونا در کشور خوزستان همچنان در وضعیت قرمز
[34m2: [0m رفتار عجیب ویروس کرونا از طریق افرادی که علائم ندارند منتقل می‌شود
[34m3: [0m بهبودیافتگان کرونا ۲۸ روز بعد از بهبودی پلاسمای خود را اهدا کنند
[34m4: [0m ادامه روند کاهشی فوتی‌های کرونااز سفر به چهار استان خودداری کنید
[34m5: [0m عملکرد برنامه کشوری غربالگری بیماری کم کاری تیروئید در نوزادان در پاندمی کرونا
[34m6: [0m تداوم تعطیلی صنوف پرریسک در پایتخت پلمب قهوه‌خانه به‌خاطر بی‌توجهی به کرونا
[34m7: [0m روند صعودی کرونا در استان سطح بالای ابتلا مرگ در کل کشور
[34m8: [0m در ایران مطلقا از کیت‌های صادر شده به آلمان برای تشخیص کرونا استفاده نمی‌کنیم
[34m9: [0m ۳۲۱ فوتی جدید کرونا در کشورشمار قربانیان از مرز ۵۰هزار تن گذشت
[34m10: [0m وضعیت هشدار کرونا در کرمانشاه ماسک بزنید بی‌خیال سفر شوید
[34m11: [0m پیشرفت در تولید واکسن کرونا در آکسفورد ۱۰ هزار نفر دیگر نیز واکسینه می‌شوند
[34m12: [0m ابتلای نفر از بیما

# Get the results

In [23]:
# Persist the batch search results to Drive.
# NOTE(review): this cell's recorded execution count (23) is lower than that of
# the cell computing `results` (26), so the saved file may predate the latest
# results - re-run this cell after the batch search.
with open('/content/drive/MyDrive/CoPer paper-Models/Results/AlBert.pkl', 'wb') as f:
    pickle.dump(results, f)