# Pack Search Models

Pack the neural network (NN) model, tokenizer, index, and passages into a single archive and upload it.

In [1]:
# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas: cap how many columns are shown when a frame is displayed
pd.options.display.max_columns = 30

# Display all cell outputs: every expression statement in a cell is echoed,
# not only the last one, so bare call results further down appear as output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Plotly / cufflinks notebook setup.
# NOTE(review): none of the cells visible in this notebook use plotly,
# cufflinks or pandas — confirm whether this setup is still needed here.
from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [17]:
# Everything that ends up in the archive lives under one folder; each
# artifact gets its own fixed path inside it.
ARCHIVE_FOLDER = './search_data'
PASSAGES_FILENAME = ARCHIVE_FOLDER + '/passages'
INDEX_FILENAME = ARCHIVE_FOLDER + '/index'
TOKENIZER_FILENAME = ARCHIVE_FOLDER + '/tokenizer'
MODEL_FILENAME = ARCHIVE_FOLDER + '/model'

## Save search passages

In [2]:
from datasets import load_dataset

# Load the 'wiki40b_en_100_0' configuration of wiki_snippets (cached under
# ./datasets) and keep only its 'train' split.
wiki40b_passages = load_dataset(
    'wiki_snippets',
    name='wiki40b_en_100_0',
    cache_dir='./datasets',
)['train']

Reusing dataset wiki_snippets (./datasets/wiki_snippets/wiki40b_en_100_0/1.0.0/d152a0e6a420c02b9b26e7f75f45fb54c818cae1d83e8f164f0b1a13ac7998ae)


In [3]:
# Write the passages dataset into the archive folder so it is packed with the rest.
wiki40b_passages.save_to_disk(PASSAGES_FILENAME)

### Check passages

In [4]:
from datasets import Dataset

# Reload the copy that was just written to confirm it can be read back from disk.
passages = Dataset.load_from_disk(PASSAGES_FILENAME)

# Display the first record as a quick sanity check of its fields.
passages[0]

{'_id': '{"datasets_id": 0, "wiki_id": "Q1294448", "sp": 2, "sc": 0, "ep": 6, "ec": 610}',
 'datasets_id': 0,
 'wiki_id': 'Q1294448',
 'start_paragraph': 2,
 'start_character': 0,
 'end_paragraph': 6,
 'end_character': 610,
 'article_title': 'Ági Szalóki',
 'section_title': 'Life',
 'passage_text': "Ági Szalóki Life She started singing as a toddler, considering Márta Sebestyén a role model. Her musical background is traditional folk music; she first won recognition for singing with Ökrös in a traditional folk style, and Besh o droM, a Balkan gypsy brass band. With these ensembles she toured around the world from the Montreal Jazz Festival, through Glastonbury Festival to the Théatre de la Ville in Paris, from New York to Beijing.\nSince 2005, she began to pursue her solo career and explore various genres, such as jazz, thirties ballads, or children's songs.\nUntil now, three of her six released albums"}

## Save index

In [5]:
from shutil import copyfile
# Copy the precomputed index file into the archive folder under its packed name.
# NOTE(review): the source .dat file is expected in the working directory and
# is not produced by any cell in this notebook — confirm where it comes from.
copyfile('./wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat', INDEX_FILENAME)

'./search_data/index'

### Check index

In [10]:
import os

import faiss
import numpy as np

# Width of each row in the index file; also the dimension of the FAISS index.
EMBEDDING_DIM = 128

# np.memmap raises when the file is too small for the requested shape but
# silently maps only a prefix when it is too large, so compare the exact
# byte size up front to catch an index that does not match the passages.
expected_index_bytes = (
    wiki40b_passages.num_rows * EMBEDDING_DIM * np.dtype('float32').itemsize
)
actual_index_bytes = os.path.getsize(INDEX_FILENAME)
assert actual_index_bytes == expected_index_bytes, (
    f'{INDEX_FILENAME} is {actual_index_bytes} bytes, '
    f'expected {expected_index_bytes} for {wiki40b_passages.num_rows} passages'
)

wiki40b_passage_reps = np.memmap(
    INDEX_FILENAME,
    dtype='float32', mode='r',
    shape=(wiki40b_passages.num_rows, EMBEDDING_DIM)
)

wiki40b_index_flat = faiss.IndexFlatIP(EMBEDDING_DIM)

wiki40b_index_flat.add(wiki40b_passage_reps)

# Every passage must have exactly one vector in the index.
assert wiki40b_index_flat.ntotal == wiki40b_passages.num_rows

## Save model and tokenizer

In [12]:
from transformers import AutoModel, AutoTokenizer

# The tokenizer and the model are both loaded from the same checkpoint id.
RETRIBERT_CHECKPOINT = 'yjernite/retribert-base-uncased'

qar_tokenizer = AutoTokenizer.from_pretrained(
    RETRIBERT_CHECKPOINT,
    cache_dir='./tokenaizers',
)
qar_model = AutoModel.from_pretrained(
    RETRIBERT_CHECKPOINT,
    cache_dir='./models',
)
# Bind the result to `_` so the call's return value is not echoed as cell output.
_ = qar_model.eval()

Some weights of RetriBertModel were not initialized from the model checkpoint at yjernite/retribert-base-uncased and are newly initialized: ['bert_query.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Persist the tokenizer and the model into the archive folder. The tokenizer
# call is a bare expression, so the file paths it returns are shown as output.
qar_tokenizer.save_pretrained(TOKENIZER_FILENAME)
qar_model.save_pretrained(MODEL_FILENAME)

('./search_data/tokenizer/tokenizer_config.json',
 './search_data/tokenizer/special_tokens_map.json',
 './search_data/tokenizer/vocab.txt',
 './search_data/tokenizer/added_tokens.json',
 './search_data/tokenizer/tokenizer.json')

### Check model and tokenizer

In [15]:
# Reload both artifacts from their saved local paths (rebinding the same
# names) to confirm the copies in the archive folder can be loaded.
# NOTE(review): use_fast=False is passed only for this reload, not for the
# initial load from the checkpoint — confirm that the difference is intended.
qar_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_FILENAME, use_fast=False)
qar_model = AutoModel.from_pretrained(MODEL_FILENAME)
# Bind the result to `_` so the call's return value is not echoed as cell output.
_ = qar_model.eval()

## Pack and upload

In [18]:
import os
import tarfile

def pack_folder(source_dir = '', file_name = ''):
    """Pack a folder into a gzip-compressed tar archive.

    Args:
        source_dir: Folder to archive. A trailing path separator is tolerated.
        file_name: Archive path without the ``.tar.gz`` suffix. Relative
            paths are resolved against the current working directory.

    Returns:
        Absolute path of the archive that was written.
    """
    archive_path = file_name + '.tar.gz'

    # basename('dir/') is '', which would store the entries under an empty
    # root; normalise first so the folder's own name is the archive root.
    root_name = os.path.basename(os.path.normpath(source_dir))

    with tarfile.open(archive_path, 'w:gz') as archive:
        archive.add(source_dir, arcname=root_name)

    # abspath leaves an already-absolute file_name intact instead of
    # prefixing it with the working directory a second time.
    return os.path.abspath(archive_path)

# Archive the whole search-data folder (passages, index, tokenizer, model);
# 'search_data' is a relative name, so the archive lands in the working directory.
model_filename = pack_folder(ARCHIVE_FOLDER, 'search_data')
print('model packed to', model_filename)

model packed to /work/search_data.tar.gz


In [20]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the AWS credentials used by the
# upload below — confirm, and keep the .env file out of version control.
load_dotenv()

True

In [22]:
import boto3


def upload_file(model_path='', s3_bucket='', s3_filename=''):
    """Upload a local file to an S3 bucket.

    Args:
        model_path: Path of the local file to send.
        s3_bucket: Name of the destination bucket.
        s3_filename: Object key to store the file under.

    Returns:
        Whatever the S3 client's ``upload_file`` call returns.
    """
    # A new session is created on every call rather than shared between calls.
    s3_client = boto3.session.Session().client('s3')
    upload_result = s3_client.upload_file(model_path, s3_bucket, s3_filename)
    return upload_result
    
# Push the packed archive to the 'asqa-search-models' bucket under the key 'DPR/en.tar.gz'.
upload_file(model_filename, 'asqa-search-models', 'DPR/en.tar.gz')