In [1]:
# install dependencies
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install torch transformers pandas tqdm

Note: you may need to restart the kernel to use updated packages.


In [4]:
# init variables
import os

# CUDA_LAUNCH_BLOCKING=1 is a CUDA debugging switch (kernel launches become
# synchronous, which makes GPU error traces point at the failing call).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Notebook-wide configuration constants.
PROCESSING_FILE_PATH = "resources/cases_2024.json"  # JSON file holding the cases
PROCESSING_CASE_INDEX = 0  # NOTE(review): not referenced by the cells shown here
MODELS = [
    "naver/splade-cocondenser-ensembledistil",
]

In [3]:
# load to dataframes
import json
import pandas as pd

# Read the whole JSON document, then parse it into Python objects.
with open(PROCESSING_FILE_PATH, mode='r', encoding='utf-8') as cases_file:
    cases_data = json.loads(cases_file.read())

# Wrap the parsed records in a DataFrame and show the first rows as a sanity check.
cases_df = pd.DataFrame(data=cases_data)
print(cases_df.head())

                                     id filename primaryLang  \
0  d66a6895-c339-4bd0-9992-790b7b5f4a17      cpa        0132   
1  4aaafdf5-8ac9-4086-b62e-485d250b02bb    court          of   
2  f81236b6-7c88-4337-9701-772651a56abe       ca        writ   
3  0fe6fe07-fd7d-4b4c-a5d3-3644e9c56b56      wrt        0201   
4  b655451f-cad0-4cc4-b5ce-6bc81dbbee30     writ         123   

                                                text  wordCount  
0  Page 1 of 11 \n In the cozy appeal of the demo...       2854  
1  CA/HCC 184/2017  \n \n1 | P a g e  \n  IN THE ...       4330  
2  Page 1 of 11 \n IN THE COURT OF APPEAL OF THE ...       3300  
3  Page 1 of 15 \n IN THE COURT OF APPEAL OF THE ...       4121  
4  1 \n IN THE COURT OF APPEAL OF THE DEMOCRATIC ...       3898  


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm import tqdm
import json

# Load SPLADE model
# The first entry of MODELS names the checkpoint; tokenizer and masked-LM
# weights are both loaded from that same identifier.
model_name = MODELS[0]
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() switches the module to evaluation mode and returns the module itself.
model = AutoModelForMaskedLM.from_pretrained(model_name).eval()

# SPLADE sparse vector generator
def splade_sparse_vector(text):
    """Build a SPLADE-style sparse term-weight representation of ``text``.

    Relies on the module-level ``tokenizer`` and ``model``. The text is
    tokenized with truncation enabled, the model logits are transformed with
    ``log(1 + relu(x))`` and max-pooled over the leading axis that remains
    after ``squeeze(0)`` (the sequence axis for a single input). Only
    vocabulary entries whose pooled weight is non-zero are kept.

    Args:
        text: Input handed unchanged to ``tokenizer``.

    Returns:
        dict with two parallel lists:
          * ``"indices"`` - token strings produced by
            ``tokenizer.convert_ids_to_tokens`` for the active vocabulary ids.
          * ``"values"``  - the matching pooled weights as Python floats.

    NOTE(review): ``"indices"`` carries token strings, not integer vocabulary
    ids - confirm that whatever consumes this structure expects strings.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**encoded).logits.squeeze(0)  # [seq_len, vocab_size]

    term_weights = (torch.relu(logits) + 1).log()
    # Keep the strongest activation of every vocabulary entry.
    pooled = term_weights.max(dim=0).values
    active_ids = pooled.nonzero().squeeze(1)

    return {
        "indices": tokenizer.convert_ids_to_tokens(active_ids.tolist()),
        "values": pooled[active_ids].tolist(),
    }

# Process all texts
# Iterates over `cases_df` (the DataFrame built from the JSON file); the bare
# name `df` is not defined anywhere in this notebook and raised NameError on a
# fresh kernel.
# NOTE(review): the original bound was `len(...) - 520`, i.e. the last 520 rows
# are left out. The reason is not documented - presumably to shorten the run;
# confirm and move this value to the config cell if it should stay.
SKIPPED_TAIL_ROWS = 520
n_cases = max(len(cases_df) - SKIPPED_TAIL_ROWS, 0)  # clamp: never negative
# Positional slice, so the loop does not depend on the index labels being 0..n-1.
cases_subset = cases_df.iloc[:n_cases]

sparse_vectors = []
for case_id, case_text in tqdm(
    zip(cases_subset['id'], cases_subset['text']), total=n_cases
):
    sparse_vectors.append({
        "id": case_id,
        "sparse_values": splade_sparse_vector(case_text),
        "metadata": {"text": case_text}
    })

# Print one example (slice instead of [0] so an empty result prints "[]"
# rather than raising IndexError).
print(json.dumps(sparse_vectors[:1], indent=2))

tokenizer_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

KeyboardInterrupt: 