## Read Dataset CSV

In [1]:
import os
from tqdm import tqdm, trange
import pandas as pd
import numpy as np

In [2]:
# Shared-drive folder that holds the sentence-level CSV files.
case_sentence_csv_folder = '/content/drive/Shareddrives/SigmaLaw-WPP/criminal_sentence_dataset/'
# File names carry a case count running from 1000 to 12000 in steps of 1000.
csv_file_list = [
  f'sentence_dataset_{case_count}_cases.csv'
  for case_count in range(1000, 12001, 1000)
]

In [23]:
# Index into csv_file_list of the CSV to process; 11 selects the last entry.
findex = 11
df = pd.read_csv(
  os.path.join(case_sentence_csv_folder, csv_file_list[findex])
)

In [4]:
# Quick sanity check of the loaded frame.
print("columns:", df.columns)
# Each row of df is one sentence (see the 'sentence' column), so len(df) is the
# sentence count; the number of cases is the number of distinct 'case_file' values.
print("total no. of sentences:", len(df))

columns: Index(['Unnamed: 0', 'case_file', 'sentence', 'token_count'], dtype='object')
total no. of cases: 125490


## Create and Save Sentence Embeddings 

### Load Universal Sentence Encoder model from TF Hub

In [None]:
import tensorflow_hub as hub

In [None]:
# TF Hub handle of the Universal Sentence Encoder (large, version 5) module.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
# Short label for this encoder; `model_name` is reassigned later for the SBERT model.
model_name = 'uni_sent_encoder'

In [None]:
# Load the encoder identified by module_url through tensorflow_hub and report it.
model = hub.load(module_url)
print(f"module {module_url} loaded")

module https://tfhub.dev/google/universal-sentence-encoder-large/5 loaded


In [None]:
def get_embed(input):
  """Embed sentences with the globally loaded `model`.

  input: list of string sentences
  returns: the model output converted with `.numpy()`
  """
  encoded = model(input)
  return encoded.numpy()

In [None]:
# Smoke test: embed a single sentence so the output shape and dtype can be inspected.
sample = ["I am a sentence for which I would like to get its embedding."]
sample_embed = get_embed(sample)

In [None]:
# Report the shape and element type of the sample embedding.
print(f"shape: {sample_embed.shape}")
print(f"dtype: {sample_embed.dtype}")

shape: (1, 512)
dtype: float32


### SBERT (Sentence-Transformers)

In [5]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 31.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 33.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 33.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
# Switch to an SBERT model. This rebinds `model_name` and `model`, replacing the
# Universal Sentence Encoder objects assigned earlier in the notebook.
model_name = 'sentence-transformers/paraphrase-distilroberta-base-v1'
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Smoke test: encode one sentence and show the shape of the resulting embedding.
vec = model.encode("London is known for its finacial district")
vec.shape

(768,)

In [9]:
def get_embed(sentences):
  """Encode `sentences` with the globally loaded SBERT `model`.

  Redefines the earlier Universal Sentence Encoder `get_embed` from this notebook.

  sentences: passed straight through to `model.encode`
  returns: whatever `model.encode` returns with `convert_to_tensor=False`
  """
  embeddings = model.encode(sentences, convert_to_tensor=False)
  return embeddings

### Generate embeddings

In [None]:
from ast import literal_eval
import numpy as np

In [None]:
# Width of one sentence embedding in the padded array built below. 512 matches the
# Universal Sentence Encoder output shown earlier.
# NOTE(review): the SBERT model loaded above yields 768-d vectors — confirm which
# encoder this section is meant to run with.
embed_dim = 512
# Number of sentences kept per case: longer cases are truncated, shorter ones zero-padded.
seq_len = 150

In [None]:
# Accumulator for the embedded cases, shaped (n_cases, seq_len, embed_dim); starts with zero cases.
docs = np.empty((0, seq_len, embed_dim), dtype=np.float32)
# One integer label per embedded case, filled by the embedding loop.
labels = []
# Half-open range [start_ind, end_ind) of case indices to embed.
start_ind = 0
end_ind = 10

In [None]:
# Embed the first `seq_len` sentences of every case in [start_ind, end_ind) and
# zero-pad each case to exactly `seq_len` rows, so all cases stack into one
# (n_cases, seq_len, embed_dim) array.
# NOTE(review): `cases_df` is not defined anywhere in this notebook; it has to exist
# in the kernel already, presumably with a 'sentences' column holding a stringified
# list and a 'Y' label column — confirm.
new_docs = []
new_labels = []
for i in range(start_ind, end_ind):
  sent_list = literal_eval(cases_df['sentences'][i])[:seq_len]
  # Start from zeros so rows past the last real sentence are already padding.
  sents_np = np.zeros((seq_len, embed_dim), dtype=np.float32)
  if sent_list:
    # One batched call per case instead of one model call per sentence.
    sents_np[:len(sent_list)] = get_embed(sent_list)
  new_docs.append(sents_np)
  new_labels.append(int(cases_df['Y'][i]))

# Grow `docs` and `labels` once at the end: np.append inside the loop re-copies the
# whole accumulated array on every iteration.
if new_docs:
  docs = np.concatenate([docs, np.stack(new_docs, axis=0)], axis=0)
  labels.extend(new_labels)

In [None]:
# Report the stacked array's shape and its in-memory size in MiB.
print(f"docs shape: {docs.shape}")
print(f"docs Size: {docs.nbytes/(1024*1024)} MB")

docs shape: (10, 150, 512)
docs Size: 2.9296875 MB


In [None]:
# Convert the collected labels to an int32 array and show its shape.
labels_np = np.array(labels, dtype=np.int32)
labels_np.shape

(10,)

### Sentence vectors for new dataset

In [10]:
# Cases with fewer than min_sentences or more than max_sentences sentence rows are
# treated as outliers and skipped when the embeddings are saved.
min_sentences = 20
max_sentences = 250

In [24]:
# Local output folder for this CSV's per-case .npy files (/content/case_12000 for findex = 11).
sent_vecs_folder = f'/content/case_{(findex+1)*1000}'
# exist_ok lets the cell be re-run; `!mkdir` reports an error when the folder already exists.
os.makedirs(sent_vecs_folder, exist_ok=True)

In [25]:
# Group the sentence rows by the case file they belong to.
groups = df.groupby('case_file')

In [26]:
# Number of sentence rows in each case file.
groups.size()

case_file
case13508.txt    174
case13510.txt     19
case13511.txt     78
case13512.txt     47
case13513.txt    289
                ... 
case14715.txt     82
case14716.txt     32
case14717.txt    117
case14718.txt     78
case14719.txt    173
Length: 994, dtype: int64

In [27]:
# Sentence-row count of every case, in group iteration order.
rows = [case_rows.shape[0] for _, case_rows in groups]

print("Average number of sentences in a case :", sum(rows) / len(rows))

Average number of sentences in a case : 124.89839034205231


In [28]:
# Count the cases whose sentence count lies within [min_sentences, max_sentences].
i = sum(1 for r in rows if min_sentences <= r <= max_sentences)

print("non-outlier case count :", i)

non-outlier case count : 864


In [29]:
# Total number of distinct case files in this CSV.
groups.ngroups

994

In [30]:
# Encode the sentences of every case and save them as one array per case, with one
# row per sentence. Cases with fewer than min_sentences or more than max_sentences
# rows are skipped.
for grp_name, group in tqdm(groups, total=groups.ngroups):
  if group.shape[0] < min_sentences or group.shape[0] > max_sentences: continue
  # One batched encode call per case instead of one call per row via iterrows();
  # the returned rows follow the order of the input sentences.
  sents_np = get_embed(group['sentence'].tolist())
  # splitext strips only the final extension, so a case name containing extra dots
  # is not truncated at its first dot.
  case_file = f"{os.path.splitext(grp_name)[0]}.npy"
  fpath = os.path.join(sent_vecs_folder, case_file)
  with open(fpath, 'wb') as f:
    np.save(f, sents_np)

100%|██████████| 994/994 [15:29<00:00,  1.07it/s]


In [31]:
# Path of the last file written by the saving loop.
fpath

'/content/case_12000/case14719.npy'

In [32]:
# Read the last saved case back in to verify the stored shape and dtype.
with open(fpath, 'rb') as f:
  vecs = np.load(f)
print(vecs.shape, vecs.dtype, sep="\n")

(173, 768)
float32


In [33]:
# Number of entries in the output folder; compare with the non-outlier case count.
len(os.listdir(sent_vecs_folder))

864

In [21]:
# Drive folder that receives the per-case embedding folders for this SBERT model.
drive_path = '/content/drive/Shareddrives/SigmaLaw-WPP/criminal_sentence_dataset/sentence_embeddings/paraphrase-distilroberta-base-v1'

In [None]:
# Create the Drive destination (including parents) when it is missing. exist_ok
# replaces the separate exists() check, so there is no gap between checking and
# creating, and the cell is safe to re-run.
os.makedirs(drive_path, exist_ok=True)

In [34]:
# Copy the local folder of per-case .npy files to the Drive destination.
!cp -r {sent_vecs_folder} {drive_path}

### Save npz file

In [None]:
# Label used in the .npz file name and in the Drive destination path.
model_type = "uni_sent_enc_512d"

In [None]:
# Local path of the archive, named after the model label and end_ind.
# NOTE(review): the recorded output of this cell shows a different name, so it
# appears to come from an earlier run with other settings — confirm.
npz_file = f"/content/{model_type}_{end_ind}.npz"
npz_file

'/content/roberta_large_11413.npz'

In [None]:
# Write a compressed archive with the padded embeddings under key 'x' and the labels under key 'y'.
np.savez_compressed(npz_file, x=docs, y=labels_np)

In [None]:
# Drive destination for the archive, keyed by the model label.
copy_path = f"/content/drive/Shareddrives/SigmaLaw-WPP/embeddings/{model_type}"

In [None]:
# Copy the archive to Drive.
# NOTE(review): nothing in this notebook creates copy_path; if it is not already a
# directory, cp would presumably write a file with that name instead — confirm.
!cp {npz_file} {copy_path}