## Imports & Setup

In [27]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional

In [None]:
import re

def atoi(text):
  '''
  Convert `text` to an int when it consists only of decimal digits;
  otherwise return it unchanged.

  str.isdecimal() is used rather than str.isdigit() because isdigit() also
  accepts characters such as superscript digits ('²') that int() cannot parse,
  which would raise ValueError.
  '''
  return int(text) if text.isdecimal() else text

def natural_keys(text):
  '''
  Sort key for human ("natural") ordering: alist.sort(key=natural_keys).
  The text is split on runs of digits and every chunk is passed through atoi.
  http://nedbatchelder.com/blog/200712/human_sorting.html
  (See Toothy's implementation in the comments)
  '''
  chunks = re.split(r'(\d+)', text)
  return list(map(atoi, chunks))

## Load Dataset

In [None]:
""" .npz file containing keys: 
train_input: (num_docs, num_sentences, embed_dim)
train_labels: (num_docs)
val_input: (num_docs, num_sentences, embed_dim)
val_labels: (num_docs)
test_input: (num_docs, num_sentences, embed_dim)
test_labels: (num_docs)
"""
# Path of the .npz file described above. It is empty here, so it has to be
# filled in before the np.load(dataset_file) cell below can run.
dataset_file = ""

In [None]:
# Pull the three (input, labels) pairs out of the archive while it is open.
with np.load(dataset_file) as dataset:
  train_input, train_labels = dataset['train_input'], dataset['train_labels']
  val_input, val_labels = dataset['val_input'], dataset['val_labels']
  test_input, test_labels = dataset['test_input'], dataset['test_labels']

# Report the shape of each input split.
print("train_input shape: ", train_input.shape)
print("val_input shape: ", val_input.shape)
print("test_input shape: ", test_input.shape)

In [None]:
# Shard suffixes '1000' .. '8000'; each one names a file {embedding_model}_{tail}.npz.
file_name_tails = [str(count) for count in range(1000, 9000, 1000)]
embedding_model = 'roberta_large'
embeddings_path = "/content/drive/Shareddrives/SigmaLaw-WPP/embeddings/roberta_large_1024"

In [None]:
# Second and third dimensions of the (0, seq_len, embed_dim) train_input buffer
# created in the next cell.
seq_len = 150
embed_dim = 1024

In [None]:
# Zero-row starting buffers that the shard-loading loop extends along axis 0.
train_input = np.zeros(shape=(0, seq_len, embed_dim), dtype=np.float32)
train_labels = np.zeros(shape=(0,), dtype=np.int32)

In [None]:
# Collect every shard first and concatenate once: calling np.append inside the
# loop re-copies the whole accumulated array on each iteration.
input_parts = [train_input]
label_parts = [train_labels]
for tail in file_name_tails:
  fpath = f"{embeddings_path}/{embedding_model}_{tail}.npz"
  with np.load(fpath) as dataset:
    input_parts.append(dataset['x'])
    label_parts.append(dataset['y'])

train_input = np.concatenate(input_parts, axis=0)
train_labels = np.concatenate(label_parts, axis=0)
del input_parts, label_parts  # release the per-shard references

print("train_input shape: ", train_input.shape)
print("train_labels shape: ", train_labels.shape)

train_input shape:  (8000, 150, 1024)
train_labels shape:  (8000,)


In [None]:
# The '9000' shard of the same roberta_large embeddings; loaded as the validation set below.
val_data_file = "/content/drive/Shareddrives/SigmaLaw-WPP/embeddings/roberta_large_1024/roberta_large_9000.npz"

In [None]:
# Read the validation inputs ('x') and labels ('y') from the archive.
with np.load(val_data_file) as dataset:
  val_input, val_labels = dataset['x'], dataset['y']

print("val_input shape: ", val_input.shape)

val_input shape:  (1000, 150, 1024)


In [None]:
# Size of the training inputs in MiB.
train_input.nbytes / 1024**2

4687.5

In [None]:
# Wrap the (input, labels) array pairs as tf.data datasets.
train_dataset = tf.data.Dataset.from_tensor_slices((train_input, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_input, val_labels))

In [None]:
BATCH_SIZE = 8
SHUFFLE_BUFFER_SIZE = 100

# Training data is shuffled and then batched; validation data is only batched.
# NOTE(review): both lines rebind the same names, so running this cell twice
# batches the already-batched datasets again.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

## Load Case Decisions csv

In [2]:
# Location of the annotated case-decision CSV on the shared drive.
case_decisions_csv = '/content/drive/Shareddrives/SigmaLaw-WPP/LRR/web_scrape/criminal/decision_annotated_criminal_v1.csv'

In [3]:
# Read the decisions table and print its column / dtype summary.
decisions_df = pd.read_csv(case_decisions_csv)
decisions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15902 entries, 0 to 15901
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  15902 non-null  object
 1   decision    15902 non-null  object
 2   Y           15902 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 372.8+ KB


In [7]:
# Row positions whose file-name column ('Unnamed: 0') equals 'case2.txt'.
decisions_df.index[decisions_df['Unnamed: 0'].eq('case2.txt')].tolist()

[2]

## Load case sentence npy files

In [47]:
# Root folder of the per-case sentence embeddings; its sub-folders are visited in natural order.
embed_folder = '/content/drive/Shareddrives/SigmaLaw-WPP/criminal_sentence_dataset/sentence_embeddings/paraphrase-distilroberta-base-v1'
sub_folders = sorted(os.listdir(embed_folder), key=natural_keys)

In [48]:
# Number of sentence vectors kept per case: the loading loop below zero-pads
# shorter cases and truncates longer ones to this length.
seq_len = 150

In [49]:
def _pad_or_truncate(vecs, target_len):
  '''
  Return `vecs` with exactly `target_len` rows: float32 zero rows are appended
  when it has fewer rows, and rows past `target_len` are dropped otherwise.
  '''
  if vecs.shape[0] < target_len:
    padding = np.zeros((target_len - vecs.shape[0], vecs.shape[1]), dtype=np.float32)
    return np.concatenate((vecs, padding), axis=0)
  return vecs[:target_len, :]


# Build the file-name -> label lookup once instead of scanning the whole frame
# for every file. keep='first' mirrors the previous "first matching row" choice
# when a file name occurs more than once.
label_by_case = (
  decisions_df.drop_duplicates(subset='Unnamed: 0', keep='first')
  .set_index('Unnamed: 0')['Y']
  .to_dict()
)

petitioner_lose_cases = []
petitioner_win_cases = []
for folder in tqdm(sub_folders, total=len(sub_folders)):
  npy_files = os.listdir(os.path.join(embed_folder, folder))
  npy_files.sort(key=natural_keys)
  for fname in npy_files:
    # Swap only the extension; str.replace('npy', 'txt') would also rewrite
    # an 'npy' that appears inside the file stem.
    txt_name = os.path.splitext(fname)[0] + '.txt'
    if txt_name not in label_by_case:
      raise KeyError(f"no decision label found for {txt_name} ({os.path.join(folder, fname)})")
    decision_label = label_by_case[txt_name]

    if decision_label == -1:
      target_cases = petitioner_lose_cases
    elif decision_label == 1:
      target_cases = petitioner_win_cases
    else:
      # Any other label is not collected, so skip reading the embedding file.
      continue

    fpath = os.path.join(embed_folder, folder, fname)
    with open(fpath, 'rb') as f:
      vecs = np.load(f) # (sentence_count, embed_dim)
    target_cases.append(_pad_or_truncate(vecs, seq_len))

print("\npetitioner lose cases count :", len(petitioner_lose_cases))
print("petitioner win cases count :", len(petitioner_win_cases))

100%|██████████| 8/8 [00:47<00:00,  5.89s/it]


petitioner lose cases count 5156
petitioner win cases count 1780





## TF Dataset

In [61]:
# Fractions of the smaller class that go to training and validation.
train_set_ratio = 0.8
val_set_ratio = 0.1

BATCH_SIZE = 32

# Every split is sized from the smaller class so both classes contribute equally.
class_len = min(map(len, (petitioner_lose_cases, petitioner_win_cases)))
train_set_size_per_class, val_set_size_per_class = (
  int(ratio * class_len) for ratio in (train_set_ratio, val_set_ratio)
)

In [62]:
# Validation rows sit directly after the training rows inside each class list.
val_start = train_set_size_per_class
val_end = val_start + val_set_size_per_class

# In both splits the "lose" cases come first, followed by the "win" cases.
train_samples = [
  *petitioner_lose_cases[:train_set_size_per_class],
  *petitioner_win_cases[:train_set_size_per_class],
]
val_samples = [
  *petitioner_lose_cases[val_start:val_end],
  *petitioner_win_cases[val_start:val_end],
]

In [63]:
# Label 0 for the leading "lose" block and 1 for the trailing "win" block,
# matching the order in which the samples were concatenated.
_class_ids = np.array([0, 1], dtype=np.int32)

train_labels = np.repeat(_class_ids, train_set_size_per_class)
val_labels = np.repeat(_class_ids, val_set_size_per_class)

In [78]:
# Build (sample, label) datasets from the sample lists and the label vectors.
train_ds = tf.data.Dataset.from_tensor_slices((train_samples, train_labels))
val_ds = tf.data.Dataset.from_tensor_slices((val_samples, val_labels))

# The shuffle buffer is sized to the full training set (both classes) with a
# fixed seed; validation data is only batched.
train_ds = train_ds.shuffle(train_set_size_per_class*2, seed=356).batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)

In [81]:
# Peek at a single training batch to check tensor shapes. The loop variables
# remain defined afterwards; the next cell displays `labels`.
for samples, labels in train_ds.take(1):
  print('samples.shape :', samples.shape)
  print('labels.shape :', labels.shape)

samples.shape : (32, 150, 768)
labels.shape : (32,)


In [71]:
# Labels of the batch left over from the shape-check loop in the previous cell.
labels

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0], dtype=int32)>

## Train Model

In [92]:
# 'LSTM' or 'GRU'
CELL_TYPE = "LSTM"

output_dim = 256  # units of each recurrent layer
seq_len = 150  # time steps per sample (first entry of input_shape)
embed_dim = 768  # features per time step (second entry of input_shape)
dense1_d = 256  # only referenced by the commented-out Dense layer in the model cell
dense2_d = 32  # width of the Dense layer that follows the recurrent encoder
last_layer_d = 1  # 1 -> sigmoid output + binary cross-entropy, otherwise softmax
dropout_rate = 0.1  # rate of the Dropout layer after the Dense layer

EPOCHS = 5  # passed to model.fit

In [93]:
model = tf.keras.Sequential()

# Recurrent encoder: one LSTM layer, or two stacked bidirectional GRU layers.
if CELL_TYPE == "GRU":
  model.add(Bidirectional(GRU(output_dim, input_shape=(seq_len, embed_dim), return_sequences=True)))
  model.add(Bidirectional(GRU(output_dim)))
elif CELL_TYPE == "LSTM":
  # model.add(Bidirectional(LSTM(output_dim, input_shape=(seq_len, embed_dim), return_sequences=True)))
  # model.add(Bidirectional(LSTM(output_dim)))
  model.add(LSTM(output_dim, input_shape=(seq_len, embed_dim), dropout=0.2, recurrent_dropout=0.2))
else:
  raise ValueError("CELL_TYPE should be `LSTM` or `GRU`")

# Classification head: Dense -> Dropout -> output layer.
# model.add(Dense(dense1_d))
# model.add(Dropout(dropout_rate))
model.add(Dense(dense2_d))
model.add(Dropout(dropout_rate))

# A single output unit uses sigmoid; several output units use softmax.
output_activation = "sigmoid" if last_layer_d == 1 else "softmax"
model.add(Dense(last_layer_d, activation=output_activation))

In [94]:
# The loss follows the output layer: binary for one unit, sparse categorical otherwise.
loss_name = 'binary_crossentropy' if last_layer_d == 1 else 'sparse_categorical_crossentropy'
model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])

In [95]:
# Build the model for inputs of shape (batch, seq_len, embed_dim); None leaves the batch size open.
model.build((None, seq_len, embed_dim))

In [96]:
# Print the layer / parameter table of the model built above.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 256)               1049600   
                                                                 
 dense_2 (Dense)             (None, 32)                8224      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,057,857
Trainable params: 1,057,857
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Second summary cell. NOTE(review): its stored output reports a 512-unit LSTM,
# which does not match output_dim = 256 set in the config cell; it appears to
# come from an earlier run.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 512)               2623488   
                                                                 
 dense (Dense)               (None, 32)                16416     
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,639,937
Trainable params: 2,639,937
Non-trainable params: 0
_________________________________________________________________


In [97]:
ckpt_folder = '/content/checkpoints_lstm_256'
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which reports an error
# when the folder already exists (e.g. when the cell is re-run).
os.makedirs(ckpt_folder, exist_ok=True)

In [100]:
# Checkpoint name template inside ckpt_folder; the {epoch:04d} placeholder is
# left unformatted here and handed to the callback as-is.
checkpoint_filepath = ckpt_folder + '/ckpt-{epoch:04d}'

# Weights-only checkpoint callback that is handed to model.fit.
# NOTE(review): `monitor` presumably only has an effect together with
# save_best_only, which is not set here — confirm against the ModelCheckpoint docs.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    verbose=0
)

In [None]:
# Train for EPOCHS epochs on train_ds, validating on val_ds and checkpointing via cp_callback.
model.fit(train_ds, epochs=EPOCHS, callbacks=[cp_callback], validation_data=val_ds)

Epoch 1/5

Epoch 00001: saving model to /content/checkpoints/ckpt-0001
Epoch 2/5

Epoch 00002: saving model to /content/checkpoints/ckpt-0002
Epoch 3/5

Epoch 00003: saving model to /content/checkpoints/ckpt-0003
Epoch 4/5

Epoch 00004: saving model to /content/checkpoints/ckpt-0004
Epoch 5/5

Epoch 00005: saving model to /content/checkpoints/ckpt-0005


<tensorflow.python.keras.callbacks.History at 0x7f4f049322d0>

---

In [91]:
# Same training call as the earlier fit cell. NOTE(review): the stored output
# appears to come from a separate run that wrote to /content/checkpoints.
model.fit(train_ds, epochs=EPOCHS, callbacks=[cp_callback], validation_data=val_ds)

Epoch 1/5
Epoch 1: saving model to /content/checkpoints/ckpt-0001
Epoch 2/5
Epoch 2: saving model to /content/checkpoints/ckpt-0002
Epoch 3/5
Epoch 3: saving model to /content/checkpoints/ckpt-0003
Epoch 4/5
Epoch 4: saving model to /content/checkpoints/ckpt-0004
Epoch 5/5
Epoch 5: saving model to /content/checkpoints/ckpt-0005


<keras.callbacks.History at 0x7f6fc3c9bb50>

In [101]:
# Same training call as the earlier fit cells; this is the most recently executed one (In [101]).
model.fit(train_ds, epochs=EPOCHS, callbacks=[cp_callback], validation_data=val_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6fbec3bf10>

## Save Model

In [None]:
!ls /content/checkpoints/

checkpoint		       ckpt-0003.index
ckpt-0001.data-00000-of-00001  ckpt-0004.data-00000-of-00001
ckpt-0001.index		       ckpt-0004.index
ckpt-0002.data-00000-of-00001  ckpt-0005.data-00000-of-00001
ckpt-0002.index		       ckpt-0005.index
ckpt-0003.data-00000-of-00001


In [None]:
saved_model_path = "/content/lstm_512_epoch_05/"
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which reports an error
# when the folder already exists (e.g. when the cell is re-run).
os.makedirs(saved_model_path, exist_ok=True)

In [None]:
# Four-digit epoch suffix of the checkpoint to restore before saving the model.
checkpoint_number = '0005'

In [None]:
# Optionally restore the weights of the chosen epoch, then export the full model.
# The checkpoint is read from ckpt_folder, where the checkpoint callback writes
# its 'ckpt-{epoch:04d}' files, rather than from a hard-coded path.
if checkpoint_number is not None:
  model.load_weights(f"{ckpt_folder}/ckpt-{checkpoint_number}")

model.save(saved_model_path)



INFO:tensorflow:Assets written to: /content/lstm_512_epoch_05/assets


INFO:tensorflow:Assets written to: /content/lstm_512_epoch_05/assets


In [None]:
# Copy the checkpoint folder to the shared drive.
# NOTE(review): the source is the hard-coded /content/checkpoints/, while
# ckpt_folder is '/content/checkpoints_lstm_256' — confirm which folder is intended.
%cp -r /content/checkpoints/ /content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512

'/content/checkpoints/' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints'
'/content/checkpoints/ckpt-0001.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/ckpt-0001.index'
'/content/checkpoints/ckpt-0003.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/ckpt-0003.index'
'/content/checkpoints/ckpt-0002.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/ckpt-0002.index'
'/content/checkpoints/ckpt-0004.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/ckpt-0004.index'
'/content/checkpoints/ckpt-0005.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/ckpt-0005.index'
'/content/checkpoints/checkpoint' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/checkpoints/checkpoint'
'/content/checkpoints/ckpt-0001.data-00000-of-00001' -> '/content/drive/Shareddriv

In [None]:
# Copy the exported model folder (same path as saved_model_path) to the shared drive.
%cp -r /content/lstm_512_epoch_05/ /content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512

'/content/lstm_512_epoch_05/' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05'
'/content/lstm_512_epoch_05/variables' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05/variables'
'/content/lstm_512_epoch_05/variables/variables.index' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05/variables/variables.index'
'/content/lstm_512_epoch_05/variables/variables.data-00000-of-00001' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05/variables/variables.data-00000-of-00001'
'/content/lstm_512_epoch_05/saved_model.pb' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05/saved_model.pb'
'/content/lstm_512_epoch_05/keras_metadata.pb' -> '/content/drive/Shareddrives/SigmaLaw-WPP/win_pred_model/lstm_512/lstm_512_epoch_05/keras_metadata.pb'
'/content/lstm_512_epoch_05/assets' -> '/content/drive/Shareddrives/SigmaLaw-WP

## Evaluation

In [None]:
# Restore weights from ckpt_folder, where the checkpoint callback writes its
# 'ckpt-{epoch:04d}' files, rather than from a hard-coded path.
# NOTE(review): with EPOCHS = 5 only ckpt-0001..ckpt-0005 are produced, so
# '0020' looks like a leftover from a longer run — confirm which epoch to evaluate.
checkpoint_number = '0020'
model.load_weights(f"{ckpt_folder}/ckpt-{checkpoint_number}")

In [None]:
# Score the whole test set with one predict() call (Keras batches it
# internally) instead of invoking predict() once per document.
if test_input.shape[0] == 0:
  pred_results = np.array([])  # nothing to score; same empty result as the per-document loop
else:
  outputs = model.predict(test_input)
  if outputs.shape[1] == 1:
    # Single output unit: threshold the score at 0.5 to get class 0 / 1.
    pred_results = (outputs[:, 0] >= 0.5).astype(int)
  else:
    # One score per class: take the index of the highest score.
    pred_results = np.argmax(outputs, axis=1)

## Other (scratch checks: padding and natural sort)

In [None]:
# Path of one sample embedding file (case2.npy in the first sub-folder) used by the checks below.
fpath = os.path.join(embed_folder, sub_folders[0], 'case2.npy')

In [None]:
# Confirm that the sample file is present.
os.path.exists(fpath)

True

In [None]:
# Load the sample case's sentence vectors straight from its path.
vecs = np.load(fpath)

In [None]:
# Shape of the loaded sample array.
vecs.shape

(121, 768)

In [None]:
# Zero rows needed to bring the sample up to seq_len rows.
missing_rows = seq_len - len(vecs)
padding = np.zeros(shape=(missing_rows, vecs.shape[1]), dtype=np.float32)
padding.shape

(29, 768)

In [None]:
# Stack the zero rows underneath the real sentence vectors.
padded_v = np.concatenate((vecs, padding), axis=0)

In [None]:
# The row count should now equal seq_len.
padded_v.shape

(150, 768)

In [None]:
# Example: the name is split into text chunks and integer chunks.
natural_keys('case_56.txt')

['case_', 56, '.txt']

In [None]:
# Natural ordering puts case_9995 before case_10001, unlike a plain string sort.
l = sorted(
  ['case_9982.txt', 'case_9987.txt', 'case_10005.txt', 'case_10001.txt', 'case_9995.txt'],
  key=natural_keys,
)
l

['case_9982.txt',
 'case_9987.txt',
 'case_9995.txt',
 'case_10001.txt',
 'case_10005.txt']