In [None]:
# Mount Google Drive (Colab only) so the '/content/drive/...' paths used in the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from keras.models import Model, Sequential
from keras.layers import Input, Flatten, Dense, Concatenate, Dropout, LSTM, GRU, SimpleRNN, Embedding, Reshape
from keras.constraints import maxnorm

In [None]:
import pandas as pd
import sys
import os
from tqdm import trange
import numpy as np

In [None]:
# Folder on the mounted shared drive that holds the per-sentence case CSVs.
case_sentence_csv_folder = '/content/drive/Shareddrives/SigmaLaw-WPP/criminal_sentence_dataset/'
# Shards to read, in order. Only the `case_file` column of each shard is used when the
# cases are assembled; sentence features are matched to these rows purely by position,
# so this list, `critical_file_list` and the encoded-sentence shards must cover the same
# shards in the same order.
csv_file_list = [
  'sentence_dataset_1000_cases.csv',
  'sentence_dataset_2000_cases.csv', 
  # 'sentence_dataset_3000_cases.csv', 
  # 'sentence_dataset_4000_cases.csv', 
  # 'sentence_dataset_5000_cases.csv',
  # 'sentence_dataset_6000_cases.csv', 
  # 'sentence_dataset_7000_cases.csv', 'sentence_dataset_8000_cases.csv', 
  # 'sentence_dataset_9000_cases.csv',
  # 'sentence_dataset_10000_cases.csv'
]

In [None]:
# Folder with the critical-sentence prediction CSVs. Four columns are read from each file:
# lose_negative, lose_positive, win_negative, win_positive.
critical_sentence_csv_folder = '/content/drive/Shareddrives/SigmaLaw-WPP/criminal_sentence_dataset/critical_sentence_prediction/'
# Shards to read, in order. Rows are joined to the case sentences by position only, so
# this list has to stay in step with `csv_file_list`.
critical_file_list = [
    'critical_sentence_pred_results_1000.csv',
    'critical_sentence_pred_results_2000.csv',
    # 'critical_sentence_pred_results_3000.csv',
    # 'critical_sentence_pred_results_4000.csv',
    # 'critical_sentence_pred_results_5000.csv',
    # 'critical_sentence_pred_results_6000.csv'
                      ]

In [None]:
# CSV with the annotated outcome per case; loaded into `result_df` in the next cell.
result_csv_path= '/content/drive/Shareddrives/SigmaLaw-WPP/LRR/web_scrape/criminal/decision_annotated_criminal_v1.csv'

In [None]:
# Columns used later: 'Unnamed: 0' (compared against case file names) and 'Y' (outcome code).
result_df = pd.read_csv(result_csv_path)

# Data Preparation

# Dataset

In [None]:
# Load the encoded-sentence shards into one flat list holding one vector per sentence,
# in shard order. The shard suffixes (1000, 2000) have to match the shards enabled in
# `csv_file_list` / `critical_file_list`, because rows are joined by position later on.
np_encoded_sentences_list = []
for k in range(1000, 2001, 1000):
  path_2 = '/content/drive/Shareddrives/SigmaLaw-WPP/EAMS/datasets/encoded_sentences_768_{}.txt'.format(k)
  # ndmin=2 keeps a shard that contains a single sentence as one row instead of a flat
  # vector, which would otherwise be unpacked into individual scalars below.
  np_encoded_sentences_array = np.loadtxt(path_2, ndmin=2)
  # extend() adds this shard's rows without re-copying everything loaded so far.
  np_encoded_sentences_list.extend(np_encoded_sentences_array)
  print(k)

1000
2000


In [None]:
# The four prediction columns, in the order they are appended to each sentence vector.
critical_columns = ['lose_negative', 'lose_positive', 'win_negative', 'win_positive']

# One [lose_negative, lose_positive, win_negative, win_positive] row per sentence,
# concatenated across shards in `critical_file_list` order.
critical_sentence_result = []
for findex in trange(len(critical_file_list)):
  df = pd.read_csv(os.path.join(critical_sentence_csv_folder, critical_file_list[findex]))
  print(critical_file_list[findex])
  # Selecting the columns once and converting to nested lists produces the same rows as
  # a per-row iterrows() loop, without building a Series object for every sentence.
  critical_sentence_result.extend(df[critical_columns].values.tolist())

  0%|          | 0/2 [00:00<?, ?it/s]

critical_sentence_pred_results_1000.csv


 50%|█████     | 1/2 [00:08<00:08,  8.06s/it]

critical_sentence_pred_results_2000.csv


100%|██████████| 2/2 [00:15<00:00,  7.65s/it]


In [None]:
# Group the per-sentence feature vectors into cases and attach one label per case.
#
# Sentences arrive in file order; a new case starts whenever `case_file` changes. Each
# sentence vector is the encoded sentence followed by its four critical-sentence scores,
# both taken from position `m` of the lists built in the previous cells.

# Outcome code `Y` in the annotation sheet -> label stored in `decisions`.
# NOTE(review): Y == 0 is stored as -1, which the balancing cell further down never
# selects (it only keeps labels 0 and 1) - confirm this is intended.
LABEL_BY_OUTCOME = {-1: 0, 0: -1, 1: 1}

# One lookup table instead of scanning `result_df` several times per case. The first
# annotation wins when a case file is listed more than once.
outcome_by_case = (
    result_df.drop_duplicates(subset='Unnamed: 0')
             .set_index('Unnamed: 0')['Y']
             .to_dict()
)

cases = []
decisions = []


def _store_case(name, sentences):
  """Append one finished case and its label, keeping `cases` and `decisions` aligned.

  A case whose outcome is missing from the annotation sheet, or is not one of the
  known codes, is reported by printing its file name and is left out of BOTH lists.
  """
  outcome = outcome_by_case.get(name)
  if outcome not in LABEL_BY_OUTCOME:
    print(name)
    return
  cases.append(sentences)
  decisions.append(LABEL_BY_OUTCOME[outcome])


case_name = None
case_sentences = []
m = 0  # running sentence index across all shards
for findex in trange(len(csv_file_list)):
  df = pd.read_csv(os.path.join(case_sentence_csv_folder, csv_file_list[findex]))
  print(csv_file_list[findex])
  for current_file in df['case_file']:
    if current_file != case_name:
      # A different file name closes the previous case (there is none before the first row).
      if case_name is not None:
        _store_case(case_name, case_sentences)
      case_name = current_file
      case_sentences = []
    case_sentences.append([*np_encoded_sentences_list[m], *critical_sentence_result[m]])
    m += 1

# The loop only stores a case once the next one begins, so the final case is stored here.
if case_name is not None:
  _store_case(case_name, case_sentences)



  0%|          | 0/2 [00:00<?, ?it/s]

sentence_dataset_1000_cases.csv


 50%|█████     | 1/2 [00:19<00:19, 19.90s/it]

sentence_dataset_2000_cases.csv


100%|██████████| 2/2 [00:41<00:00, 20.55s/it]


In [None]:
import collections

# Class balance of the labels collected above.
collections.Counter(decisions)

Counter({0: 1459, 1: 539})

In [None]:
# Length of one sentence feature vector (last sentence of the first case).
len(cases[0][-1])

772

In [None]:
# Both lists have been folded into `cases`; drop the references so the memory can be reclaimed.
np_encoded_sentences_list = critical_sentence_result = None

In [None]:
# Down-sample to a balanced data set: keep the first N cases of each class in their
# original order, where N is the size of the smaller class. Deriving N from the data
# keeps the cell correct when more shards are enabled (it is 539 for the label counts
# recorded in this notebook: 1459 vs 539).
per_class_limit = min(decisions.count(0), decisions.count(1))

updated_cases = []
updated_decisions = []
kept_per_class = {0: 0, 1: 0}
for case, decision in zip(cases, decisions):
  # Labels other than 0 and 1 (e.g. -1) are never selected.
  if decision in kept_per_class and kept_per_class[decision] < per_class_limit:
    updated_cases.append(case)
    updated_decisions.append(decision)
    kept_per_class[decision] += 1
  # Stop scanning as soon as both classes are full.
  if all(kept == per_class_limit for kept in kept_per_class.values()):
    break

# Kept under their original names in case other cells read them.
zero_count = kept_per_class[0]
one_count = kept_per_class[1]

In [None]:
# Number of labels kept after balancing.
len(updated_decisions)

1078

In [None]:

# The balanced copies live in `updated_cases` / `updated_decisions`; release the full lists.
cases = decisions = None

In [None]:
# Class balance after down-sampling.
collections.Counter(updated_decisions)

Counter({0: 539, 1: 539})

In [None]:
# Should equal len(updated_decisions): one label is appended for every case kept.
# NOTE(review): the stored output of this cell (1000) disagrees with the 1078 labels
# reported above, so it looks stale from an earlier run - re-run the notebook top to bottom.
len(updated_cases)

1000

In [None]:
# Bring every case to exactly MAX_SENTENCES sentence vectors: longer cases are cut off
# after the first MAX_SENTENCES sentences, shorter ones are padded at the end with
# all-zero vectors.
MAX_SENTENCES = 150  # time steps per case; must match the model's input_shape
FEATURE_DIM = 772    # length of one sentence vector (see the len(cases[0][-1]) check above)

pad_cases = []
for case in updated_cases:
  # Slicing makes a new list, so the lists inside `updated_cases` are left untouched
  # and re-running this cell always starts from the same input.
  clipped = case[:MAX_SENTENCES]
  padding = [np.zeros(FEATURE_DIM) for _ in range(MAX_SENTENCES - len(clipped))]
  pad_cases.append(clipped + padding)
    

# Model Implementation

# Training Data Prep

In [None]:
# Convert the padded cases and their labels to arrays for scikit-learn / Keras.
# Expected shape of model_inputs: (number of cases, 150, 772), given the padding above.
model_inputs = np.asarray(pad_cases)
# Rebinds `updated_decisions` from a list to an array of the same labels.
updated_decisions = np.asarray(updated_decisions)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# 80/20 split. `stratify` keeps the 50/50 class balance created above in both parts
# (an unstratified draw gave a 96/120 test split), and the fixed `random_state` makes
# the split - and therefore the reported metrics - reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    updated_decisions,
    test_size=0.2,
    stratify=updated_decisions,
    random_state=42,
)

In [None]:
# Class balance of the held-out split.
collections.Counter(y_test)

Counter({0: 96, 1: 120})

# RNN Model

In [None]:
# Sequence classifier: a GRU reads the 150 sentence vectors (772 features each) of a case
# and passes only its final state on (return_sequences=False); a small dense layer and a
# single sigmoid unit then produce one score per case.
model = Sequential([
    GRU(512, input_shape=(150, 772), return_sequences=False),
    # Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is reported per epoch.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 20 epochs.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation metrics are not an independent estimate if they are used to choose epochs
# or hyper-parameters.
model.fit(x_train, y_train, validation_data=(x_test,y_test), epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f97d067add0>

In [None]:
# Second fit call on the same `model` object, for 10 epochs.
# NOTE(review): presumably this continues from the weights left by the previous cell
# rather than starting over - confirm, and consider a single fit call so the total
# number of epochs is explicit.
model.fit(x_train, y_train, validation_data=(x_test,y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4e5a8ae350>