In [None]:
!pip install tensorflow-gpu 


In [None]:
!pip install tqdm 
!pip install bert-for-tf2 
!pip install sentencepiece 


In [2]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer


In [3]:
from sklearn.metrics import confusion_matrix, classification_report

In [4]:

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc


In [None]:
# Read the three pre-split CSV files (train / validation / test) from Drive
# in one pass; the generator yields the frames in the same order as the names
# on the left-hand side.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/SEBI /Adjudication Orders Annotations JSON/Model Data CSV/train_5.csv",
        "/content/drive/MyDrive/SEBI /Adjudication Orders Annotations JSON/Model Data CSV/valid_5.csv",
        "/content/drive/MyDrive/SEBI /Adjudication Orders Annotations JSON/Model Data CSV/test_5.csv",
    )
)

In [None]:
# Fold the validation split into the training frame and renumber the rows.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat(..., ignore_index=True)` produces the same frame.
# NOTE: this cell is not idempotent - re-running it appends `valid` again.
train = pd.concat([train, valid], ignore_index=True)


In [None]:
# Show the frame shape before and after removing the CSV index column and the
# sentence identifier, then preview the first rows.
print(train.shape)
train = train.drop(columns=['Unnamed: 0', 'Sentence ID'])
print(train.shape)
train.head()

(1834, 4)
(1834, 2)


Unnamed: 0,Label,Sentence
0,material fact,Securities and Exchange Board of India (herein...
1,material fact,The Investigating Authority observed that M s ...
2,material fact,"Out of the said shares, the Noticee on Decembe..."
3,material fact,"Further, the Noticee sold the remaining 1,60,5..."
4,material fact,"The Noticee on December 11, 2009 transferred s..."


In [None]:
train = train[['Sentence','Label']]

In [None]:
train.head()

Unnamed: 0,Sentence,Label
0,Securities and Exchange Board of India (herein...,material fact
1,The Investigating Authority observed that M s ...,material fact
2,"Out of the said shares, the Noticee on Decembe...",material fact
3,"Further, the Noticee sold the remaining 1,60,5...",material fact
4,"The Noticee on December 11, 2009 transferred s...",material fact


In [None]:
# Remove the CSV index / sentence-id columns from the test frame and put the
# remaining columns in (Sentence, Label) order.
test = test.drop(columns=['Unnamed: 0', 'Sentence ID'])[['Sentence', 'Label']]

In [None]:
test.head()
len(test)

471

In [None]:
train.head()


Unnamed: 0,Sentence,Label
0,Securities and Exchange Board of India (herein...,material fact
1,The Investigating Authority observed that M s ...,material fact
2,"Out of the said shares, the Noticee on Decembe...",material fact
3,"Further, the Noticee sold the remaining 1,60,5...",material fact
4,"The Noticee on December 11, 2009 transferred s...",material fact


In [None]:
# Optional step: merge the separately validated annotations into the
# train/test frames before training.
#
# This logic used to live inside a string literal (opened with four quotes), so
# it never ran on a clean "Run All". It is now guarded by an explicit flag that
# is off by default, which keeps the default behaviour of the cell a no-op.
MERGE_VALIDATED_DATA = False
VALIDATED_DATA_DIR = '/content/drive/MyDrive/SEBI /Adjudication Orders Annotations JSON/Model Data Validation Context'
# 79.3% train ; rest is test ; train is split into validation further
VALIDATED_TRAIN_FRACTION = 0.793

if MERGE_VALIDATED_DATA:
  # Collect every file under the directory. The previous loop overwrote its
  # root/files variables on each os.walk step and therefore only kept the last
  # directory visited. Sorting makes the row order (and so the split below)
  # reproducible.
  validated_paths = sorted(
      os.path.join(root, filename)
      for root, _dirs, files in os.walk(VALIDATED_DATA_DIR)
      for filename in files
  )
  if not validated_paths:
    raise FileNotFoundError("no validated data files found under " + VALIDATED_DATA_DIR)

  vresult = pd.concat([pd.read_csv(path) for path in validated_paths], ignore_index=True)

  # Per-label split: the first VALIDATED_TRAIN_FRACTION of each label's rows go
  # to train, the remainder to test. Slices are gathered in lists and
  # concatenated once instead of growing a frame inside the loop.
  vtrain_parts, vtest_parts = [], []
  for label in vresult.Label.unique():
    vtemp_df = vresult[vresult['Label'] == label]
    vtrain_index = int(vtemp_df.shape[0] * VALIDATED_TRAIN_FRACTION)
    vtrain_parts.append(vtemp_df[:vtrain_index])
    vtest_parts.append(vtemp_df[vtrain_index:])

  helper_columns = ['Sentence ID', 'Unnamed: 0']
  vtrain_data = pd.concat(vtrain_parts).drop(columns=helper_columns)
  vtest_data = pd.concat(vtest_parts).drop(columns=helper_columns)

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  train = pd.concat([train, vtrain_data], ignore_index=True)
  test = pd.concat([test, vtest_data], ignore_index=True)

In [None]:
train.shape

(2335, 2)

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip


--2021-04-05 06:22:55--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.1.128, 142.250.103.128, 142.250.128.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.1.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip.1’


2021-04-05 06:22:58 (157 MB/s) - ‘uncased_L-12_H-768_A-12.zip.1’ saved [407727028/407727028]



In [None]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
replace uncased_L-12_H-768_A-12/bert_model.ckpt.meta? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
os.makedirs("model", exist_ok=True)


In [None]:
!ls

drive  sample_data		uncased_L-12_H-768_A-12.zip
model  uncased_L-12_H-768_A-12	uncased_L-12_H-768_A-12.zip.1


In [None]:
!mv uncased_L-12_H-768_A-12/ model


In [None]:
!ls

drive  sample_data		    uncased_L-12_H-768_A-12.zip.1
model  uncased_L-12_H-768_A-12.zip


In [None]:

# Locations of the unpacked stock BERT checkpoint and its config, both inside
# model/<bert_model_name>.
bert_model_name = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = os.path.join("model/", bert_model_name)

bert_ckpt_file, bert_config_file = (
    os.path.join(bert_ckpt_dir, file_name)
    for file_name in ("bert_model.ckpt", "bert_config.json")
)

In [None]:
#preprocessing 
class Classifier:
  """Turns sentence/label data frames into fixed-length token-id arrays.

  After construction the instance exposes:
    train_x, test_x : 2-D integer arrays, one row of ``max_seq_len`` token ids
                      per sentence (zero-padded on the right).
    train_y, test_y : 1-D arrays holding each label's position in ``classes``.
    max_seq_len     : the row length actually used, i.e. the smaller of the
                      longest tokenized sentence and the ``max_seq_len`` argument.

  Args:
    train, test: data frames with a "Sentence" and a "Label" column.
    tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
    classes: list of label values; a label's index in this list is its id.
    max_seq_len: upper bound for the padded sequence length.

  Raises:
    ValueError: if a row's label is not present in ``classes``.
  """
  DATA_COLUMN = "Sentence"
  LABEL_COLUMN = "Label"

  def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
    self.tokenizer = tokenizer
    # Starts at 0 and is raised by _prepare to the longest tokenized sentence.
    self.max_seq_len = 0
    self.classes = classes

    # Reorder both frames by sentence character length, shortest first.
    train, test = map(lambda df: df.reindex(df[Classifier.DATA_COLUMN].str.len().sort_values().index), [train, test])

    ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    """Tokenize every row of ``df``.

    Returns a pair ``(x, y)``: ``x`` is a plain list of variable-length
    token-id lists (each wrapped in [CLS] ... [SEP]) and ``y`` is an array of
    label ids. ``x`` is deliberately not converted to an array here: the rows
    have different lengths, and building an array from ragged lists raises a
    ValueError on NumPy >= 1.24.
    """
    x, y = [], []

    for _, row in tqdm(df.iterrows(), total=len(df)):
      text, label = row[Classifier.DATA_COLUMN], row[Classifier.LABEL_COLUMN]
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return x, np.array(y)

  def _pad(self, ids):
    """Truncate/zero-pad every id sequence to exactly ``self.max_seq_len``.

    The sequences already contain their [CLS]/[SEP] ids, so nothing is reserved
    for them here. A sequence that is too long is cut so that its final id
    (the [SEP] added by _prepare) stays in the last position. The previous
    implementation always cut to ``max_seq_len - 2``, which dropped [SEP] and
    discarded two tokens even from sequences that fit exactly.
    """
    max_len = self.max_seq_len
    x = []
    for input_ids in ids:
      # Accept list rows as well as ndarray rows (list concatenation below).
      input_ids = list(input_ids)
      if len(input_ids) > max_len:
        input_ids = input_ids[:max_len - 1] + input_ids[-1:]
      input_ids = input_ids + [0] * (max_len - len(input_ids))
      x.append(input_ids)
    return np.array(x)

In [None]:
os.makedirs("/content/drive/MyDrive/SEBI /best_models/", exist_ok=True)


In [None]:
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))


In [None]:
#tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
#tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def create_model(max_seq_len, bert_ckpt_file):
  """Build the BERT sentence classifier and load the stock BERT weights.

  Relies on two notebook-level globals: `bert_config_file` (path of the BERT
  config JSON) and `classes` (its length sets the number of output units).

  Args:
    max_seq_len: length of the token-id sequences fed to the model.
    bert_ckpt_file: checkpoint passed to `load_stock_weights`.

  Returns:
    A built `keras.Model` mapping int32 ids of shape (batch, max_seq_len) to a
    softmax distribution over `len(classes)` classes.
  """
  # Read the config JSON first, then build the BERT layer from it.
  with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
      config_json = config_reader.read()

  stock_config = StockBertConfig.from_json_string(config_json)
  layer_params = map_stock_config_to_params(stock_config)
  layer_params.adapter_size = None
  bert_layer = BertModelLayer.from_params(layer_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  sequence_output = bert_layer(input_ids)

  print("bert shape", sequence_output.shape)

  # Keep only the vector at sequence position 0, apply dropout, then classify.
  first_token = keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
  first_token = keras.layers.Dropout(0.5)(first_token)
  # The Dense layer applies softmax, so the model outputs probabilities.
  class_probs = keras.layers.Dense(units=len(classes), activation="softmax")(first_token)

  model = keras.Model(inputs=input_ids, outputs=class_probs)
  model.build(input_shape=(None, max_seq_len))

  # Weights are loaded only after the layer has been built as part of the model.
  load_stock_weights(bert_layer, bert_ckpt_file)

  return model

In [None]:

# Label vocabulary in order of first appearance in the training frame; a
# label's position in this list is the integer id used as the training target.
classes = train.Label.unique().tolist()
for class_id, class_name in enumerate(classes):
  print(class_id, class_name)

0 material fact
1 violation
2 procedural fact
3 allegation
4 issues framed
5 statutory fact
6 defendant claim
7 subjective observation
8 related fact
9 penalty
10 others


In [None]:
data = Classifier(train, test, tokenizer, classes, max_seq_len=128)


2335it [00:02, 995.36it/s]
609it [00:00, 1033.25it/s]


max seq_len 308


In [None]:
data.test_x.shape


(609, 128)

In [None]:
data.train_x[0]


array([ 101, 1049, 1055, 1012,  102,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [None]:
data.train_y[2]


10

In [None]:
model = create_model(data.max_seq_len, bert_ckpt_file)


bert shape (None, 128, 768)
Done loading 196 BERT weights from: model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f64e780dcd0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [None]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 128, 768)          108890112 
_________________________________________________________________
lambda (Lambda)              (None, 768)               0         
_________________________________________________________________
dropout (Dropout)            (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 11)                8459      
Total params: 108,898,571
Trainable params: 108,898,571
Non-trainable params: 0
_________________________________________________________________


In [None]:

model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  # The final Dense layer built in create_model uses activation="softmax", so
  # the loss receives probabilities. With from_logits=True the loss would apply
  # softmax a second time and distort the loss and its gradients.
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [None]:
# Use the callbacks that ship with tensorflow.keras: the model is built from
# `tensorflow.keras`, and mixing in classes from the standalone `keras`
# package can fail when the two installations differ.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

my_callbacks = [
  # Stop when validation accuracy has not improved for 2 consecutive epochs.
  EarlyStopping(patience=2, monitor="val_acc"),
  # Keep only the best full model (by validation loss), checked once per epoch.
  # NOTE(review): early stopping watches val_acc while the checkpoint watches
  # val_loss - confirm that the two different criteria are intended.
  ModelCheckpoint(
    filepath='/content/drive/MyDrive/SEBI /best_models/model_no_context_all.hdf5',
    save_best_only=True,
    save_weights_only=False,
    monitor='val_loss',
    mode='auto',
    save_freq='epoch',
  ),
]



In [None]:


# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. Classifier orders the rows by sentence length, so
# without this permutation the validation set would consist solely of the
# longest sentences. A fixed seed keeps the split reproducible.
shuffle_idx = np.random.RandomState(42).permutation(len(data.train_x))

history = model.fit(
  x=data.train_x[shuffle_idx],
  y=data.train_y[shuffle_idx],
  validation_split=0.1,
  batch_size=32,
  shuffle=True,
  epochs = 10,
  callbacks = my_callbacks
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [None]:
# model.evaluate returns (loss, acc) for the single metric configured in
# compile; only the accuracy is reported.
# Note: data.train_x also contains the rows Keras held out for validation.
train_loss, train_acc = model.evaluate(data.train_x, data.train_y)
test_loss, test_acc = model.evaluate(data.test_x, data.test_y)

for split_name, split_acc in (("train acc", train_acc), ("test acc", test_acc)):
  print(split_name, split_acc)

In [None]:
# The original statement here was incomplete (`model = `) and raised a
# SyntaxError. Presumably the intent was to continue with the best checkpoint
# written by ModelCheckpoint - TODO confirm. Loading the weights into the
# already-built model avoids having to re-create the custom BERT layer and the
# Lambda layer from the saved file.
model.load_weights('/content/drive/MyDrive/SEBI /best_models/model_no_context_all.hdf5')

In [None]:
y_pred = model.predict(data.test_x).argmax(axis=-1)


In [None]:
# Pass the full list of label ids explicitly: without `labels`, scikit-learn
# derives the label set from the values present in y_true/y_pred and raises
# when that count differs from len(target_names) (e.g. a class absent from the
# test split and never predicted).
print(classification_report(
  data.test_y,
  y_pred,
  labels=list(range(len(classes))),
  target_names=classes,
))
