In [1]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from transformers import XLNetTokenizer, TFXLNetModel



In [2]:
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

In [4]:
root_path = ''
list_of_files = []
for root, dir, files in os.walk('../data/Annotated - CSV'):
  root_path = root
  list_of_files = files
  break

#list_of_files = ['1314613428609.csv', '1522238936458.csv', '1564575450353.csv', '1409133457223.csv', '1372830044081.csv', '1291175003856_NEG.csv', '1290062946166.csv', '1315463402543.csv', '1324544561749.csv', '1358769139907.csv', '1404444629445.csv', '1427283185104.csv', '1380795608703.csv', '1404800940434.csv', '1382959468059.csv', '1289903641088.csv', '1407404311694.csv', '1494587603795.csv', '1289452697301.csv', '1509190608413.csv', '1372652426612.csv', '1288072330011.csv', '1407404413828.csv', '1404099510806.csv', '1290154724736_NEG.csv', '1407404374671.csv', '1312279794560.csv', '1312280268805.csv', '1288673034598.csv']

print(root_path)
print(list_of_files)

all_dataframes = []
for filename in list_of_files:
  file_path = root_path + '/' + filename
  file_temp = pd.read_csv(file_path)
  all_dataframes.append(file_temp)

result = pd.DataFrame()
result = result.append(all_dataframes,ignore_index=True)

train = pd.DataFrame(columns=result.columns)
test = pd.DataFrame(columns=result.columns)

print("lul", result.columns)

for label in result.Label.unique():
  temp_df = result[result['Label'] == label]
  train_index = int(temp_df.shape[0]*0.85)
  train = train.append(temp_df[:train_index])
  test = test.append(temp_df[train_index:])

../data/Annotated - CSV
['1288072330011.csv', '1288673034598.csv', '1289452697301.csv', '1289903641088.csv', '1290062946166.csv', '1290154724736_NEG.csv', '1291175003856_NEG.csv', '1312280268805.csv', '1314613428609.csv', '1315463402543.csv', '1324544561749.csv', '1358769139907.csv', '1372830044081.csv', '1380795608703.csv', '1382959468059.csv', '1404099510806.csv', '1404444629445.csv', '1404800940434.csv', '1407404311694.csv', '1407404374671.csv', '1407404413828.csv', '1409133457223.csv', '1427283185104.csv', '1494587603795.csv', '1509190608413.csv', '1522238936458.csv', '1564575450353.csv']
lul Index(['Sentence ID', 'Label', 'Sentence'], dtype='object')


In [5]:
train.shape

(1800, 3)

In [6]:
#preprocessing
class Classifier:
  DATA_COLUMN = "Sentence"
  LABEL_COLUMN = "Label"

  def __init__(self, train, test, tokenizer, classes, max_seq_len=192):
    self.tokenizer = tokenizer
    self.max_seq_len = 0
    self.classes = classes

    train, test = map(lambda df: df.reindex(df[Classifier.DATA_COLUMN].str.len().sort_values().index), [train, test])

    ((self.train_x_input_ids, self.train_x_type_ids, self.train_x_attention_mask, self.train_y), (self.test_x_input_ids, self.test_x_type_ids, self.test_x_attention_mask, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x_input_ids, self.train_x_type_ids, self.train_x_attention_mask, self.test_x_input_ids, self.test_x_type_ids, self.test_x_attention_mask = map(self._pad, [self.train_x_input_ids, self.train_x_type_ids, self.train_x_attention_mask, self.test_x_input_ids, self.test_x_type_ids, self.test_x_attention_mask])

  def _prepare(self, df):
    x_input_ids, x_type_ids, x_attention_mask, y = [], [], [], []

    for _, row in tqdm(df.iterrows()):
      text, label = row[Classifier.DATA_COLUMN], row[Classifier.LABEL_COLUMN]
      output = self.tokenizer(text, return_tensors="tf")
      x_input_ids.append(np.array(output['input_ids'][0]))
      x_type_ids.append(np.array(output['token_type_ids'][0]))
      x_attention_mask.append(np.array(output['attention_mask'][0]))
      self.max_seq_len = max(self.max_seq_len, len(output['input_ids'][0]))
      y.append(self.classes.index(label))

    return np.array(x_input_ids), np.array(x_type_ids), np.array(x_attention_mask), np.array(y)

  def _pad(self, ids):
    print(ids.shape)
    x = []
    for input_ids in ids:
      # print(input_ids)
      input_ids = list(input_ids[:min(len(input_ids), self.max_seq_len - 2)])
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)

In [7]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [9]:
def create_model(max_seq_len, classes):

  xlnet = TFXLNetModel.from_pretrained('xlnet-base-cased')
  
  input_word_ids = keras.layers.Input(shape=(max_seq_len, ), dtype="int32", name="input_word_ids")
  input_type_ids = keras.layers.Input(shape=(max_seq_len, ), dtype="int32", name="input_type_ids")
  input_mask = keras.layers.Input(shape=(max_seq_len, ), dtype="int32", name="input_mask")
  
  xlnet_output = xlnet(input_ids=input_word_ids, token_type_ids=input_type_ids, attention_mask=input_mask)[0]
  # bert_output = bert(input_ids)

  print("bert shape", xlnet_output.shape)

  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(xlnet_output)
  cls_out = keras.layers.Dropout(0.5)(cls_out)
  logits = keras.layers.Dense(units=len(classes), activation="softmax")(cls_out)

  model = keras.Model(inputs=[input_word_ids, input_type_ids, input_mask], outputs=logits)
  model.build(input_shape=(None, max_seq_len))

  # load_stock_weights(bert, bert_ckpt_file)
        
  return model

In [10]:
classes = train.Label.unique().tolist()
for num,classname in enumerate(classes):
  print(num,classname,end='\n')

0 material fact
1 procedural fact
2 allegation
3 defendant claim
4 issues framed
5 statutory fact
6 subjective observation
7 violation
8 penalty
9 related fact


In [11]:
data = Classifier(train, test, tokenizer, classes, max_seq_len=300)

1800it [00:02, 792.59it/s]
  return np.array(x_input_ids), np.array(x_type_ids), np.array(x_attention_mask), np.array(y)
324it [00:00, 1109.57it/s]


max seq_len 319
(1800,)
(1800,)
(1800,)
(324,)
(324,)
(324,)


In [12]:
model = create_model(data.max_seq_len, classes)

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
bert shape (None, 300, 768)


In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 300)]        0                                            
__________________________________________________________________________________________________
tfxl_net_model (TFXLNetModel)   TFXLNetModelOutput(l 116718336   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [14]:
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [15]:
# from keras.callbacks import EarlyStopping, ModelCheckpoint
# import datetime

# logdir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# my_callbacks = [EarlyStopping(patience=2, monitor="val_acc"),  ModelCheckpoint(filepath='model_4{epoch:02d}.hdf5', save_best_only=True, save_weights_only = False, monitor='val_loss', mode='auto',save_freq = 'epoch'), keras.callbacks.TensorBoard(log_dir=logdir)]

In [16]:
history = model.fit(
  x=[data.train_x_input_ids, data.train_x_type_ids, data.train_x_attention_mask],
  y=data.train_y,
  validation_split=0.1,
  batch_size=1,
  shuffle=True,
  epochs = 4
)

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4
  20/1620 [..............................] - ETA: 5:30 - loss: 0.6210 - acc: 0.8000

KeyboardInterrupt: 

In [17]:
_, train_acc = model.evaluate([data.train_x_input_ids, data.train_x_type_ids, data.train_x_attention_mask], data.train_y)
_, test_acc = model.evaluate([data.test_x_input_ids, data.test_x_type_ids, data.test_x_attention_mask], data.test_y)

print("train acc", train_acc)
print("test acc", test_acc)

train acc 0.8894444704055786
test acc 0.6604938507080078


In [18]:
y_pred = model.predict([data.test_x_input_ids, data.test_x_type_ids, data.test_x_attention_mask]).argmax(axis=-1)



In [19]:
print(classification_report(data.test_y, y_pred, target_names=classes))

                        precision    recall  f1-score   support

         material fact       0.73      0.58      0.65        77
       procedural fact       0.75      0.96      0.84        45
            allegation       0.45      0.83      0.59        12
       defendant claim       0.57      0.63      0.60        84
         issues framed       1.00      0.64      0.78        14
        statutory fact       0.88      0.78      0.82        27
subjective observation       0.65      0.68      0.67        41
             violation       0.00      0.00      0.00         5
               penalty       0.45      0.83      0.59         6
          related fact       0.00      0.00      0.00        13

              accuracy                           0.66       324
             macro avg       0.55      0.59      0.55       324
          weighted avg       0.65      0.66      0.65       324



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
