# Active Learning

In [1]:
#%load_ext autoreload
#%autoreload 2

GLB_USE_DRIVE_ACCOUNT = True
GLB_INSTALL_DEPENDENCIES = True

if GLB_USE_DRIVE_ACCOUNT:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/"Colab Notebooks"
  username = "IsaacOlguin"
  repository =  "AutomatedTraumaDetectionInGCT"
  %cd {repository}
  %pwd

if GLB_INSTALL_DEPENDENCIES:
    !pip install transformers
    #!pip install torch
    #!pip install openpyxl

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/Colab Notebooks/AutomatedTraumaDetectionInGCT
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers

## Imports

In [2]:
import yaml
import pandas as pd
#import torch
import numpy as np
import matplotlib.pyplot as plt
#import json
import os
from os.path import join

## Globals and set-up

In [3]:
####### Globals
GLB_DEFINE_PATH_PROJECT = False
PATH_PROJECT = ""

if GLB_DEFINE_PATH_PROJECT:
    PATH_PROJECT = "/content/drive/MyDrive/Colab Notebooks/AutomatedTraumaDetectionInGCT"
else:
    PATH_PROJECT = os.getcwd()

with open(join(PATH_PROJECT, "config.yml"), "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

PATH_DATASET = join( PATH_PROJECT, cfg["general_set_up"]["input_dir_name"], cfg["general_set_up"]["dataset_dir_name"], cfg["general_set_up"]["dataset_filename"] )
INDEX_COLUMNS_DATASET = cfg["dataset"]["index_columns_dataset"]
LIST_NAME_COLUMNS_DATASET = cfg["dataset"]["list_columns_names"]
COL_OF_INTEREST = cfg["dataset"]["col_of_interest_multi_label_classif"]

## Dataset

In [4]:
%reload_ext autoreload

import src.multi_label_classif_model_utilities as mlclassif_utilities

In [5]:
df_dataset = mlclassif_utilities.import_dataset_from_excel(PATH_DATASET, INDEX_COLUMNS_DATASET, LIST_NAME_COLUMNS_DATASET)
df_dataset.head()

Unnamed: 0,id_document,id_annotation,span,role,trauma,court
0,63a16b2597ad59b4cfc56c2d,63a16c4997ad59b4cfc56c2f,THE INTERNATIONAL CRIMINAL TRIBUNAL \t\tCASE N...,Court Proceedings,0,ICTY
1,63a16b2597ad59b4cfc56c2d,63a16c4e97ad59b4cfc56c30,"THE PRESIDING JUDGE: Miss Hollis, could you c...",JudgeProc,0,ICTY
2,63a16b2597ad59b4cfc56c2d,63a16c5297ad59b4cfc56c31,"MISS HOLLIS: Yes, your Honour.",LawyerProc,0,ICTY
3,63a16b2597ad59b4cfc56c2d,63a16c5697ad59b4cfc56c32,"Examined by MISS HOLLIS, continued.",Court Proceedings,0,ICTY
4,63a16b2597ad59b4cfc56c2d,63a16c5c97ad59b4cfc56c33,"THE PRESIDING JUDGE: You may be seated, Mrs. ...",JudgeQA,0,ICTY


In [6]:
NUM_CLASSES = df_dataset[COL_OF_INTEREST].nunique()
LIST_OF_CLASSES = df_dataset[COL_OF_INTEREST].unique()

print(f'Number of classes {NUM_CLASSES} and list of classes [{list(LIST_OF_CLASSES)}]')

Number of classes 7 and list of classes [['Court Proceedings', 'JudgeProc', 'LawyerProc', 'JudgeQA', 'LawyerQA', 'Witness', 'Accused']]


In [7]:
counts = df_dataset.groupby(COL_OF_INTEREST).count()["span"]
normalized = round(df_dataset[COL_OF_INTEREST].value_counts(normalize=True), 4)
percentages = normalized*100
df_stats_dataset = pd.DataFrame({'counts': counts, 'normalized': normalized, 'percentages': percentages})
df_stats_dataset

Unnamed: 0,counts,normalized,percentages
Accused,316,0.0247,2.47
Court Proceedings,461,0.0361,3.61
JudgeProc,1437,0.1125,11.25
JudgeQA,630,0.0493,4.93
LawyerProc,1220,0.0955,9.55
LawyerQA,3942,0.3085,30.85
Witness,4773,0.3735,37.35


In [68]:
def give_me_segments_of_df_per_class(df, number_of_splits, column_of_interest, column_of_reference):
  dict_of_segments = {}
  invalidSplit = False
  number_of_classes = df[column_of_interest].nunique()
  list_of_classes = df[column_of_interest].unique()

  counts = df[column_of_interest].value_counts()
  normalized = round(df[column_of_interest].value_counts(normalize=True), 4)
  percentages = normalized*100

  df_stats_dataset = pd.DataFrame({'counts': counts, 'normalized': normalized, 'percentages': percentages}).reset_index()

  # Validation
  for i, row in df_stats_dataset.iterrows():
    if row["counts"] < number_of_splits:
      print(f"ERROR - Dataset[{row['index']}] cannot be split into the given number of splits")
      invalidSplit = True
    #print(row["index"], row["counts"])
  
  if invalidSplit:
    return None
  else:
    # Get sizes of segments and put them into a list
    list_of_size_segments = (df_stats_dataset["counts"]-(df_stats_dataset["counts"]%number_of_splits)) / number_of_splits
    print(list_of_size_segments)

    print("*"*100)
    print(df_stats_dataset)
    print("*"*100)

    # Initialize dict_of_segments
    for i_range in range(0, number_of_splits):
      dict_of_segments[i_range] = pd.DataFrame()

    # Add segments to a list of segments
    for index_class, (size, type_id) in enumerate(zip(list_of_size_segments, df_stats_dataset["index"])):
      size = int(size)
      print(index_class, "#"*100, size)
      for i_range in range(0, number_of_splits):
        print(i_range, "*"*50, index_class, type_id, "Segment", i_range, "[", i_range*size, ":", i_range*size+size, "]")
        if index_class == 0:
          dict_of_segments[i_range] = df[df[COL_OF_INTEREST] == type_id][i_range*size:i_range*size+size]
        else:
          if (i_range+1) == number_of_splits:
            dict_of_segments[i_range] = pd.concat([dict_of_segments[i_range], df[df[COL_OF_INTEREST] == type_id][i_range*size:]])
          else:
            dict_of_segments[i_range] = pd.concat([dict_of_segments[i_range], df[df[COL_OF_INTEREST] == type_id][i_range*size:i_range*size+size]])

      #print("*"*50)

    return dict_of_segments

dict_of_segments = give_me_segments_of_df_per_class(df_dataset, 5, COL_OF_INTEREST, "span")

0    954.0
1    788.0
2    287.0
3    244.0
4    126.0
5     92.0
6     63.0
Name: counts, dtype: float64
****************************************************************************************************
               index  counts  normalized  percentages
0            Witness    4773      0.3735        37.35
1           LawyerQA    3942      0.3085        30.85
2          JudgeProc    1437      0.1125        11.25
3         LawyerProc    1220      0.0955         9.55
4            JudgeQA     630      0.0493         4.93
5  Court Proceedings     461      0.0361         3.61
6            Accused     316      0.0247         2.47
****************************************************************************************************
0 #################################################################################################### 954
0 ************************************************** 0 Witness Segment 0 [ 0 : 954 ]
1 ************************************************** 0 Witness Segme

In [69]:
dict_of_segments[4]["role"].value_counts()

Witness              954
LawyerQA             790
JudgeProc            289
LawyerProc           244
JudgeQA              126
Court Proceedings     93
Accused               64
Name: role, dtype: int64

In [70]:
dict_of_segments[3]["role"].value_counts()

Witness              954
LawyerQA             788
JudgeProc            287
LawyerProc           244
JudgeQA              126
Court Proceedings     92
Accused               63
Name: role, dtype: int64