# Data preparation for finetuning the classifiers

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import openai
import os
import tiktoken


def _combined_dummy(frame, columns):
    """Return a float 0.0/1.0 Series flagging rows whose `columns` sum is positive.

    `skipna=False` lets a missing value in any column propagate to the row
    sum, so such rows compare as not-positive and get 0.0 -- the same result
    as adding the columns with `+` and testing `> 0`.
    """
    return (frame[columns].sum(axis=1, skipna=False) > 0).astype(float)


# Load the data
df_all = pd.read_json('../Data/cat_hearings_03_10_utterances_witnesses_MoCs_labels.json')
# Save the unlabelled data
df_unlabelled = df_all[df_all.labelled == False].reset_index(drop=True).copy()
# Subset labelled data
df = df_all[df_all.labelled == True].reset_index(drop=True)
# Add a new dummy combining policy claim 4.4 and 4.5
df["L2dummy_4_45"] = _combined_dummy(df, ["L2dummy_4_4", "L2dummy_4_5"])
# Add a new dummy combining policy claim 4.3, 4.4, 4.5 and 4.6
df["L2dummy_4_3456"] = _combined_dummy(
    df, ["L2dummy_4_3", "L2dummy_4_4", "L2dummy_4_5", "L2dummy_4_6"]
)
df.head(1)

Unnamed: 0,congress,chamber,committee,committee_short,year,date,title,hearing_id,type,last_name,...,L4claims,L4claims_multi_hot,L1policyclaims,L2policyclaims,L3policyclaims,L4policyclaims,L2policyclaims_multi_hot,labelled,L2dummy_4_45,L2dummy_4_3456
0,108,SENATE,Committee on Environment and Public Works,Environment and Public Works,2003,2003-04-08,The Clear Skies Act of 2003,108shrg91748,witness,Rogers,...,[0],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,[0.0],[0.0],[0.0],"[1, 0, 0, 0, 0, 0, 0, 0]",True,0.0,0.0


In [3]:
# Split the data into training, validation and testing data (70-15-15 split)

CLAIMS = ['L2dummy_4_1', 'L2dummy_4_2', 'L2dummy_4_3456']

# One list of claim dummies per row; used as the stratification key below
df["CLAIMS"] = df[CLAIMS].to_numpy().tolist()

# Reserve a testing data set drawn only from the randomly sampled labelled data (first batch)
random_batch = df[df.batch == "Batch 1: Random sample"]
train, test = train_test_split(
    random_batch,
    test_size=0.364,
    random_state=12,
    shuffle=True,
    stratify=random_batch.CLAIMS,
)

# Merge the active learning batches with the remaining random sample training data
other_batches = df[df.batch != "Batch 1: Random sample"]
train = pd.concat([train, other_batches])

# Split the training data into training and validation sets
train, val = train_test_split(
    train,
    test_size=0.176,
    random_state=12,
    shuffle=True,
    stratify=train.CLAIMS,
)

# Print the number of paragraphs in each data set
for split_name, split_df in (("Training", train), ("Validation", val), ("Testing", test)):
    print("{} data set size: ".format(split_name), len(split_df))

def _positive_share(split_df, column):
    """Return (count, percentage) of rows in `split_df` where `column` equals 1.

    Counting with a boolean mask yields 0 when a split contains no positive
    rows, where `value_counts()[1]` would raise a KeyError. The percentage is
    rounded to two decimals and is 0.0 for an empty frame.
    """
    n_positive = int(split_df[column].eq(1).sum())
    if len(split_df) == 0:
        return n_positive, 0.0
    return n_positive, round(n_positive / len(split_df) * 100, 2)


def _print_claim_counts(header, split_df):
    """Print `header`, then count and share of each level 2 policy claim dummy."""
    print(header)
    for i in range(1, 3):
        count, pct = _positive_share(split_df, "L2dummy_4_{}".format(i))
        print("Claim 4_{} count: ".format(i), count, "({}%)".format(pct))
    count, pct = _positive_share(split_df, "L2dummy_4_3456")
    print("Claim 4_3456 count:", count, "({}%)".format(pct))


# Transform the labels into the correct form (drop 4.7 as no observations in data).
# `assign` builds new frames instead of adding a column in place to the objects
# returned by train_test_split (in-place assignment there can trigger pandas'
# SettingWithCopyWarning).
train = train.assign(labels=train[CLAIMS].to_numpy().astype(int).tolist())
val = val.assign(labels=val[CLAIMS].to_numpy().astype(int).tolist())
test = test.assign(labels=test[CLAIMS].to_numpy().astype(int).tolist())


# Print the number of level 2 policy claims in each data set and the percentage of the total
_print_claim_counts("Training data level 2 policy claims:\n", train)
print()
_print_claim_counts("Validation data level 2 policy claims:", val)
print()
_print_claim_counts("Testing data level 2 policy claims:", test)


# Note, there are no occurrences of the new claim 4_7: No need for more action

Training data set size:  1181
Validation data set size:  253
Testing data set size:  253
Training data level 2 policy claims:

Claim 4_1 count:  276 (23.37%)
Claim 4_2 count:  127 (10.75%)
Claim 4_3456 count: 169 (14.31%)

Validation data level 2 policy claims:
Claim 4_1 count:  60 (23.72%)
Claim 4_2 count:  27 (10.67%)
Claim 4_3456 count: 36 (14.23%)

Testing data level 2 policy claims:
Claim 4_1 count:  38 (15.02%)
Claim 4_2 count:  17 (6.72%)
Claim 4_3456 count: 35 (13.83%)


In [35]:
# Word counts: total for the training split, then per witness category
# in the unlabelled data.
print("TRAINING:")
print("Training word count: ", train.word_count.sum())

print("")
print("INFERENCE:")
# (printed label, witness_category value) pairs, in print order
for label, category in [
    ("FFI", "Fossil Fuel Industry"),
    ("Carbon-intensive Industry", "Carbon-intensive Industry"),
    ("Business & Services", "Business & Services"),
]:
    in_category = df_unlabelled.witness_category == category
    print("{} word count: ".format(label), df_unlabelled[in_category].word_count.sum())

TRAINING:
Training word count:  83293

INFERENCE:
FFI word count:  154668
Carbon-intensive Industry word count:  163005
Business & Services word count:  130533


In [54]:
# Token counts
# NOTE(review): `enc` is not defined in the cells visible here; it is presumably a
# tiktoken encoding created in another cell -- TODO confirm it is defined before
# this cell when the notebook is run on a fresh kernel.

def _token_count(frame):
    """Return the total number of tokens in `frame.text` plus one per row.

    Every text is encoded with `enc.encode` and the lengths are summed; the
    row count (`frame.shape[0]`) is then added. The extra unit per row
    presumably accounts for one additional token per example -- TODO confirm.
    """
    return frame.text.apply(lambda x: len(enc.encode(x))).sum() + frame.shape[0]


print("TRAINING:")
print("Training token count: ", _token_count(train))

print("")
print("INFERENCE:")
# Build each mask once instead of re-evaluating the same filter twice per print
is_contrarian = df_unlabelled.witness_contrarian == "Contrarian"
not_contrarian = df_unlabelled.witness_contrarian != "Contrarian"

print("Unlabelled contrarian token count:", _token_count(df_unlabelled[is_contrarian]))

# Non-contrarian witnesses per category: (printed label, witness_category value)
for label, category in [
    ("FFI", "Fossil Fuel Industry"),
    ("Carbon-intensive Industry", "Carbon-intensive Industry"),
    ("Business & Services", "Business & Services"),
]:
    in_category = df_unlabelled.witness_category == category
    print("{} token count:".format(label),
          _token_count(df_unlabelled[in_category & not_contrarian]))

print("MoC token count:", _token_count(df_unlabelled[df_unlabelled.type == "MoC"]))


TRAINING:
Training token count:  99286

INFERENCE:
Unlabelled contrarian token count: 94535
FFI token count: 160353
Carbon-intensive Industry token count: 182896
Business & Services token count: 141100
MoC token count: 1902562
