# Data preparation for finetuning the classifiers

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import openai
import os


# Load the data (the path is relative to the notebook's working directory)
df = pd.read_json('../Data/cat_hearings_03_10_utterances_witnesses_MoCs_labels.json')
# Keep only the rows flagged as labelled and renumber them from 0
df = df[df.labelled == True].reset_index(drop=True)

# Add a new dummy combining policy claim 4.4 and 4.5: 1.0 when at least one of them is set, else 0.0.
# Computed column-wise rather than with a row-wise `apply`; a missing value in either
# input yields 0.0 because NaN > 0 is False.
df["L2dummy_4_45"] = ((df.L2dummy_4_4 + df.L2dummy_4_5) > 0).astype(float)

# Add a new dummy combining policy claim 4.3, 4.4, 4.5 and 4.6, built the same way
df["L2dummy_4_3456"] = (
    (df.L2dummy_4_3 + df.L2dummy_4_4 + df.L2dummy_4_5 + df.L2dummy_4_6) > 0
).astype(float)

# Preview the first row to check the new columns
df.head(1)

Unnamed: 0,congress,chamber,committee,committee_short,year,date,title,hearing_id,type,last_name,...,L4claims,L4claims_multi_hot,L1policyclaims,L2policyclaims,L3policyclaims,L4policyclaims,L2policyclaims_multi_hot,labelled,L2dummy_4_45,L2dummy_4_3456
0,108,SENATE,Committee on Environment and Public Works,Environment and Public Works,2003,2003-04-08,The Clear Skies Act of 2003,108shrg91748,witness,Rogers,...,[0],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.0,[0.0],[0.0],[0.0],"[1, 0, 0, 0, 0, 0, 0, 0]",True,0.0,0.0


In [30]:
# Show the current working directory; the relative '../Data' and '../Classifiers' paths
# used throughout this notebook are resolved against it
os.getcwd()

'/Users/mn/Library/CloudStorage/OneDrive-UniversityofExeter/Projects/GitHub/contrarian-discourses-against-cap-and-trade/Code'

In [4]:
# Number of labelled rows contributed by each sampling batch
df["batch"].value_counts()

Batch 1: Random sample                                        694
Batch 3: Active learning sample (claim 4)                     517
Batch 2: Active learning sample (oversampling rare claims)    476
Name: batch, dtype: int64

In [26]:
# Label columns used for stratification and as the multi-label target
CLAIMS = ['L2dummy_4_1', 'L2dummy_4_2', 'L2dummy_4_3456']

# Split configuration (70-15-15 split of the labelled data into training, validation and testing).
# The test set is drawn from the random batch only and the validation set from the
# remaining pool, so both fractions are relative to those subsets, not to the full data.
# In the recorded run they give 253 rows each, i.e. ~15% of the 1687 labelled rows.
RANDOM_BATCH = "Batch 1: Random sample"
SPLIT_SEED = 12
TEST_FRACTION = 0.364
VAL_FRACTION = 0.176

# Row-wise list of the claim dummies, passed to `stratify` below so that the
# splits keep similar claim combinations
df["CLAIMS"] = df.loc[:, CLAIMS].values.tolist()

# Reserve a testing data set of only the randomly sampled labelled data (first batch)
random_sample = df[df.batch == RANDOM_BATCH]
train, test = train_test_split(random_sample,
                               test_size=TEST_FRACTION,
                               random_state=SPLIT_SEED,
                               stratify=random_sample.CLAIMS,
                               shuffle=True)

# Merge the active learning batches with the remaining random sample training data
train = pd.concat([train, df[df.batch != RANDOM_BATCH]])

# Split the training data into training and validation sets
train, val = train_test_split(train,
                              test_size=VAL_FRACTION,
                              random_state=SPLIT_SEED,
                              stratify=train.CLAIMS,
                              shuffle=True)

# Work on explicit copies so that adding the "labels" column below cannot trigger
# pandas' SettingWithCopyWarning on the frames returned by train_test_split
train, val, test = train.copy(), val.copy(), test.copy()

# Print the number of paragraphs in each data set
print("Training data set size: ", len(train))
print("Validation data set size: ", len(val))
print("Testing data set size: ", len(test))

# Transform the labels into the correct form (drop 4.7 as no observations in data)
for split in (train, val, test):
    split["labels"] = split.loc[:, CLAIMS].values.astype(int).tolist()

# Print the number of level 2 policy claims in each data set and the percentage of the total.
# Headers and labels are kept verbatim (including the newline after the training header and
# the label spacing) so the printed report is identical to earlier runs.
claim_labels = [
    ("Claim 4_1 count: ", "L2dummy_4_1"),
    ("Claim 4_2 count: ", "L2dummy_4_2"),
    ("Claim 4_3456 count:", "L2dummy_4_3456"),
]
reports = [
    ("Training data level 2 policy claims:\n", train),
    ("Validation data level 2 policy claims:", val),
    ("Testing data level 2 policy claims:", test),
]
for position, (header, split) in enumerate(reports):
    if position > 0:
        print()  # blank line between the sections
    print(header)
    for label, column in claim_labels:
        # Boolean sum instead of value_counts()[1]: yields 0 rather than a KeyError
        # when a split contains no positive case for this claim
        positives = (split[column] == 1).sum()
        print(label, positives, "({}%)".format(round(positives / len(split) * 100, 2)))


# Note, there are no occurrences of the new claim 4_7: No need for more action

Training data set size:  1181
Validation data set size:  253
Testing data set size:  253
Training data level 2 policy claims:

Claim 4_1 count:  276 (23.37%)
Claim 4_2 count:  128 (10.84%)
Claim 4_3456 count: 169 (14.31%)

Validation data level 2 policy claims:
Claim 4_1 count:  60 (23.72%)
Claim 4_2 count:  27 (10.67%)
Claim 4_3456 count: 36 (14.23%)

Testing data level 2 policy claims:
Claim 4_1 count:  38 (15.02%)
Claim 4_2 count:  17 (6.72%)
Claim 4_3456 count: 35 (13.83%)


In [28]:
# Write each of the three splits (train / validation / test) to its own JSON file
split_targets = [
    (train, "../Classifiers/Data/cat_hearings_03_10_train.json"),
    (val, "../Classifiers/Data/cat_hearings_03_10_val.json"),
    (test, "../Classifiers/Data/cat_hearings_03_10_test.json"),
]
for split_frame, target_path in split_targets:
    split_frame.to_json(target_path)

## Level 1: All claims
We transform each split (train, validation, test) into a pandas dataframe with a column for prompt and a column for completion. The prompt contains the text of the labelled paragraph (the `text` column), and the completion is the binary dummy for one level 1 claim (`L1dummy_1` to `L1dummy_5`). One JSONL file is written per claim and split, using all labelled examples in that split.

In [4]:
# Save all level 1 claim dummies for the train, val and test data to jsonl files:
# one prompt/completion file per split and per claim (L1dummy_1 ... L1dummy_5)
for split_name, split in (("train", train), ("val", val), ("test", test)):
    for claim_number in range(1, 6):
        # Pair every text with its dummy for this claim
        pairs = pd.DataFrame(
            zip(split.text, split["L1dummy_{}".format(claim_number)]),
            columns=['prompt', 'completion'],
        )
        # JSON Lines output: one {"prompt": ..., "completion": ...} record per row
        pairs.to_json(
            "../Classifiers/Data/{}_{}.jsonl".format(split_name, claim_number),
            orient='records',
            lines=True,
        )


### Data Preparation tool
We can now use OpenAI's data preparation tool (`openai tools fine_tunes.prepare_data`), which suggests a few improvements to each dataset before fine-tuning. The cell below does not update the `openai` library itself, so make sure a version of the library that provides this command-line tool is installed before launching it.

In [5]:
# Use the openai tools to prepare the data for fine-tuning.
# Each command pipes four scripted answers (Y, Y, n, Y — one per line via `echo -e`)
# into the tool's interactive prompts so the cell runs unattended.
# NOTE(review): which suggestion each answer accepts or declines depends on the prompts
# asked by the installed tool version — confirm against the tool output.

# Training files (level 1 claims 1-5)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_3.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_4.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_5.jsonl

# Validation files (level 1 claims 1-5)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_3.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_4.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_5.jsonl

# Test files (level 1 claims 1-5)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_3.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_4.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_5.jsonl


Analyzing...

- Your file contains 1181 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for

## Level 2: Policy claims (IV)
We transform each split (train, validation, test) into a pandas dataframe with a column for prompt and a column for completion. The prompt contains the text of the labelled paragraph (the `text` column), and the completion is the binary dummy for one level 2 policy claim (`L2dummy_4_1`, `L2dummy_4_2`, or the combined `L2dummy_4_3456`). One JSONL file is written per claim and split, using all labelled examples in that split.

In [6]:
# Save all level 2 policy dummies for the train, val and test data to jsonl files:
# one prompt/completion file per split and per claim (4_1, 4_2 and the combined 4_3456)
for split_name, split in (("train", train), ("val", val), ("test", test)):
    for claim_suffix in ("4_1", "4_2", "4_3456"):
        # Pair every text with its dummy for this claim
        pairs = pd.DataFrame(
            zip(split.text, split["L2dummy_{}".format(claim_suffix)]),
            columns=['prompt', 'completion'],
        )
        # JSON Lines output: one {"prompt": ..., "completion": ...} record per row
        pairs.to_json(
            "../Classifiers/Data/{}_{}.jsonl".format(split_name, claim_suffix),
            orient='records',
            lines=True,
        )

### Data Preparation tool
We can now use OpenAI's data preparation tool (`openai tools fine_tunes.prepare_data`), which suggests a few improvements to each dataset before fine-tuning. The cell below does not update the `openai` library itself, so make sure a version of the library that provides this command-line tool is installed before launching it.

In [7]:
# Use the openai tools to prepare the data for fine-tuning.
# Each command pipes four scripted answers (Y, Y, n, Y — one per line via `echo -e`)
# into the tool's interactive prompts so the cell runs unattended.
# NOTE(review): which suggestion each answer accepts or declines depends on the prompts
# asked by the installed tool version — confirm against the tool output.

# Training files (level 2 policy claims)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_4_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_4_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/train_4_3456.jsonl

# Validation files (level 2 policy claims)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_4_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_4_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/val_4_3456.jsonl

# Test files (level 2 policy claims)
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_4_1.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_4_2.jsonl
!echo -e "Y\nY\nn\nY"|openai tools fine_tunes.prepare_data -f ../Classifiers/Data/test_4_3456.jsonl


Analyzing...

- Your file contains 1181 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for