### Create datasets

In [33]:
# load data from local csv file
import pandas as pd

# Base name (without extension) of the chunk file.
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral"
path = "../data/transcript_chunks/" + filename + ".csv"

# The chunk file is semicolon-separated; each row is one transcript chunk.
data = pd.read_csv(path, sep=";")
print("Loaded df with {} rows (=chunks) and {} unique video_ids.".format(len(data), data["video_id"].nunique()))
#print(data.head(5))

# add metadata info which we might want to include in the prompt
# handle loading of lists from csv with ast.literal_eval()
import ast
def load_list(x):
    """Turn the string form of a Python literal (e.g. a list) back into an object.

    Falsy input (such as an empty CSV cell read as "") yields None; anything
    else is handed to ast.literal_eval.
    """
    if not x:
        return None
    return ast.literal_eval(x)
# The two list-valued columns are parsed with load_list while reading the file.
metadata = pd.read_csv(
    "../scraping/6_filtered_videos_final/filtered_metadata.csv",
    sep=";",
    header=0,
    converters=dict(tags=load_list, yt_auto_categories=load_list),
)
#metadata.head()
# Only these metadata columns are attached to the chunks (video_id is the join key);
# note that tags do not exist for all videos.
prompt_metadata = metadata[["video_id", "uploader_id", "yt_video_type", "title", "first_three_tags"]]
nrows_before_merge = len(data)
data = data.merge(prompt_metadata, on="video_id", how="left")

# A left merge only changes the row count if a video_id matches several metadata rows.
if len(data) != nrows_before_merge:
    print("Warning: Check merging, number of rows changed. Possibly duplicates in metadata dfs?")
data.head()

Loaded df with 80176 rows (=chunks) and 45967 unique video_ids.


Unnamed: 0,video_id,chunk_number,chunk_text,uploader_id,yt_video_type,title,first_three_tags
0,i2bUeO1ID30,1,my grandma thinks Christmas is expensive so I'...,@JennyHoyosLOL,short,$5 Christmas Gift,"christmas, Christmas present, christmas presen..."
1,VvEBCXHx-74,1,you can find golden dirt this is a 25 bag of d...,@JennyHoyosLOL,short,I Bought $25 Dirt to Find Gold,"pay dirt, gold prospecting, gold mining"
2,CEdnanNgS3k,1,one dollar chicken sandwich now Chick-fil-A ha...,@JennyHoyosLOL,short,$1 Chicken Sandwich vs Chick-Fil-A,"chick fil a, chicken sandwich, food hacks"
3,jOc1XfFNJTo,1,Logan Paul made from Prime apparently over 100...,@JennyHoyosLOL,short,How Much Logan Paul Made From Prime,"logan paul, ksi prime, drink prime"
4,Gs0QiMVkUAw,1,two dollar pumpkin spice lattes apparently you...,@JennyHoyosLOL,short,$2 Pumpkin Spice Latte at Starbucks,"pumpkin spice latte, pumpkin spice, starbucks ..."


In [7]:
# save to csv, ready for huggingface upload
#data.to_csv(f"../data/transcript_chunks/{filename}_with_metadata_for_prompt.csv", sep=";", index=False)

In [2]:
# load from csv
import pandas as pd
# Set explicitly so this cell runs on a fresh kernel and does not pick up a
# `filename` that another cell may have reassigned.
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral"
data = pd.read_csv(f"../data/transcript_chunks/{filename}_with_metadata_for_prompt.csv", sep=";")

# convert to huggingface dataset
from datasets import Dataset
dataset = Dataset.from_pandas(data)
print(dataset)
# the DataFrame is no longer needed once the Dataset exists
del data

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['video_id', 'chunk_number', 'chunk_text', 'uploader_id', 'yt_video_type', 'title', 'first_three_tags'],
    num_rows: 80176
})


In [3]:
# load dataset from hf

from datasets import load_dataset
# Set explicitly so this cell runs on a fresh kernel and does not pick up a
# `filename` that another cell may have reassigned.
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral"
dataset = load_dataset("JanJacobsen/youtube_finfluencer_transcripts", 
                       data_files=f"{filename}_with_metadata_for_prompt.csv",
                       sep=";")
dataset = dataset["train"] # if there are no splits hf loads as "train" by default
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['video_id', 'chunk_number', 'chunk_text', 'uploader_id', 'view_count', 'yt_video_type', 'title', 'first_three_tags'],
        num_rows: 82991
    })
})


### Splitting the dataset

- if labelling and/or finetuning turn out to be too slow, we can add back some examples to the inference set later (while being careful not to introduce bias of course)
- Note: the data is shuffled in the process, so if we want the inference set ordered by descending average views per channel, we need to merge it again with the ordered index file.

In [6]:
# split dataset into inference, finetuning and validation sets

seed = 42
ft_size = 400
valid_size = 150

# First carve the pool of examples to be labelled (finetuning + validation) off the
# full dataset. The result gets its own name so that `dataset` is not overwritten
# and this cell can be re-run without reloading the data.
pool_split = dataset.train_test_split(test_size=ft_size+valid_size, seed=seed)
ds_inference = pool_split["train"]

# Split the pool once and take both parts from the same result.
ft_val_split = pool_split["test"].train_test_split(test_size=valid_size, seed=seed)
ds_finetuning = ft_val_split["train"]
ds_validation = ft_val_split["test"]

# The three splits must partition the original dataset.
assert len(ds_inference) + len(ds_finetuning) + len(ds_validation) == len(dataset)

print(f"Inference set: {len(ds_inference)} samples")
print(f"Finetuning set: {len(ds_finetuning)} samples")
print(f"Validation set: {len(ds_validation)} samples")


Inference set: 79626 samples
Finetuning set: 400 samples
Validation set: 150 samples


### Prompt formatting function(s)

In [1]:
# implemented in LLM_utils.py
from LLM_utils import format_prompt

In [45]:
# test prompt formatting
# Format one example from the inference set in "mistral" format
# (answer tease enabled, include_eos disabled) and print the resulting prompt.
sample_row = ds_inference[2345]
formatted = format_prompt(
    feature_row=sample_row,
    prompt_format="mistral",
    include_answer_tease=True,
    include_eos=False,
)
prompt = formatted['prompt']
print(prompt)

<s>[INST] The triple-quoted text below is part of a youtube video transcript by channel @josephhogue with the title '5 Growth Stocks to Buy Now for the 2022 Rebound'. The top tags for the video are: 'growth stocks, growth stocks 2022, best growth stocks'. Read the transcript carefully in order to perform the asset name extraction task specified below the transcript.

"""growth stocks are getting slammed again in 2022 with shares of disruptive companies like shopify down 35 percent and zoom plunging 60 over the last year these are companies changing the world and anyone holds the potential to 10x your money but the pain is expected to continue throughout this year so you need to know how to find these best stocks in this video i'll show you exactly how to find the growth stocks to buy in 2022 with a simple stock screener to start your list and how to narrow it down i'll then reveal the five grow stocks to buy now one with a 75 return just to the average analyst target before we get star

### Saving the three datasets (+ adding prompts)

In [28]:
# save splits to csv
path = "../data/transcript_chunks/splits"
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt"

# One semicolon-separated file per split; the prefix identifies the split.
inf_csv = f"{path}/INF_{filename}.csv"
ft_csv = f"{path}/FT_{filename}.csv"
val_csv = f"{path}/VAL_{filename}.csv"

ds_inference.to_csv(inf_csv, sep=";", index=False)
ds_finetuning.to_csv(ft_csv, sep=";", index=False)
ds_validation.to_csv(val_csv, sep=";", index=False)

Creating CSV from Arrow format: 100%|██████████| 80/80 [00:08<00:00,  9.51ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 25.05ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 63.32ba/s]


913199

In [34]:
# check to make sure csv/pandas/arrow conversions don't introduce any discrepancies
path = "../data/transcript_chunks/splits"
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt"
loaded_inf = Dataset.from_pandas(pd.read_csv(f"{path}/INF_{filename}.csv", sep=";"))
loaded_ft = Dataset.from_pandas(pd.read_csv(f"{path}/FT_{filename}.csv", sep=";"))
loaded_val = Dataset.from_pandas(pd.read_csv(f"{path}/VAL_{filename}.csv", sep=";"))

def report_discrepancies(loaded_ds, reference_ds, desc):
    """Print the differences between a reloaded split and its in-memory original.

    Args:
        loaded_ds: Dataset that was read back from csv.
        reference_ds: Dataset that was written to csv.
        desc: Name of the split, used only in the printed messages.

    Returns:
        None. Nothing is printed when both datasets convert to equal DataFrames.
    """
    loaded_df = loaded_ds.to_pandas()
    reference_df = reference_ds.to_pandas()
    if loaded_df.equals(reference_df):
        return

    # DataFrame.compare only accepts identically-labeled frames, so report a
    # shape/label mismatch directly instead of letting compare() raise.
    same_labels = (loaded_df.index.equals(reference_df.index)
                   and loaded_df.columns.equals(reference_df.columns))
    if not same_labels:
        print(f"Discrepancies found in {desc} data: rows/columns differ "
              f"(loaded shape {loaded_df.shape}, original shape {reference_df.shape}).")
        return

    diff = loaded_df.compare(reference_df)
    if diff.empty:
        # equals() also fails on dtype differences, which compare() does not list.
        dtype_columns = [col for col in loaded_df.columns
                         if loaded_df[col].dtype != reference_df[col].dtype]
        print(f"No value differences in {desc} data, but dtypes differ in columns: {dtype_columns}")
        return

    # The first level of the compare() columns holds the names of all differing columns.
    differing_columns = list(diff.columns.get_level_values(0).unique())
    print(f"Discrepancies found in the following columns of {desc} data: {differing_columns}")
    print(diff)

report_discrepancies(loaded_inf, ds_inference, "inference")
report_discrepancies(loaded_ft, ds_finetuning, "finetuning")
report_discrepancies(loaded_val, ds_validation, "validation")

In [6]:
# load finetuning and validation sets
from datasets import Dataset
import pandas as pd
path = "../data/transcript_chunks/splits"
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt"

# Read each split csv into a DataFrame first, then wrap it as a Dataset.
ft_frame = pd.read_csv(f"{path}/FT_{filename}.csv", sep=";")
val_frame = pd.read_csv(f"{path}/VAL_{filename}.csv", sep=";")

ds_finetuning = Dataset.from_pandas(ft_frame)
ds_validation = Dataset.from_pandas(val_frame)

In [7]:
# save an adjusted version of ft and valid splits to csv for labelling

# Settings shared by both splits: "plain" prompt format with answer tease,
# label and eos switched off.
plain_prompt_kwargs = {
    "prompt_format": "plain",
    "include_answer_tease": True,
    "include_label": False,
    "include_eos": False,
}
# Columns (and their order) written to the labelling files.
labeling_columns = ['video_id', 'chunk_number', 'yt_video_type', 'uploader_id', 'prompt', 'title', 'first_three_tags', 'chunk_text']

# finetuning split: add the prompt column, keep the selected columns, save
ds = ds_finetuning.map(format_prompt, fn_kwargs=plain_prompt_kwargs).select_columns(labeling_columns)
ds.to_csv(f"../data/transcript_chunks/labeling/FT_{filename}_for_labeling.csv", sep=";", index=False)

# validation split: same steps
ds = ds_validation.map(format_prompt, fn_kwargs=plain_prompt_kwargs).select_columns(labeling_columns)
ds.to_csv(f"../data/transcript_chunks/labeling/VAL_{filename}_for_labeling.csv", sep=";", index=False)

Map: 100%|██████████| 400/400 [00:00<00:00, 6921.24 examples/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  8.83ba/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 8795.30 examples/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 30.36ba/s]


2037876

### Labeled datasets

In [1]:
# read in data from excel sheet with labels

import pandas as pd

labeled_path = "../data/transcript_chunks/labeling/labeling_sheet.xlsx"
# Read both sheets in a single call; with a list of sheet names read_excel returns
# a dict keyed by sheet name. header=1 takes the column names from the second row.
labeled_sheets = pd.read_excel(labeled_path, sheet_name=["VAL_labeling", "FT_labeling"], header=1)
val_labeled = labeled_sheets["VAL_labeling"]
ft_labeled = labeled_sheets["FT_labeling"]


In [2]:
# keep only the merge keys (video_id, chunk_number) and the label itself
cols = ['video_id', 'chunk_number', 'label']
val_labeled, ft_labeled = val_labeled.loc[:, cols], ft_labeled.loc[:, cols]

In [6]:
# confirm all labels match json schema
import json
from jsonschema import validate
from LLM_utils import output_json_schema_string

# Parse the schema string imported from LLM_utils into a Python object so it can
# be passed to jsonschema's validate().
output_json_schema = json.loads(output_json_schema_string)

def validate_labels(df, desc_str):
    """Print every label in ``df`` that is not loadable JSON or violates the output schema.

    Args:
        df: DataFrame with the columns 'video_id', 'chunk_number' and 'label'.
        desc_str: Short name of the split, used only in the printed messages.

    Returns:
        None. Problems are reported with print; no output means none were found.
    """
    for _, row in df.iterrows():
        try:
            loaded = json.loads(row['label'])
        except (TypeError, ValueError):
            # TypeError: the label is not a string (e.g. an empty cell read as NaN).
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            print(f"Non-loadable label found in {desc_str} labels (video_id {row['video_id']}, chunk_number {row['chunk_number']}):\n{row['label']}")
            # No parsed value exists for this row; falling through would either fail
            # on an unbound `loaded` or validate the previous row's label instead.
            continue
        # NOTE(review): falsy parsed values (e.g. "[]") are not checked against the
        # schema - confirm that this is intended.
        if loaded:
            try:
                validate(loaded, output_json_schema)
            except Exception:
                print(f"Schema-violating json found in {desc_str} labels (video_id {row['video_id']}, chunk_number {row['chunk_number']}):\n{row['label']}")

# Run the check on both labelled splits (validation first, then finetuning).
for labeled_df, split_name in ((val_labeled, "VAL"), (ft_labeled, "FT")):
    validate_labels(labeled_df, split_name)
# no output -> everything matches

In [7]:
# load original ft and val splits to merge with labels
splits_path = "../data/transcript_chunks/splits"
filename = "transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt"
ds_ft = pd.read_csv(f"{splits_path}/FT_{filename}.csv", sep=";")
ds_val = pd.read_csv(f"{splits_path}/VAL_{filename}.csv", sep=";")

print(f"Shapes before merging - FT: {ds_ft.shape}, VAL: {ds_val.shape}")
n_ft_before, n_val_before = len(ds_ft), len(ds_val)
# merge labels
ds_ft = ds_ft.merge(ft_labeled, on=["video_id", "chunk_number"], how="left")
ds_val = ds_val.merge(val_labeled, on=["video_id", "chunk_number"], how="left")

print(f"Shapes after merging - FT: {ds_ft.shape}, VAL: {ds_val.shape}")

# A left merge keeps rows without a matching label (label becomes NaN) and
# duplicates rows whose key occurs several times in the label sheet. The shapes
# alone do not show the first case, so both are checked explicitly.
for split_name, merged, n_before in (("FT", ds_ft, n_ft_before), ("VAL", ds_val, n_val_before)):
    if len(merged) != n_before:
        print(f"Warning: number of {split_name} rows changed from {n_before} to {len(merged)} during merging. Possibly duplicate keys in the labels?")
    n_unlabeled = int(merged["label"].isna().sum())
    if n_unlabeled:
        print(f"Warning: {n_unlabeled} {split_name} rows have no label after merging.")

Shapes before merging - FT: (400, 7), VAL: (150, 7)
Shapes after merging - FT: (400, 8), VAL: (150, 8)


In [9]:
# write both labelled splits next to the unlabelled ones, marked by the "_with_labels" suffix
for split_prefix, labeled_frame in (("FT", ds_ft), ("VAL", ds_val)):
    labeled_frame.to_csv(f"{splits_path}/{split_prefix}_{filename}_with_labels.csv", sep=";", index=False)

### Biased Version of Finetuning Set

- Since the high frequency of "[]" labels (i.e. transcripts with no recommendations) may be making the model too conservative in extracting recommendations, we create versions of the finetuning set in which the "[]" examples make up a smaller fraction of the data (by removing some of them).

In [33]:
# load ft set
import pandas as pd
ft_path = "../data/transcript_chunks/splits"
ft_filename = "FT_transcript_chunks_nvids45968_chunksize2048_overlap50_tokMistral_with_metadata_for_prompt_with_labels"

df_ft = pd.read_csv(ft_path + "/" + ft_filename + ".csv", sep=";")

# A label counts as empty when it is exactly the string '[]'.
is_empty_label = df_ft['label'] == '[]'
n_empty = int(is_empty_label.sum())
p_empty = n_empty / len(df_ft)
print(f"{n_empty}/{len(df_ft)} empty labels found in FT set.")
print(f"Proportion of empty labels: {p_empty:.4f}")

251/400 empty labels found in FT set.
Proportion of empty labels: 0.6275


In [34]:
# set desired proportion of empty labels in dataset
p_empty_desired = 0.5
seed = 42

# NOTE: this cell overwrites df_ft and relies on p_empty from the loading cell,
# so re-run the loading cell before running this one again.
if p_empty_desired > p_empty:
    raise ValueError("Desired proportion of empty labels is bigger than the actual proportion of empty labels in the dataset.")
if not 0 <= p_empty_desired < 1:
    # 1 would divide by zero below; a negative value would ask for more removals than possible
    raise ValueError("Desired proportion of empty labels must be in the interval [0, 1).")

# calculate number of empty labels to remove:
# solving (n_empty - r) / (n_total - r) = p_empty_desired for r gives
# r = (p_empty - p_empty_desired) * n_total / (1 - p_empty_desired)
n_empty_to_remove = int((p_empty - p_empty_desired) * len(df_ft) / (1 - p_empty_desired))

# pass the seed so that the same rows are removed on every run
empty_rows = df_ft[df_ft['label'] == '[]']
df_ft = df_ft.drop(empty_rows.sample(n=n_empty_to_remove, random_state=seed).index)

n_empty_after = len(df_ft[df_ft['label'] == '[]'])
p_empty_after = n_empty_after/len(df_ft)

print(f"Removed {n_empty_to_remove} empty labels from FT set.")
print(f"New empty/total examples: {n_empty_after}/{len(df_ft)}")

print(f"New empty label proportion: {p_empty_after:.4f}")

Removed 101 empty labels from FT set.
New empty/total examples: 150/299
New empty label proportion: 0.5017


In [35]:
# save biased version of dataset
# ft_filename[3:] drops the leading "FT_" so the new "FT_biased<proportion>empty_" prefix replaces it.
biased_filename = f"FT_biased{p_empty_after:.2f}empty_{ft_filename[3:]}.csv"
df_ft.to_csv(f"{ft_path}/{biased_filename}", sep=";", index=False)