In [1]:
import pandas as pd
import gcsfs
from google.cloud import storage
import re
import random

from training_data import TrainingData
import config

In [2]:
# helpers
# Compiled once: matches any run of two or more whitespace characters.
# A raw string is used so that ``\s`` reaches the regex engine unchanged instead of
# being treated as an (invalid) Python string escape.
_WHITESPACE_RUN = re.compile(r"\s\s+")
_GCS_SCHEME = "gs://"


def get_transform_data(load_path: str, source: str) -> list[str]:
    """Load a parquet file and return its ``text`` column with whitespace runs collapsed.

    Args:
        load_path: Location of the parquet file. When ``source`` is ``"GCS"`` the
            ``gs://`` scheme is prepended unless the path already carries it.
        source: Origin of the file. Only the value ``"GCS"`` changes behaviour;
            any other value leaves ``load_path`` untouched.

    Returns:
        One string per row of the ``text`` column, in row order, where every run of
        two or more whitespace characters has been replaced by a single space.
        A lone whitespace character (e.g. a single tab or newline) is kept as is.

    Raises:
        KeyError: If the parquet file has no ``text`` column.
    """
    # Avoid producing "gs://gs://..." when the caller already passed a full URL.
    if source == "GCS" and not load_path.startswith(_GCS_SCHEME):
        load_path = f"{_GCS_SCHEME}{load_path}"
    frame = pd.read_parquet(load_path)
    # more preprocessing can be applied
    return [_WHITESPACE_RUN.sub(" ", text) for text in frame["text"].to_list()]

In [3]:
# Download scraped data from the GCS bucket for pre-labelling via the spaCy ruler.
# 1. Build one pre-labelled TrainingData result per parquet file in the bucket,
#    then merge them into a single TrainingData instance.

# Set to True to move each file to the processed bucket once it has been pre-labelled.
# Disabled for testing.
MOVE_PROCESSED_FILES = False

# If GOOGLE_CREDENTIALS_PATH is defined in config, use it; otherwise (or if it cannot
# be used) fall back to gcsfs's default credential lookup.
credentials_path = getattr(config, "GOOGLE_CREDENTIALS_PATH", None)
fs = None
if credentials_path:
    try:
        fs = gcsfs.GCSFileSystem(token=credentials_path)
    except Exception as exc:  # best-effort: report the reason, then fall back
        print(f"Warning: could not use configured GCS credentials ({exc!r}); using default credentials")
if fs is None:
    fs = gcsfs.GCSFileSystem()

# List the bucket folder and keep only parquet files (match on the suffix so that
# names merely containing ".parquet" are not picked up).
files_list = [f for f in fs.ls(config.SCRAPED_DATA_BUCKET) if f.endswith(".parquet")]
print(files_list)

# Pre-label every file; optionally move it to the processed bucket afterwards.
training_data_list = []
for file_path in files_list:
    texts = get_transform_data(file_path, "GCS")
    labeled = TrainingData.pre_label_data(texts, config.RULES_BUCKET_NAME, config.RULES_BLOB_NAME)
    training_data_list.append(labeled)
    if MOVE_PROCESSED_FILES:
        # remove from the scraped-data bucket once processed
        file_name = file_path.split("/")[-1]
        fs.move(path1=file_path, path2=f"{config.PROCESSED_FILES_BUCKET}/{file_name}")

# Merge the per-file results to get all pre-labelled data
# (per the original note, the merge controls for duplicates).
training_data = TrainingData.merge(training_data_list)

['startdate_ml/scraped_data/test_parser_data_2022-09-08 18_18_19.parquet', 'startdate_ml/scraped_data/test_parser_data_2022-09-08 18_32_19.parquet']
Info: no ner component in pipeline
Info: no ner component in pipeline


In [4]:
# 2. Upload to Labelbox for manual labelling (for all or parts of the data).
# This step can be skipped.
# Randomize the data inside the TrainingData instance (a list), in place.
# A dedicated, seeded RNG keeps the order reproducible after Restart & Run All
# and leaves the global `random` state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(training_data.data)
# upload to Labelbox
training_data.upload_data_to_labelbox(config.LABELBOX_PROJECT_NAME)

COMPLETE, total time (s)=2.04




Total time=27.98, Time per item=1.55


In [5]:
# 3. Once the data has been manually labelled, download it from Labelbox.
# Note: the project must be completely labelled before it can be downloaded.
# The result is a separate object from `training_data`, built from the Labelbox project
# named in config.
verified_training_data = TrainingData.from_labelbox_data(config.LABELBOX_PROJECT_NAME)

In [7]:
# 4. Convert to spaCy training data and save.
# Transform to the spaCy training data format (list of docs) and dump it to the location
# given by config.SPACY_TRAINING_DATA_FILE, to be loaded by the training script (now in Colab).
verified_training_data.transform_to_spacy_and_save_docs(config.SPACY_TRAINING_DATA_FILE)

Docs binary data saved to location gs://startdate_ml/spacy_docs/training_data.spacy


In [10]:
# 5. Train RoBERTa model on training data
# The training itself is managed in Colab for performance reasons
# colab notebook: https://colab.research.google.com/drive/1vhXS2GnUE56m7mZTyc43t9wHrqqRezdj?usp=sharing
