# Transform Pandas DF to Verseagility-JSON

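- Give a pandas dataframe as input with the columns "label" and "text" to bring it into the correct JSON format. Note that `transform_json` below reads the columns `id`, `views`, `appliesTo`, `url`, `question.title`, `question.author`, `question.createdAt`, `question.text`, `question.upvotes`, `answer.markedAsAnswer`, `answer.createdAt`, `answer.text` and `answer.upvotes` from every row, so the dataframe must provide them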
- Just keep the other dummy variables as they are to make sure the data stays valid. You can just remove the question/answering component from your case file, so that this part does not get trained
- You can also change the values for url, date etc. if you have a respective column in your dataset

In [None]:
# Import packages
import pandas as pd
import uuid
import json
import logging

In [None]:
# Input file: adjust the path, separator and encoding to match your data.
# As written, the file is read as tab-separated, UTF-8 encoded text.
fname = 'file.csv'
df = pd.read_csv(fname, sep="\t", encoding="utf-8")

In [None]:
def _row_to_document(row, language):
    """Build the JSON-serialisable document dict for one DataFrame row."""
    question = {
        'title': row['question.title'],
        'author': row['question.author'],
        'createdAt': row['question.createdAt'],
        'text': row['question.text'],
        'upvotes': int(row['question.upvotes']),
    }
    answer = {
        'markedAsAnswer': str(row['answer.markedAsAnswer']),
        'createdAt': row['answer.createdAt'],
        'text': row['answer.text'],
        'upvotes': int(row['answer.upvotes']),
    }
    # Key order is kept stable so the generated files stay comparable.
    return {
        'question': question,
        'id': row['id'],
        'views': row['views'],
        'appliesTo': row['appliesTo'],
        'url': row['url'],
        'language': language,
        'answer': answer,
    }


def transform_json(df, language, chunk=0):
    """Write the rows of ``df`` as a JSON array to ``output-<language>-train-<chunk>.json``.

    Args:
        df: DataFrame providing the columns ``id``, ``views``, ``appliesTo``,
            ``url``, ``question.title``, ``question.author``,
            ``question.createdAt``, ``question.text``, ``question.upvotes``,
            ``answer.markedAsAnswer``, ``answer.createdAt``, ``answer.text``
            and ``answer.upvotes``.
        language: Value stored in each document's ``language`` field and used
            in the output file name.
        chunk: Chunk index used in the output file name.

    Returns:
        None. The file is written to the current working directory and then
        re-read to verify that it parses as JSON; the outcome is logged.
    """
    fname = f"output-{language}-train-{chunk}.json"
    k = 0
    # One file handle for the whole array. The closing bracket is always
    # written, so an empty DataFrame yields a valid "[]" instead of "[".
    with open(fname, "w", encoding='utf-8') as file:
        file.write("[")
        for _, row in df.iterrows():
            if k:
                file.write(",")
            document = _row_to_document(row, language)
            file.write(json.dumps(document, indent=4, separators=(',', ': '), ensure_ascii=False))
            k += 1
        file.write("]")

    # Read back with the same encoding that was used for writing; relying on
    # the platform default can misreport valid UTF-8 output as broken.
    try:
        with open(fname, encoding='utf-8') as f:
            json.load(f)
    except (OSError, ValueError) as e:
        logging.error(f'File {fname} seems to be invalid -> {e}.')
    else:
        logging.info(f'[INFO] - File {fname} is valid!')
    logging.info(f"[SUCCESS] - File {chunk} -> {k} / {len(df)}")

In [None]:
# Transform the whole DataFrame into one JSON file (the chunk index defaults to 0).
# Use this path only if your data set has fewer than 10,000 documents; larger data sets
# should be split into chunks first.
transform_json(df, "en-us")

# Chunk the data sets
## Please note:
- For training data corpora larger than 10.000 documents, we recommend uploading them chunk-wise to the BLOB storage; otherwise, bottlenecks may occur in the document processor function
    - If you have fewer than 10.000 documents, you may go ahead and simply upload the file to the BLOB storage using the Azure Storage Explorer
- The following section helps you to read a large file and split it into chunks
- Below, there is a script to upload them one-by-one, pausing for ten minutes after each upload to unload the pipeline

In [None]:
def get_chunks(lang, language):
    """Convert ``data_<lang>.txt`` to JSON files, 5000 rows at a time.

    The tab-separated, UTF-8 encoded input is read chunk-wise and every
    chunk is handed to ``transform_json`` together with its running index
    (starting at 0).

    Args:
        lang: Suffix of the input file name (``data_<lang>.txt``).
        language: Language value forwarded to ``transform_json``.
    """
    print(f'[INFO] - Start reading data chunks for {lang}.')
    reader = pd.read_csv(f'data_{lang}.txt', sep="\t", encoding='utf-8', chunksize=5000)
    for chunk_no, frame in enumerate(reader):
        transform_json(frame, language, chunk_no)

In [None]:
# `lang` selects the input file (data_<lang>.txt); `language` is the value written into
# each document and used in the output file names. Both were previously undefined here,
# which raised a NameError. Adjust the values to your data set.
lang = "en"
language = "en-us"
get_chunks(lang, language)

# Copy to BLOB
- Upload all the files of the export folder to the BLOB storage

In [None]:
import logging
import os
import time
from datetime import datetime

import pandas as pd
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings

In [None]:
def copy_to_blob(local_file_path, blobstring, container, pause_seconds=600):
    """Upload every file below ``local_file_path`` to a BLOB container, one by one.

    Args:
        local_file_path: Local folder that is walked recursively.
        blobstring: Connection string passed to
            ``BlobServiceClient.from_connection_string``.
        container: Name of the target container.
        pause_seconds: Seconds to wait between two uploads (default 600,
            i.e. ten minutes).

    Returns:
        None. Progress and failures are logged. A failed upload is logged and
        skipped so the remaining files are still processed.
    """
    logging.info('[INFO] - Initiating upload to BLOB-storage.')
    blob_service_client = BlobServiceClient.from_connection_string(blobstring)
    logging.info('[INFO] - Built connection to BLOB storage.')

    # Collect all files up front so the last one is known and no pause is
    # spent after it.
    uploads = []
    for path, _subdirs, files in os.walk(local_file_path):
        for name in files:
            path_full = os.path.join(path, name)
            # Blob name = path relative to the upload root, with forward
            # slashes. A plain str.replace() left a leading slash when the
            # root had no trailing separator and kept backslashes on Windows.
            path_blob = os.path.relpath(path_full, local_file_path).replace(os.sep, "/")
            uploads.append((path_full, path_blob))

    failed = 0
    for position, (path_full, path_blob) in enumerate(uploads, start=1):
        try:
            logging.info(f'[UPLOAD - {datetime.now()}] - Uploading to Azure Storage as BLOB: {path_blob}.')
            blob_client = blob_service_client.get_blob_client(container=container, blob=path_blob)
            # Upload the created file
            with open(path_full, "rb") as data:
                blob_client.upload_blob(data, content_settings=ContentSettings(content_type='application/json'))
        except Exception as e:
            # Best effort: keep going with the remaining files.
            failed += 1
            logging.error(f'[STATUS - {datetime.now()}] - Copy to BLOB failed -> {e}.')
            continue
        if position < len(uploads):
            logging.info(f'[INFO - {datetime.now()}] - Upload completed, sleeping for {pause_seconds / 60:g} minutes ... zZz ...')
            time.sleep(pause_seconds)
        else:
            logging.info(f'[INFO - {datetime.now()}] - Upload completed.')

    # Only report success when every file actually made it.
    if failed:
        logging.error(f'[STATUS - {datetime.now()}] - Upload finished, {failed} of {len(uploads)} file(s) failed.')
    else:
        logging.info(f'[STATUS - {datetime.now()}] - Successfully uploaded to BLOB.')

In [None]:
# Connection string of the target storage account: replace the ###...### placeholders
# with your own account name and account key before running the upload.
blobstring = "DefaultEndpointsProtocol=https;AccountName=###getyourblobstringhere###;AccountKey=###getyourkeyhere###;EndpointSuffix=core.windows.net"

In [None]:
# Upload everything below the export folder to the "data" container.
# NOTE(review): `lang` must already be defined in an earlier cell - confirm before running.
# NOTE(review): transform_json writes its files to the current working directory, so they
# presumably have to be moved into export-<lang>/ first - verify.
copy_to_blob(f"export-{lang}/", blobstring, "data")