In [40]:
import numpy as np
import pandas as pd
import os

import re
import json

Manual for the config file:
- `dataset_file_name`: Please include the **full** file name including the file extension, e.g. data.csv, data.tsv. If the dataset is split into several files, separate the file names with a comma `,`.
- `dataset_name`: The name of the dataset.
- `label_name_definition`: The label names and their corresponding definitions, written as a `JSON` object that maps each label name to its definition.
- `source`: For data from a single source, state the source name with an `@` symbol in front, e.g. *@Twitter*, *@Facebook*. For multi-source data, provide the name of the column that holds the source.
- `language`: For a single language, use the language code as stipulated in ISO 639-2 (e.g. eng, spa, chi, ger, fre, ita) with an `@` symbol in front, e.g. *@eng*. For multi-language content, provide the name of the column describing this property, e.g. languages.
- `text`: The column name of text.

**Note**:
1. Only datasets in CSV (comma-separated) or TSV (tab-separated) format are supported.
2. The config file itself (`config.csv`) is read as a semicolon-separated (`;`), UTF-8 encoded file.

### Utils

In [53]:
def strip_comma_from_text_list(orig_list):
    """
    Flatten a list of strings whose entries may hold several comma-separated values.

    Args:
    - orig_list (list[str]): Strings that each contain either a single value or
      several values separated by commas.

    Returns:
    - list[str]: All individual values in their original order, with surrounding
      whitespace removed. Blank pieces of a comma-separated entry are dropped;
      an entry without any comma is always kept.
    """
    res = []
    for val in orig_list:
        if ',' in val:
            # Split on commas, trim each piece and drop blank ones (e.g. "a,,b" or "a, ").
            res.extend(item.strip() for item in val.split(',') if item.strip())
        else:
            # Single value: trim it too so both branches return clean names.
            res.append(val.strip())

    return res

In [None]:
def is_columns_in_datasets(df, col_list):
    """
    Check that every column named in `col_list` exists in `df`.

    Args:
    - df (pandas.DataFrame): Dataset whose columns are checked.
    - col_list (list): Column names that must be present in `df`.

    Raises:
    - AssertionError: If one or more names in `col_list` are not columns of `df`.
    """
    existing_cols = set(df.columns)
    # dict.fromkeys de-duplicates while keeping the order of col_list, so the
    # error message lists the missing columns in a deterministic order.
    cols_not_in_dataset = [col for col in dict.fromkeys(col_list) if col not in existing_cols]
    # Raise explicitly instead of using `assert`, which is skipped under `python -O`.
    if cols_not_in_dataset:
        raise AssertionError(f"In the list provided, {cols_not_in_dataset} are not found in the given dataset")

In [None]:
def read_dataframe(file_path):
    """
    Read a DataFrame from a CSV or TSV file.

    The format is chosen from the file extension, compared case-insensitively
    so that names such as `DATA.CSV` or `data.Tsv` are accepted as well.

    Args:
    - file_path (str or os.PathLike): Path to the file.

    Returns:
    - pandas.DataFrame: DataFrame containing the data from the file.

    Raises:
    - ValueError: If the file extension is neither `.csv` nor `.tsv`.
    """
    # Normalise only for the extension check; the path passed to pandas is unchanged.
    normalized_path = str(file_path).lower()
    if normalized_path.endswith('.csv'):
        # Read CSV file (pandas' default separator is a comma)
        df = pd.read_csv(file_path)
    elif normalized_path.endswith('.tsv'):
        # Read TSV file
        df = pd.read_csv(file_path, sep='\t')
    else:
        # Unsupported file type
        raise ValueError("Unsupported file type. Only CSV and TSV files are supported.")

    return df

### General Processing

In [54]:
# Load the semicolon-delimited config file and trim surrounding whitespace
# in every object-dtype column; columns of any other dtype pass through as-is.
df_config = (
    pd.read_csv("config.csv", sep=";", encoding="utf-8")
    .apply(lambda column: column.str.strip() if column.dtype == "object" else column)
)
df_config

Unnamed: 0,dataset_file_name,dataset_name,label_name_definition,source,language,text
0,olid-training-v1.0.tsv,OLID19,"{""subtask_a"": ""Level A: Offensive language ide...",@Twitter,@eng,tweet
1,"SBFv2.dev.csv, SBFv2.trn.csv, SBFv2.tst.csv",SBFv2,"{""intentYN"":"""", ""sexYN"":"""", ""sexReason"":"""", ""o...",@Twitter,@eng,post


In [55]:
# Folder the dataset files are read from.
folder_path = "./data"
# Keep regular files only; sub-directories inside the folder are left out.
folder_path_file_names = [
    entry_name
    for entry_name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry_name))
]
folder_path_file_names

['olid-training-v1.0.tsv', 'SBFv2.dev.csv', 'SBFv2.trn.csv', 'SBFv2.tst.csv']

In [56]:
# Raw `dataset_file_name` cells; a single cell may name several files separated by commas.
config_file_name_value = df_config["dataset_file_name"].tolist()

# Flatten to one entry per individual file name.
config_file_names = strip_comma_from_text_list(config_file_name_value)
config_file_names

['olid-training-v1.0.tsv', 'SBFv2.dev.csv', 'SBFv2.trn.csv', 'SBFv2.tst.csv']

In [63]:
# The config must not list more files than the data folder contains.
# The message reports the number of files in the folder (not the number of config rows).
assert len(config_file_names) <= len(folder_path_file_names), f"Number of datasets listed in Config ({len(config_file_names)}) exceeds that in the folder {len(folder_path_file_names)}."

# Every file named in the config has to exist in the data folder.
difference_datasets = set(config_file_names) - set(folder_path_file_names)
assert len(difference_datasets) == 0, f"Dataset(s) {difference_datasets} not found in 'data' folder"
# Parse the JSON label definitions into Python objects. Only strings are parsed,
# so re-running this cell on already-converted values does not fail in json.loads.
df_config.label_name_definition = df_config.label_name_definition.apply(lambda x: json.loads(x) if isinstance(x, str) else x)
print("Filename integrity check complete!")

Filename integrity check complete!


### Row-specific Operation

In [65]:
# For every config row, verify that each dataset file it lists contains the expected columns.
for idx, row in df_config.iterrows():
    datasets = strip_comma_from_text_list([row.dataset_file_name])
    for dataset in datasets:
        df = read_dataframe(f"{folder_path}/{dataset}")
        # Required columns: every label name plus the text column.
        col_list = [*row.label_name_definition.keys(), row.text]
        # A leading '@' marks a literal value; anything else is treated as a column name to check.
        if not row.language.startswith('@'):
            col_list.append(row.language)
        if not row.source.startswith('@'):
            col_list.append(row.source)

        print(col_list)
        is_columns_in_datasets(df, col_list)


['subtask_a', 'subtask_b', 'subtask_c', 'tweet']
['intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'post']
['intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'post']
['intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'post']
