In [1]:
import pandas as pd
import os 
import json

In [2]:
# Root location of the dataset splits; the cells below list it and join
# training-method names (e.g. "pet_50", "balanced_dev_train") onto it.
# The PESV_SPLITS_DIR environment variable overrides the default so the
# notebook can run where the data lives elsewhere; when it is unset the
# original cluster path is used unchanged.
splits_dir = os.environ.get(
    "PESV_SPLITS_DIR",
    "/mnt/beegfs/home/vasquez_reina/alvisnlp/lv_stage/datasets/clean_pesv_preprocessed/split",
)

In [3]:
# Show the entries found under the splits root (one per training method).
os.listdir(splits_dir)

['pet_200',
 'pet_100',
 'pet_500',
 'balanced_dev_train',
 'pet_1000',
 'unbalanced',
 'pet_50']

# Scratch

In [5]:
# Scratch sample: the "dev" split of the abstract content for the pet_50 method.
# Built from splits_dir instead of repeating the absolute root path, so the
# sample follows the root if it is ever relocated.
test_df_path = os.path.join(
    splits_dir,
    "pet_50",
    "parsed_trafilatura_abstract",
    "dev.csv",
)

In [6]:
# Load the sample split into a DataFrame for ad-hoc inspection.
test_df = pd.read_csv(test_df_path)

In [12]:
# Number of rows per distinct `has_subject` value, as a plain dict.
test_df['has_subject'].value_counts().to_dict()

{0: 848, 1: 848}

# Get statistics

In [14]:
# Display label of each content source (used in the "Content Source" column of
# the statistics table) -> name of the sub-directory holding its split files.
CONTENT_NAME_TO_FULL_NAME = {
    # Raw parsed content
    "Title": "parsed_trafilatura_title",
    "Abstract": "parsed_trafilatura_abstract",
    "Full text": "parsed_trafilatura_fulltext",
    "Translated Title": "translation_title",
    # Keyword-sentence variants of the abstract
    "Phrases with Keywords (Abstract)":
        "sentence_with_keywords_parsed_trafilatura_abstract_only_relevant_sentences",
    "Phrases with Keywords + O.C (Abstract)":
        "sentence_with_keywords_parsed_trafilatura_abstract_keep_original_content",
    # Keyword-sentence variants of the full text
    "Phrases with Keywords (Full text)":
        "sentence_with_keywords_parsed_trafilatura_fulltext_only_relevant_sentences",
    "Phrases with Keywords + O.C (Full text)":
        "sentence_with_keywords_parsed_trafilatura_fulltext_keep_original_content",
}


In [13]:
def get_total_pos_neg_count(df_path):
    """Count the rows, positives and negatives of one split CSV.

    Parameters
    ----------
    df_path : str or path-like
        Path of a CSV file that contains a ``has_subject`` column.

    Returns
    -------
    tuple
        ``(size, pos_count, neg_count)`` where ``size`` is the number of rows,
        ``pos_count`` the number of rows with ``has_subject == 1`` and
        ``neg_count`` the number of rows with ``has_subject == 0``.

    Raises
    ------
    KeyError
        If the CSV has no ``has_subject`` column.
    """
    df = pd.read_csv(df_path)

    pos_neg_counts = df['has_subject'].value_counts().to_dict()
    # value_counts() only reports values that occur, so a file with a single
    # class (or no labelled rows at all) lacks one or both keys; count those
    # as 0 instead of raising KeyError.
    pos_count = pos_neg_counts.get(1, 0)
    neg_count = pos_neg_counts.get(0, 0)
    size = len(df)
    return (size, pos_count, neg_count)

In [34]:
def get_statistics_for_training_method(
    trainig_method,
    dir
):
    """Build a table of split sizes for every content source of one training method.

    For each entry of ``CONTENT_NAME_TO_FULL_NAME`` the function looks in
    ``<dir>/<trainig_method>/<content directory>`` for the files
    ``train.csv``, ``dev.csv`` and ``test.csv`` (plus ``unlabeled.csv`` when the
    method name contains "pet") and records the size, positive count and
    negative count of each.

    Parameters
    ----------
    trainig_method : str
        Name of the training-method sub-directory (e.g. ``"balanced_dev_train"``).
        The parameter name is kept as-is for backward compatibility.
    dir : str
        Root directory containing the training-method sub-directories.
        (The name shadows the ``dir`` builtin; kept for backward compatibility.)

    Returns
    -------
    pandas.DataFrame
        One row per content source, with a ``"Content Source"`` column followed
        by ``"<Split> - Size"``, ``"<Split> - Positives"`` and
        ``"<Split> - Negatives"`` columns for every expected split. A split
        whose CSV file does not exist is reported as missing values.
    """
    dir = os.path.join(dir, trainig_method)

    expected_splits = ["train", "dev", "test"]
    quantities = ("Size", "Positives", "Negatives")

    if "pet" in trainig_method.lower():
        expected_splits.append("unlabeled")

    results_dict = {"Content Source": []}
    results_dict.update(
        (f"{split_name.capitalize()} - {quantity}", [])
        for split_name in expected_splits
        for quantity in quantities
    )

    for content_name, content_full_name in CONTENT_NAME_TO_FULL_NAME.items():
        splits_dir_for_content = os.path.join(dir, content_full_name)
        results_dict["Content Source"].append(content_name)

        # Walk the expected splits rather than the directory listing: every
        # column then receives exactly one value per content source, and stray
        # entries in the directory (checkpoints, extra files) cannot raise
        # KeyError or leave the columns with unequal lengths.
        for split_name in expected_splits:
            split_path = os.path.join(splits_dir_for_content, f"{split_name}.csv")

            if os.path.isfile(split_path):
                values = get_total_pos_neg_count(split_path)
            else:
                # Missing split: keep the row aligned with placeholder values.
                values = (None,) * len(quantities)

            for quantity, value in zip(quantities, values):
                results_dict[f"{split_name.capitalize()} - {quantity}"].append(value)

    return pd.DataFrame(results_dict)



In [35]:
# Preview the statistics table for the "balanced_dev_train" splits.
get_statistics_for_training_method("balanced_dev_train", splits_dir)

Unnamed: 0,Content Source,Train - Size,Train - Positives,Train - Negatives,Dev - Size,Dev - Positives,Dev - Negatives,Test - Size,Test - Positives,Test - Negatives
0,Title,12976,6488,6488,1622,811,811,1621,230,1391
1,Abstract,13578,6789,6789,1696,848,848,1696,234,1462
2,Full text,18028,9014,9014,2252,1126,1126,2252,274,1978
3,Translated Title,13086,6543,6543,1634,817,817,1634,227,1407
4,Phrases with Keywords (Abstract),3240,1620,1620,404,202,202,404,108,296
5,Phrases with Keywords + O.C (Abstract),13466,6733,6733,1682,841,841,1682,227,1455
6,Phrases with Keywords (Full text),11486,5743,5743,1434,717,717,1434,222,1212
7,Phrases with Keywords + O.C (Full text),16704,8352,8352,2088,1044,1044,2087,253,1834


In [36]:
# Write one "<training method>_splits_stats.csv" file (in the current working
# directory) for every entry found under the splits root.
for training_method in os.listdir(splits_dir):
    df = get_statistics_for_training_method(training_method, splits_dir)
    df.to_csv(f"{training_method}_splits_stats.csv")