# Generate the final dataframe, the results of all the models

This notebooks will cumulate the results of all the json file and obtain the summary dataframe

In [98]:
import json
import pandas as pd
import re
import glob
import os

In [86]:
with open(r"../data/processed/predicted_labelled_AG.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [147]:
df = pd.DataFrame(data, columns=["text", "label"])
print(df["label"].iloc[0])

{'label': [{'start': 2102, 'end': 2107, 'labels': 'AG', 'text_span': '3,9 %'}]}


In [152]:
def turn_dict_into_list_of_values(df):
    all_values = []

    for _, row in df.iterrows():
        label = row.get("label").get("label")
        values = []
        
        for item in label:
            text_span = item.get("text_span")
            matches = re.findall(r'(\d+(?:[\.,]\d+)?)\s*%?', text_span)
            
            for match in matches:
                value = float(match.replace(',', '.').rstrip('%'))  # Remove '%' and convert to float
                values.append(value)
        
        # Append the values for this row to the all_values list
        all_values.append(values)

    return all_values

In [89]:
def filter_values(all_values:list):
    filtered_values = []

    for values in all_values:
        # Use a list comprehension to filter values above 8 and replace 0.0 with None
        filtered_row = [value if value <= 8 and value != 0.0 else None for value in values]
        filtered_values.append(filtered_row)

    return filtered_values

In [90]:
def calculate_average(filtered_values:list):
    averages = []

    for row_values in filtered_values:
        # Filter out None values and calculate the average
        valid_values = [value for value in row_values if value is not None]
        
        if valid_values:  # Check if there are valid values to calculate the average
            avg = sum(valid_values) / len(valid_values)
        else:
            avg = None  # No valid values, so set average to None
        
        averages.append(avg)

    return averages

In [153]:
def from_predicted_label_to_dataset(json_file, save_to_csv=False, output_file=None):
    # Open the json file
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract label name from the json file name
    label_name = os.path.basename(json_file).split("_")[2][:-5]

    # Create a dataframe from the json file
    df = pd.DataFrame(data, columns=["text", "label"])

    # Turn the label column into a list of values
    all_values = turn_dict_into_list_of_values(df)  

    # Filter out values above 8 and replace 0.0 with None
    if label_name != "PPVm":
        filtered_values = filter_values(all_values)
        # Calculate the average of the values
        averages = calculate_average(filtered_values)
    else:
        averages = calculate_average(all_values)

    # Add the averages to the dataframe
    df["cleanned_label"] = averages

    # Drop the label column and rename the cleanned_label column to label
    df = df[['text', 'cleanned_label']].copy()
    df = df.rename(columns={'cleanned_label': f'{label_name}'})

    # Save the dataframe to a csv file
    if save_to_csv:
        df.to_csv(output_file, index=False)

    return df

df = from_predicted_label_to_dataset(r"../data/processed/predicted_labelled_AG.json", save_to_csv=True, output_file=r"../data/intermediate/AG_dataset.csv")
df.head()

Unnamed: 0,text,AG
0,l’enveloppe globale d’augmentation des rémunér...,3.9
1,nous travaillons sur une politique de rémunéra...,4.0
2,mesure d’augmentation perenne 1.1 - bénéficiai...,
3,accord. politique salariale : article 1 - augm...,3.5
4,augmentation collective une augmentation colle...,3.0


We iterate over all the predicted json files

In [154]:
directory = glob.glob(r"../data/processed/*.json")

for file in directory:
    df = from_predicted_label_to_dataset(file, save_to_csv=True, output_file=file.replace("processed", "intermediate").replace(" ","_").replace(".json", ".csv"))
    print(f"File {file} converted to csv")

File ../data/processed\predicted_labelled_AG CAD.json converted to csv
File ../data/processed\predicted_labelled_AG INT.json converted to csv
File ../data/processed\predicted_labelled_AG OUV.json converted to csv
File ../data/processed\predicted_labelled_AG.json converted to csv
File ../data/processed\predicted_labelled_AI CAD.json converted to csv
File ../data/processed\predicted_labelled_AI.json converted to csv
File ../data/processed\predicted_labelled_ATOT.json converted to csv
File ../data/processed\predicted_labelled_CAD.json converted to csv
File ../data/processed\predicted_labelled_INT.json converted to csv
File ../data/processed\predicted_labelled_NCAD.json converted to csv
File ../data/processed\predicted_labelled_OUV.json converted to csv
File ../data/processed\predicted_labelled_PPV.json converted to csv
File ../data/processed\predicted_labelled_PPVm.json converted to csv
File ../data/processed\predicted_labelled_TOUS.json converted to csv


Now we merge all the files on their text to obtain the final output

In [155]:
def concatenate_csv_files(folder_path: str, save_to_csv=False, output_file=None):
    # Initialize an empty DataFrame to store the concatenated data
    combined_df = None  # Initialize as None

    list_of_labels = ['PPVm', 'AG', 'AI', 'AG OUV', 'AG INT', 'AG CAD', 'AI OUV', 'AI INT', 'AI CAD', 'NOUV AG', 'NCAD AG', 'NOUV AI', 'NCAD AI', 'ATOT',
                      'ATOT OUV', 'ATOT INT', 'ATOT CAD']

    # Iterate through files in the specified folder
    for filename in os.listdir(folder_path):
        filename_label = filename[19:-4]

        if filename_label in list_of_labels and filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            print(f'Reading file: {file_path}')

            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)

            if combined_df is None:
                combined_df = df
            else:
                # Merge the current DataFrame with the combined DataFrame on the "text" column
                combined_df = pd.merge(combined_df, df, on="text", how="outer", suffixes=("", f"_{filename_label}"))

    # Save the DataFrame to a CSV file
    if save_to_csv and output_file:
        combined_df.to_csv(output_file, index=False)

    return combined_df

df = concatenate_csv_files(r"../data/intermediate", save_to_csv=True, output_file=r"../data/processed/final_dataset.csv")

Reading file: ../data/intermediate\predicted_labelled_AG.csv
Reading file: ../data/intermediate\predicted_labelled_AI.csv
Reading file: ../data/intermediate\predicted_labelled_ATOT.csv
Reading file: ../data/intermediate\predicted_labelled_PPVm.csv


In [156]:
def clean_dataset(df_file_path: str, save_to_csv=False, output_file=None):
    df = pd.read_csv(df_file_path)

    # Limit to two decimal places
    df = df.round(2)

    # Remove rows with NaN values
    df = df.dropna(axis=0, how="all")

    if save_to_csv:
        df.to_csv(output_file, index=False)
    
    return df

clean_dataset(r"../data/processed/final_dataset.csv", save_to_csv=True, output_file=r"../data/processed/final_dataset.csv")

Unnamed: 0,text,AG,AI,ATOT,PPVm
0,l’enveloppe globale d’augmentation des rémunér...,3.9,,,
1,nous travaillons sur une politique de rémunéra...,4.0,,,500.0
2,mesure d’augmentation perenne 1.1 - bénéficiai...,,,,
3,accord. politique salariale : article 1 - augm...,3.5,3.5,,3.0
4,augmentation collective une augmentation colle...,3.0,3.0,,
...,...,...,...,...,...
270,accord de negociation annuelle obligatoire 202...,,,,270.0
271,proces verbal d’accord négociations annuelles ...,,,,600.0
272,accord sur les salaires effectifs et avantages...,,,,400.0
273,entre maisons du monde france sas. article 2 –...,,,,400.0
