# Creare JSON dal GoldStandard

In [1]:
import csv
import json
import os
import pandas as pd
from sklearn.model_selection import train_test_split



# GS1_ train & test CSV split

In [36]:
def split_dataset(input_path, output_path_train_csv, output_path_test_csv, test_size=0.2, random_state=42):

    df = pd.read_csv(input_path, index_col=0)
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    print('Len train :' + str(len(train_df)))
    train_df.to_csv(output_path_train_csv, index=False)
    print('Len test :' + str(len(test_df)))
    test_df.to_csv(output_path_test_csv, index=False)
    
    return train_df, test_df

In [37]:
input_path = 'GoldStandard.csv'
output_path_train_csv = 'train/GS1_train.csv'
output_path_test_csv = 'train/GS1_test.csv'
split_dataset(input_path, output_path_train_csv, output_path_test_csv)

Len train :1272
Len test :318


(                      Firm     Sector  \
 1174  Ferrovie dello Stato  Transport   
 701                 Maersk  Transport   
 1479  Ferrovie dello Stato  Transport   
 528                 Maersk  Transport   
 987   Ferrovie dello Stato  Transport   
 ...                    ...        ...   
 1130  Ferrovie dello Stato  Transport   
 1294  Ferrovie dello Stato  Transport   
 860   Ferrovie dello Stato  Transport   
 1459  Ferrovie dello Stato  Transport   
 1126  Ferrovie dello Stato  Transport   
 
                                                Activity   NACE_1   NACE_2  \
 1174                  Infrastructure for rail transport   F42.12   F42.13   
 701   Retrofitting of sea and coastal freight and pa...   H50.10    H50.2   
 1479  Urban and suburban transport, road passenger t...   H49.31  H49.3.9   
 528                  Freight transport services by road  H49.4.1   H53.10   
 987                              Freight rail transport   H49.20   N77.39   
 ...                      

# GS1 train & test JSON

In [38]:
def get_json(input_path, output_path):
    df = pd.read_csv(input_path, index_col=0)
    df = df.loc[:, ['KeyConcepts', 'text', 'parse_correct']]

    # Crea la colonna instruction, cambio i nomi delle altre due colonne
    df['instruction'] = (
        "I will give you a text as 'input'. You must tell me if this text can be associated with this specific Key Concept: " 
        + df['KeyConcepts'].astype(str) 
        + ". You have to answer me with 0 if you think that these two cannot be associated. You have to answer me with 1 if these can be associated. Use only 1 or 0, nothing else."
    )
    df = df.rename(columns={"text": "input", "parse_correct": "output"})
    df = df.loc[:, ['input', 'output', 'instruction']]
    df.to_csv("resGS.csv", index=False)

    #transofrm csv into json
    with open('resGS.csv', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        data = [row for row in reader]

    with open(output_path, 'w') as jsonfile:
        json.dump(data, jsonfile)
        
        
    # delete the temporary file 
    if os.path.exists("resGS.csv"):
        os.remove("resGS.csv")
    print("Done")

    print("Done")

In [40]:
input_path_train_csv = 'train/GS1_train_csv.csv'
input_path_test_csv = 'train/GS1_test_csv.csv'

output_path_train = 'train/GS1_train.json'
output_path_test = 'train/GS1_test.json'

get_json(input_path_train_csv, output_path_train)
get_json(input_path_test_csv, output_path_test)

Done
Done
Done
Done


### GS1_test - Manual and Rephrase

In [41]:
def divide_json(input_path, manual_path, rephrase_path):
    df = pd.read_csv(input_path, index_col=0)
    
    df['Note'] = df['Note'].fillna('')

    manual_df = df[~df['Note'].str.contains('GPT4')]
    rephrase_df = df[df['Note'].str.contains('GPT4')]

    #controllata lunghezza e funziona
    
    manual_df.to_csv("manual.csv", index=False)
    rephrase_df.to_csv("rephrase.csv", index=False)
    
    get_json("manual.csv" , manual_path)
    if os.path.exists("manual.csv"):
        os.remove("manual.csv")
        
    get_json("rephrase.csv" , rephrase_path)
    if os.path.exists("rephrase.csv"):
        os.remove("rephrase.csv")
    
    print("DIVIDED")

In [42]:
input_path = "train/GS1_test_csv.csv"
manual_path = "train/GS1_test_manual.json"
rephrase_path = "train/GS1_test_rephrase.json"
divide_json(input_path, manual_path, rephrase_path)

Done
Done
Done
Done
DIVIDED


# GS1 & esggBert

In [59]:
input_path = "train/GS1_train_csv.csv"
esg_input_path = "./esgBERT.csv"
output_path = "train/esgBERT_GS1_train.json"

In [61]:
def get_json_esg(input_path, esg_input_path, output_path):
    df = pd.read_csv(input_path, index_col=None) #1200 righe
    esg = pd.read_csv(esg_input_path, index_col=None) #4k righe 
    df = df.loc[:,['KeyConcepts', 'text', 'parse_correct']]

    # creao la colonna instruction, cambio i nomi delle altre due colonne
    df['instruction'] = "I will give you a text as 'input', You must say me if this text can be associated with this specific Key Concept: " + df['KeyConcepts'].astype(str) + " You have to answer me with 0 if you think that these two can not be associated. You have to answer me with 1 if these can be associated. Use only 1 or 0, nothing else." 
    df = df.rename(columns={"text": "input"})
    df = df.rename(columns={"parse_correct": "output"})

    df = df.loc[:,['input', 'output', 'instruction']]

    df_concatenato = pd.concat([df, esg])

    df_concatenato_shuffled = df_concatenato.sample(frac=1).reset_index(drop=True)  

    df_concatenato_shuffled.to_csv("resGS.csv", index=False)
    
    #transofrm csv into json
    with open('resGS.csv', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        data = [row for row in reader]

    with open(output_path, 'w') as jsonfile:
        json.dump(data, jsonfile)
        
        
    # delete the temporary file 
    if os.path.exists("resGS.csv"):
        os.remove("resGS.csv")
    print("Done")

In [62]:
get_json_esg(input_path ,esg_input_path, output_path)

Done


# Prova

In [2]:
df = pd.read_csv("GoldStandard.csv", index_col=0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1590 entries, 0 to 1589
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Firm              1590 non-null   object
 1   Sector            1590 non-null   object
 2   Activity          1590 non-null   object
 3   NACE_1            1590 non-null   object
 4   NACE_2            1590 non-null   object
 5   NACE_3            1044 non-null   object
 6   NACE_4            804 non-null    object
 7   NACE_5            432 non-null    object
 8   NACE_6            432 non-null    object
 9   Filter words      1590 non-null   object
 10  Description       1590 non-null   object
 11  Climate           1590 non-null   object
 12  Type              1590 non-null   object
 13  KeyConcepts       1590 non-null   object
 14  ID                1590 non-null   int64 
 15  text              1590 non-null   object
 16  page              1590 non-null   int64 
 17  parse_correct     1

In [8]:
# Count the occurrences of each firm
firm_counts = df['Firm'].value_counts()

# Count the occurrences of 0s and 1s in the 'parse_correct' column
parse_correct_counts = df['parse_correct'].value_counts()

print("Occurrences per firm:")
print(firm_counts)

print("\nCounts of 0s and 1s in the 'parse_correct' column:")
print(parse_correct_counts)

Occurrences per firm:
Firm
Ferrovie dello Stato       690
Maersk                     492
Mundys                     348
Autostrade per l'Italia     60
Name: count, dtype: int64

Counts of 0s and 1s in the 'parse_correct' column:
parse_correct
0    1122
1     468
Name: count, dtype: int64
