# 2. Annotated dataframes split to sentence rows
This notebook transforms the dataframes produced in step 1 into a single dataframe in which each row corresponds to one sentence.

In [None]:
import pandas as pd
import ast
import numpy as np

# Assuming 3 annotation rounds here; adjust to match your own annotation rounds.
# The files are tab-separated. Forward slashes keep the relative paths portable
# (the backslash form only resolves on Windows) and match the style of the
# output paths used when the sentence files are written.

first_round = pd.read_csv('./Intermediate results/first round.csv', sep='\t')
second_round = pd.read_csv('./Intermediate results/second round.csv', sep='\t')
third_round = pd.read_csv('./Intermediate results/third round.csv', sep='\t')

In [None]:
# Select only the useful columns --> to be changed for other datasets
columns = [
    'annotator', 'censored', 'location', 'manual_sentence_labels',
    'note_PS_manual', 'note_nr', 'previous_ann', 'pseudo_id',
    'pseudonomised_text', 'relevance_PS_manual', 'relevance_manual',
    'report_date', 'report_type', 'round', 'sentences', 'set',
    'source_table', 'text_id',
]

# Reduce each loaded frame to the selected columns and bind it to the name the
# following cells use. `.copy()` gives independent frames, so the in-place
# column conversion done later does not operate on a slice of the raw data.
first_round = first_round[columns].copy()
second_round = second_round[columns].copy()
# NOTE(review): the third annotation round is assumed to be the round the rest
# of the notebook calls `test_round` -- confirm against the annotation setup.
test_round = third_round[columns].copy()

In [None]:
# Helpers that turn the stringified list columns of the CSV files back into lists
class _ReprNormaliser(ast.NodeTransformer):
    """Rewrite the non-literal parts of a list ``repr`` into plain literals.

    Two constructs are rewritten so that ``ast.literal_eval`` accepts the tree:

    * the bare name ``nan`` becomes the constant ``None``;
    * a call such as ``np.float64(2.0)`` / ``numpy.int64(3)`` with exactly one
      positional argument is replaced by that argument.

    Working on the syntax tree instead of on the raw text leaves the contents
    of string literals (e.g. sentences containing the letters "nan") untouched.
    """

    _NUMPY_ALIASES = ("np", "numpy")
    _SCALAR_PREFIXES = ("float", "int", "uint")

    def visit_Name(self, node):
        if node.id == "nan":
            return ast.copy_location(ast.Constant(value=None), node)
        return node

    def visit_Call(self, node):
        # Rewrite the arguments first so that np.float64(nan) also collapses.
        self.generic_visit(node)
        func = node.func
        is_numpy_scalar = (
            isinstance(func, ast.Attribute)
            and isinstance(func.value, ast.Name)
            and func.value.id in self._NUMPY_ALIASES
            and func.attr.startswith(self._SCALAR_PREFIXES)
            and len(node.args) == 1
            and not node.keywords
        )
        return node.args[0] if is_numpy_scalar else node


def _to_int_or_nan(v):
    """Map ``None`` to ``np.nan`` and non-NaN numbers to ``int``; pass the rest through."""
    if v is None:
        return np.nan
    if isinstance(v, (int, float, np.float64)) and not np.isnan(v):
        return int(v)
    return v


def safe_literal_eval(val):
    """Parse a string that looks like a Python list back into a list.

    Parameters
    ----------
    val : object
        Cell value. Anything that is not a ``str`` is returned unchanged.

    Returns
    -------
    list or object
        For a string that evaluates to a list or tuple: a list in which
        ``None``/``nan`` entries are ``np.nan``, numeric entries are converted
        with ``int`` (so ``1.5`` becomes ``1``) and all other entries are kept
        as they are. If the string cannot be parsed, or does not evaluate to a
        list or tuple, the original string is returned unchanged; a parse
        failure is additionally reported on stdout.
    """
    if not isinstance(val, str):
        return val

    try:
        # literal_eval strips leading blanks before parsing; mirror that here.
        tree = ast.parse(val.lstrip(" \t"), mode="eval")
        result = ast.literal_eval(_ReprNormaliser().visit(tree))
    except (ValueError, SyntaxError, TypeError) as e:
        # Return the original value if evaluation fails
        print(f"Error evaluating {val}: {e}")
        return val

    if not isinstance(result, (list, tuple)):
        # A scalar or a bare string is not list data; leave the cell as it was.
        return val

    return [_to_int_or_nan(v) for v in result]

# Run safe_literal_eval over the chosen columns of one DataFrame
def back_to_lists(df, columns_to_convert):
    """Parse the stringified values of the given columns, modifying ``df`` in place.

    A progress line is printed when the frame is started and again for every
    column. Each listed column is replaced by the result of applying
    ``safe_literal_eval`` to each of its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted; it is changed in place.
    columns_to_convert : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print("NEW DF")
    for column_name in columns_to_convert:
        print("COLUMN:", column_name)
        parsed_values = df[column_name].apply(safe_literal_eval)
        df[column_name] = parsed_values
    return df

# Frames whose stringified list columns have to be parsed back into lists
dfs = [first_round, second_round, test_round]

# Names of the columns that are passed through safe_literal_eval
columns_to_convert = ['manual_sentence_labels', 'relevance_manual', 'sentences']

# Convert every frame and store the result back at the same position in the list
for position, frame in enumerate(dfs):
    dfs[position] = back_to_lists(frame, columns_to_convert)




In [None]:
# Stack all rounds into one frame. ignore_index=True renumbers the rows 0..n-1;
# without it every round keeps its own labels, so the same label occurs once
# per round and a printed label cannot be mapped to a row position.
end_df = pd.concat([first_round, second_round, test_round], ignore_index=True)

In [None]:
# Annotate the note containing the phrase "patient overleden" with Performance Status 5

# Show which rows contain the phrase. na=False counts missing note text as
# "no match" (otherwise the mask holds NaN and cannot be used for indexing);
# regex=False matches the phrase literally.
deceased_mask = end_df['pseudonomised_text'].str.contains("patient overleden", regex=False, na=False)
print((end_df['sentences'][deceased_mask]).index)

# NOTE(review): the row that gets annotated is hard-coded as the first row
# (position 0) instead of being derived from the match printed above, and the
# one-element lists presumably assume that this note holds a single sentence --
# confirm both against the printed index before relying on the result.
end_df.iloc[0, end_df.columns.get_loc('manual_sentence_labels')] = [5]
end_df.iloc[0, end_df.columns.get_loc('note_PS_manual')] = 5
end_df.iloc[0, end_df.columns.get_loc('relevance_PS_manual')] = 1
end_df.iloc[0, end_df.columns.get_loc('relevance_manual')] = [1]

# Show the first rows to verify the manual annotation
print(end_df[['manual_sentence_labels', 'note_PS_manual', 'relevance_PS_manual', 'relevance_manual']].head())

In [None]:
# Build one row per sentence: every note in end_df is expanded into its
# sentences, each paired with the label and relevance at the same position and
# with the note-level metadata.
sentence_columns = ['note_nr', 'sentence', 'manual_sentence_labels', 'relevance_manual', 'round', 'annotator', 'set']

# Collect plain records first and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all earlier rows on every
# iteration.
sentence_records = []
for _, row in end_df.iterrows():
    labels = row['manual_sentence_labels']
    relevances = row['relevance_manual']
    for i, sentence in enumerate(row['sentences']):
        # Indexing (rather than zip) keeps a length mismatch between the
        # sentence list and its label lists a loud IndexError.
        sentence_records.append({
            'note_nr': row['note_nr'],
            'sentence': sentence,
            'manual_sentence_labels': labels[i],
            'relevance_manual': relevances[i],
            'round': row['round'],
            'annotator': row['annotator'],
            'set': row['set'],
        })

# dtype=object stores the values as collected, so integer labels are not
# upcast to floats when a column also contains NaN.
sentences_df = pd.DataFrame(sentence_records, columns=sentence_columns, dtype=object)

In [None]:
# Split the sentence rows by the value of their 'set' column
sentences_val = sentences_df.loc[sentences_df['set'].eq('val')]
sentences_test = sentences_df.loc[sentences_df['set'].eq('test')]

In [None]:
# Report the label distribution (missing labels included) of both splits,
# plus the number of rows in the validation split
label_column = 'manual_sentence_labels'

print("VAL:")
print(sentences_val[label_column].value_counts(dropna=False))
print(sentences_val.shape[0])

print("TEST:")
print(sentences_test[label_column].value_counts(dropna=False))

In [None]:
# Write the validation sentences as a tab-separated file (row index included)
val_output_path = r'./Intermediate results/sentences_val.csv'
sentences_val.to_csv(val_output_path, sep='\t')

In [None]:
# Write the test sentences as a tab-separated file (row index included)
test_output_path = r'./Intermediate results/sentences_test.csv'
sentences_test.to_csv(test_output_path, sep='\t')