In [None]:
# import libraries
import pandas as pd 
import numpy as np
from IPython.display import display, Markdown
import sys
from pathlib import Path

# Locate the annotation helpers relative to the current working directory.
# NOTE(review): parents[1] is the grandparent of the working directory, so this
# presumably expects the notebook to run from the notebooks folder inside the
# AI_narrative_index checkout - confirm.
current_path = Path().resolve()
project_root = current_path.parents[1]
annotation_path = project_root.joinpath("AI_narrative_index", "annotation")

# Make the annotation folder importable for this kernel session
sys.path.append(str(annotation_path))

# Project-local helpers used below to reconcile author and annotator labels
from comparing_annotations import (
    resolve_label_disagreements_AI,
    resolve_hype_disagreements,
)

Review first batch

In [None]:
# Load the first batch as labelled by the author and by the annotator.
# The annotator's file is decoded as cp1252 (Windows-1252).
first_batch_author = pd.read_csv("articles_WSJ_batch_one_author.csv")
first_batch_annotator = pd.read_csv(
    "articles_WSJ_batch_one_annotator.csv",
    encoding='cp1252',
)

In [None]:
# Summarise the first-batch labels, annotator first and author second:
# value counts of hype_level and label_ai_related, then the sum of the
# label_ai_related column (an article count if the label is coded 0/1 - confirm).
for who, frame in (("", first_batch_annotator), (" author", first_batch_author)):
    hype_counts = frame['hype_level'].value_counts()
    ai_counts = frame['label_ai_related'].value_counts()
    ai_total = frame['label_ai_related'].sum()
    print(f"distribution of hype level{who}: {hype_counts}")
    print(f"distribution of label_ai_related{who}: {ai_counts}")
    print(f"Number of articles with AI-related annotation{who}: {ai_total}")

Inspect the dataframes, ensure compatibility

In [None]:
# Author's file: a missing hype level is treated as 0, then stored as float
first_batch_author['hype_level'] = first_batch_author['hype_level'].fillna(0).astype(float)

# Annotator's file: store the hype level as float as well so both columns compare cleanly
first_batch_annotator['hype_level'] = first_batch_annotator['hype_level'].astype(float)

annotator_hype = first_batch_annotator['hype_level']
author_hype = first_batch_author['hype_level']

# dtype of each hype column (annotator, then author) - both should be float
print(annotator_hype.dtype)
print(author_hype.dtype)

# sum of the hype levels per rater
print(annotator_hype.sum())
print(author_hype.sum())

# distinct hype levels used by each rater
print(annotator_hype.unique())
print(author_hype.unique())


The annotator suggested a hype level of 3 for some articles, but the decision was made to cap the hype level at a maximum of 2.

In [None]:
# Cap the annotator's scale: a hype level of exactly 3 is recorded as 2
annotator_hype = first_batch_annotator['hype_level']
first_batch_annotator['hype_level'] = annotator_hype.mask(annotator_hype == 3, 2)

# Show the distinct hype levels that remain after the cap
print(first_batch_annotator['hype_level'].unique())


In [None]:
# Pair the two raters' rows by article_id (merge default: inner join)
merged = first_batch_annotator.merge(
    first_batch_author,
    on="article_id",
    suffixes=('_annotator', '_author'),
)

# Row-wise disagreement flags for each label
hype_mismatch = merged["hype_level_annotator"] != merged["hype_level_author"]
ai_mismatch = merged["label_ai_related_annotator"] != merged["label_ai_related_author"]

# The printed numbers are fractions of the paired rows (mismatches / rows),
# even though the labels say "Total differences"
n_pairs = len(merged)
print(f'Total differences in hype classification: {hype_mismatch.sum() / n_pairs}')
print(f'Total differences in ai_related classification: {ai_mismatch.sum() / n_pairs}')

In [None]:
# Reconcile the AI-related labels of author and annotator for the first batch.
# The helper imported at the top of the notebook is resolve_label_disagreements_AI;
# the name resolve_label_disagreements is never imported or defined in this
# notebook, so calling it raises NameError on a fresh kernel.
df_final_first_batch = resolve_label_disagreements_AI(first_batch_author, first_batch_annotator)

In [None]:
# Pass the author's labels and the frame from the AI-label step to
# resolve_hype_disagreements; the result replaces df_final_first_batch
df_final_first_batch = resolve_hype_disagreements(
    first_batch_author,
    df_final_first_batch,
)

# Persist the reconciled first batch (the row index is not written)
df_final_first_batch.to_csv(
    "articles_WSJ_batch_one_final.csv",
    index=False,
)

Review second batches

In [None]:
# Load the second batch as labelled by the annotator and by the author
second_batch_annotator = pd.read_csv(
    "articles_WSJ_batch_two_annotator.csv"
)
second_batch_author = pd.read_csv(
    "articles_WSJ_batch_two_author.csv"
)

Inspect the dataframes, ensure compatibility

In [None]:
# Column labels of the annotator's second batch
annotator_columns = second_batch_annotator.columns
print(f"Columns in the annotated second batch: {annotator_columns}")

# Row count of the annotator's second batch
annotator_rows = len(second_batch_annotator)
print(f"Number of articles in the annotated second batch: {annotator_rows}")

In [None]:
# Give the annotator's label columns the names used in the author's file
second_batch_annotator = second_batch_annotator.rename(
    columns={
        "AI_RELEVANT": "label_ai_related",
        "HYPE_LEVEL": "hype_level",
    }
)

# Author: a missing hype level is treated as 0, then stored as float
second_batch_author['hype_level'] = second_batch_author['hype_level'].fillna(0).astype(float)

# Annotator: store the hype level as float as well
second_batch_annotator['hype_level'] = second_batch_annotator['hype_level'].astype(float)

# dtype of each hype column (annotator, then author) - both should be float
print(second_batch_annotator['hype_level'].dtype)
print(second_batch_author['hype_level'].dtype)

In [None]:
# Short handles for the four label columns being inspected
annotator_hype = second_batch_annotator['hype_level']
annotator_ai = second_batch_annotator['label_ai_related']
author_hype = second_batch_author['hype_level']
author_ai = second_batch_author['label_ai_related']

# Distinct values per column: annotator first, then author
print(f"Values of the hype level column in the annotated second batch: {annotator_hype.unique()}")
print(f"Values of the label_ai_related column in the annotated second batch: {annotator_ai.unique()}")
print(f"Values of the hype level column in the author's second batch: {author_hype.unique()}")
print(f"Values of the label_ai_related column in the author's second batch: {author_ai.unique()}")

# Sum of the AI-related column per rater
# (an article count if the label is coded 0/1 - confirm)
print(f"Number of articles with AI-related annotation in the second batch: {annotator_ai.sum()}")
print(f"Number of articles with AI-related annotation in the second batch author: {author_ai.sum()}")

# Sum of the hype levels per rater
print(f"Total hype levels in the second batch: {annotator_hype.sum()}")
print(f"Total hype levels in the second batch author: {author_hype.sum()}")

In [None]:
# Pair the two raters' rows by article_id (merge default: inner join)
merged = second_batch_annotator.merge(
    second_batch_author,
    on="article_id",
    suffixes=('_annotator', '_author'),
)

# Row-wise disagreement flags for each label
hype_mismatch = merged["hype_level_annotator"] != merged["hype_level_author"]
ai_mismatch = merged["label_ai_related_annotator"] != merged["label_ai_related_author"]

# The printed numbers are fractions of the paired rows (mismatches / rows),
# even though the labels say "Total differences"
n_pairs = len(merged)
print(f'Total differences in hype classification: {hype_mismatch.sum() / n_pairs}')
print(f'Total differences in ai_related classification: {ai_mismatch.sum() / n_pairs}')

In [None]:
# Reconcile the AI-related labels of the second batch with the imported
# resolve_label_disagreements_AI helper (author frame first, annotator frame second)
df_ai_level_merge = resolve_label_disagreements_AI(
    second_batch_author,
    second_batch_annotator,
)

In [None]:
# Sanity-check the reconciled frame: sum of its 'modified' column and of its
# 'label_ai_related' column
modified_total = df_ai_level_merge['modified'].sum()
ai_related_total = df_ai_level_merge['label_ai_related'].sum()
print(f"Number of changes in the merged dataframe: {modified_total}")
print(f"Number of articles with ai-related annotation: {ai_related_total}")


In [None]:
# Author: a missing hype level is treated as 0, then stored as float
second_batch_author['hype_level'] = second_batch_author['hype_level'].fillna(0).astype(float)

# Reconciled frame from the AI-label step: store the hype level as float too
df_ai_level_merge['hype_level'] = df_ai_level_merge['hype_level'].astype(float)

merged_hype = df_ai_level_merge['hype_level']
author_hype = second_batch_author['hype_level']

# dtype of each hype column (reconciled frame, then author) - both should be float
print(merged_hype.dtype)
print(author_hype.dtype)

# sum of the hype levels in each frame
print(merged_hype.sum())
print(author_hype.sum())

# distinct hype levels in each frame
print(merged_hype.unique())
print(author_hype.unique())

In [None]:
# Pass the author's labels and the frame from the AI-label step to
# resolve_hype_disagreements to obtain the final second batch
df_final_second_batch = resolve_hype_disagreements(
    second_batch_author,
    df_ai_level_merge,
)

In [None]:
# Persist the reconciled second batch (the row index is not written).
# NOTE(review): this file name does not follow the articles_WSJ_batch_<n>_final.csv
# pattern used when saving batches one and three - confirm downstream readers
# expect this name.
df_final_second_batch.to_csv(
    "second_batch_WSJ_final.csv",
    index=False,
)

Review third batches

In [None]:
# Load the third batch as labelled by the annotator and by the author
third_batch_annotator = pd.read_csv(
    "articles_WSJ_batch_three_annotator.csv"
)
third_batch_author = pd.read_csv(
    "articles_WSJ_batch_three_author.csv"
)

In [None]:
# Column labels of the annotator's third batch
annotator_columns = third_batch_annotator.columns
print(f"Columns in the annotated third batch: {annotator_columns}")

# Row count of the annotator's third batch
annotator_rows = len(third_batch_annotator)
print(f"Number of articles in the annotated third batch: {annotator_rows}")

In [None]:
# Give the annotator's label columns the names used in the author's file
third_batch_annotator = third_batch_annotator.rename(
    columns={
        "AI_Relevant": "label_ai_related",
        "Hype_Level": "hype_level",
    }
)

# Author: a missing hype level is treated as 0, then stored as float
third_batch_author['hype_level'] = third_batch_author['hype_level'].fillna(0).astype(float)

# Annotator: store the hype level as float as well
third_batch_annotator['hype_level'] = third_batch_annotator['hype_level'].astype(float)

# dtype of each hype column (annotator, then author) - both should be float
print(third_batch_annotator['hype_level'].dtype)
print(third_batch_author['hype_level'].dtype)

# Sum of the hype levels per rater
annotator_hype_total = third_batch_annotator["hype_level"].sum()
author_hype_total = third_batch_author["hype_level"].sum()
print(f"The total hype levels in the annotator's dataframe: {annotator_hype_total}")
print(f"The total hype levels in the author's dataframe: {author_hype_total}")

# Sum of the AI-related labels per rater
annotator_ai_total = third_batch_annotator["label_ai_related"].sum()
author_ai_total = third_batch_author["label_ai_related"].sum()
print(f"The total ai_reated levels in the annotator's dataframe: {annotator_ai_total}")
print(f"The total ai_reated levels in the author's dataframe: {author_ai_total}")

# Pair the two raters' rows by article_id (merge default: inner join)
merged = third_batch_annotator.merge(
    third_batch_author,
    on="article_id",
    suffixes=('_annotator', '_author'),
)

# Absolute number of paired rows on which the raters disagree
hype_mismatch = merged["hype_level_annotator"] != merged["hype_level_author"]
ai_mismatch = merged["label_ai_related_annotator"] != merged["label_ai_related_author"]
print(f'Total differences in hype classification: {hype_mismatch.sum()}')
print(f'Total differences in ai_related classification: {ai_mismatch.sum()}')


In [None]:
# Pair the two raters' rows by article_id (merge default: inner join)
merged = third_batch_annotator.merge(
    third_batch_author,
    on="article_id",
    suffixes=('_annotator', '_author'),
)

# Row-wise disagreement flags for each label
hype_mismatch = merged["hype_level_annotator"] != merged["hype_level_author"]
ai_mismatch = merged["label_ai_related_annotator"] != merged["label_ai_related_author"]

# The printed numbers are fractions of the paired rows (mismatches / rows),
# even though the labels say "Total differences"
n_pairs = len(merged)
print(f'Total differences in hype classification: {hype_mismatch.sum() / n_pairs}')
print(f'Total differences in ai_related classification: {ai_mismatch.sum() / n_pairs}')

Resolve disagreements between the author and the annotator

In [None]:
# Reconcile the AI-related labels of the third batch with the imported
# resolve_label_disagreements_AI helper (author frame first, annotator frame second)
third_df_ai_level_merge = resolve_label_disagreements_AI(
    third_batch_author,
    third_batch_annotator,
)


In [None]:
# Pass the author's labels and the frame from the AI-label step to
# resolve_hype_disagreements to obtain the final third batch
df_final_third_batch = resolve_hype_disagreements(
    third_batch_author,
    third_df_ai_level_merge,
)

In [None]:
# Quick checks on the reconciled third batch: row count, column labels and
# the sum of the AI-related label column
third_rows = len(df_final_third_batch)
third_columns = df_final_third_batch.columns
third_ai_total = df_final_third_batch['label_ai_related'].sum()
print(f"Number of articles in the third sample: {third_rows}")
print(f"columns in the third sample: {third_columns}")
print(f"Number of articles with ai-related annotation: {third_ai_total}")


In [None]:
# Persist the reconciled third batch (the row index is not written)
df_final_third_batch.to_csv(
    "articles_WSJ_batch_three_final.csv",
    index=False,
)

In [None]:
# Read the saved file back (this rebinds df_final_third_batch to the CSV
# contents) and report its row count and column labels
df_final_third_batch = pd.read_csv(
    "articles_WSJ_batch_three_final.csv"
)
third_rows = len(df_final_third_batch)
third_columns = df_final_third_batch.columns
print(f"Number of articles in the third sample: {third_rows}")
print(f"columns in the third sample: {third_columns}")

Review fraction of batch four annotated by author and annotator (sampled at random)

In [None]:
# Load the author's subsample of batch four and the annotator's batch-four file.
# The annotator's file is decoded as cp1252 (Windows-1252).
fourth_batch_author = pd.read_csv(
    "articles_WSJ_batch_four_subsample_author.csv"
)
fourth_batch_annotator = pd.read_csv(
    "articles_WSJ_batch_four_annotator.csv",
    encoding='cp1252',
)

In [None]:
# Row counts of the two batch-four frames
author_rows = len(fourth_batch_author)
annotator_rows = len(fourth_batch_annotator)
print(f"The fourth batch annotated by the author contains {author_rows} rows")
print(f"The fourth batch annotated by the annotator contains {annotator_rows} rows")

In [None]:
# Keep only the annotator rows whose article_id also occurs in the author's
# subsample. The explicit .copy() turns the filtered slice into an independent
# frame, so later edits to the subset operate on its own data and do not raise
# SettingWithCopyWarning against fourth_batch_annotator.
in_author_sample = fourth_batch_annotator["article_id"].isin(fourth_batch_author["article_id"])
subset_annotator = fourth_batch_annotator[in_author_sample].copy()

# verify subset: the first line prints True when the subset has exactly 175 rows
# NOTE(review): 175 is presumably the size of the author's subsample - confirm
print(len(subset_annotator) == 175)
print(subset_annotator.columns)

In [None]:
# Lower-case all column names, then rename ai_relevant to the label name used
# in the author's file. The renames return new frames that are bound back to
# subset_annotator instead of using inplace=True: subset_annotator was created
# by boolean filtering, and in-place edits on such a slice raise
# SettingWithCopyWarning.
subset_annotator = (
    subset_annotator
    .rename(columns=str.lower)
    .rename(columns={"ai_relevant": "label_ai_related"})
)

# verify manipulations
print(subset_annotator.columns)

In [None]:
# Show the two label columns and their dtypes, first for the annotator's
# subset and then for the author's subsample
label_columns = ["label_ai_related", "hype_level"]
for frame in (subset_annotator, fourth_batch_author):
    labels = frame[label_columns]
    print(labels)
    print(labels.dtypes)

In [None]:
# Treat missing hype levels in the author's subsample as 0.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on df[col] is chained assignment, which emits a
# FutureWarning in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
# NOTE(review): unlike batches one to three, hype_level is not cast to float
# here - confirm the resolve helpers do not rely on that cast.
fourth_batch_author["hype_level"] = fourth_batch_author["hype_level"].fillna(0)

# verify the change
print(fourth_batch_author["hype_level"])

In [None]:
# Pair the two raters' rows by article_id (merge default: inner join)
merged = subset_annotator.merge(
    fourth_batch_author,
    on="article_id",
    suffixes=('_annotator', '_author'),
)

# Row-wise disagreement flags for each label
hype_mismatch = merged["hype_level_annotator"] != merged["hype_level_author"]
ai_mismatch = merged["label_ai_related_annotator"] != merged["label_ai_related_author"]

# The printed numbers are fractions of the paired rows (mismatches / rows),
# even though the labels say "Total differences"
n_pairs = len(merged)
print(f'Total differences in hype classification: {hype_mismatch.sum() / n_pairs}')
print(f'Total differences in ai_related classification: {ai_mismatch.sum() / n_pairs}')

In [None]:
# Reconcile the AI-related labels of the batch-four subsample with the imported
# resolve_label_disagreements_AI helper (author frame first, annotator subset second)
fourth_sub_df_ai_level_merge = resolve_label_disagreements_AI(
    fourth_batch_author,
    subset_annotator,
)

In [None]:
# Pass the author's labels and the frame from the AI-label step to
# resolve_hype_disagreements to obtain the reconciled batch-four subsample
df_final_fourth_batch_sub = resolve_hype_disagreements(
    fourth_batch_author,
    fourth_sub_df_ai_level_merge,
)

In [None]:
# Persist the reconciled batch-four subsample (the row index is not written)
df_final_fourth_batch_sub.to_csv(
    "articles_WSJ_batch_four_sample_final.csv",
    index=False,
)

In [None]:
# Read the saved subsample back (this rebinds df_final_fourth_batch_sub to the
# CSV contents), preview the first rows and report the row count
df_final_fourth_batch_sub = pd.read_csv(
    "articles_WSJ_batch_four_sample_final.csv"
)
preview = df_final_fourth_batch_sub.head()
print(preview)
subset_rows = len(df_final_fourth_batch_sub)
print(f"There are {subset_rows} articles in the annotated subset")

In [None]:
# Reload the annotator's complete batch-four file as the starting point for the
# final batch-four frame (decoded as cp1252, like the earlier read of this file)
df_final_fourth_batch = pd.read_csv("articles_WSJ_batch_four_annotator.csv", encoding='cp1252')

# Show the columns of the frame that was just loaded. The original printed
# fourth_batch_annotator.columns, which inspects a different variable and makes
# this cell depend on an earlier one.
print(df_final_fourth_batch.columns)

In [None]:
# Bring the annotator's column names in line with the reconciled subsample
# BEFORE updating. DataFrame.update only overwrites columns whose names occur in
# both frames; if the renames run afterwards, the annotator's AI-relevance
# column (and any differently-cased column) never matches label_ai_related /
# hype_level, and the reconciled labels are silently dropped.
df_final_fourth_batch = (
    df_final_fourth_batch
    .rename(columns=str.lower)
    .rename(columns={"ai_relevant": "label_ai_related"})
    .set_index("article_id")
)

# Index the reconciled subsample by article_id in a separate variable, so that
# df_final_fourth_batch_sub itself is left untouched and the cell can be re-run
resolved_subset = df_final_fourth_batch_sub.set_index("article_id")

# Overwrite the annotator's values with the reconciled ones wherever article_id
# and column name match (update skips NaN values in the reconciled frame)
df_final_fourth_batch.update(resolved_subset)

# Turn article_id back into a regular column
df_final_fourth_batch = df_final_fourth_batch.reset_index()

# verify manipulations
print(len(df_final_fourth_batch))
print(df_final_fourth_batch.columns)

In [None]:
# Persist the complete batch-four frame (the row index is not written)
df_final_fourth_batch.to_csv(
    "articles_WSJ_batch_four_final.csv",
    index=False,
)