In [1]:
# import libraries
import pandas as pd 
from IPython.display import display, Markdown
from merging_annotations import resolve_label_disagreements_AI, resolve_hype_disagreements

Review first batch

In [2]:
# load the author's annotation of the first batch
first_batch_author = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_author.csv"
)
# load the annotator's annotation of the same batch; this file is decoded as cp1252
first_batch_annotator = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_annotator.csv",
    encoding="cp1252",
)

In [None]:
# label distributions in the annotator's first-batch annotation
# (value_counts() leaves out missing values by default, so counts need not add up to the row count)
annotator_hype = first_batch_annotator['hype_level']
annotator_ai_label = first_batch_annotator['label_ai_related']
print(f"distribution of hype level: {annotator_hype.value_counts()}")
print(f"distribution of label_ai_related: {annotator_ai_label.value_counts()}")
print(f"Number of articles with AI-related annotation: {annotator_ai_label.sum()}")

# the same three summaries for the author's annotation
author_hype = first_batch_author['hype_level']
author_ai_label = first_batch_author['label_ai_related']
print(f"distribution of hype level author: {author_hype.value_counts()}")
print(f"distribution of label_ai_related author: {author_ai_label.value_counts()}")
print(f"Number of articles with AI-related annotation author: {author_ai_label.sum()}")

distribution of hype level: hype_level
0    64
1    22
2    12
3     2
Name: count, dtype: int64
distribution of label_ai_related: label_ai_related
0    59
1    41
Name: count, dtype: int64
Number of articles with AI-related annotation: 41
distribution of hype level author: hype_level
1.0    15
2.0     6
0.0     5
Name: count, dtype: int64
distribution of label_ai_related author: label_ai_related
0    74
1    26
Name: count, dtype: int64
Number of articles with AI-related annotation author: 26


Inspect the dataframes, ensure compatibility

In [4]:
# author's frame: missing hype levels become 0, then the column is stored as float
first_batch_author['hype_level'] = first_batch_author['hype_level'].fillna(0).astype(float)

# annotator's frame: store the hype levels as float too, so both columns share one dtype
first_batch_annotator['hype_level'] = first_batch_annotator['hype_level'].astype(float)

# every check below reports the annotator first and the author second
hype_columns = (first_batch_annotator['hype_level'], first_batch_author['hype_level'])

# dtype of each hype_level column (float64 expected after the casts above)
for hype_column in hype_columns:
    print(hype_column.dtype)

# sum of all hype levels per frame
for hype_column in hype_columns:
    print(hype_column.sum())

# distinct hype levels per frame
for hype_column in hype_columns:
    print(hype_column.unique())


float64
float64
52.0
27.0
[1. 2. 0. 3.]
[0. 2. 1.]


The annotator suggested a hype level of 3 for some articles, but the decision was made to cap the hype level at a maximum of 2.

In [5]:
# cap the annotator's scale: every hype level of 3 is overwritten with 2
is_level_three = first_batch_annotator['hype_level'] == 3
first_batch_annotator.loc[is_level_three, 'hype_level'] = 2

# list the remaining distinct hype levels to confirm that 3 no longer occurs
remaining_levels = first_batch_annotator['hype_level'].unique()
print(remaining_levels)


[1. 2. 0.]


In [8]:
# share of rows where annotator and author assign a different label_ai_related
# (rows are compared by index label, so both frames must list the articles in the same order)
ai_label_differs = first_batch_annotator['label_ai_related'] != first_batch_author['label_ai_related']
ai_disagreement_share = int(ai_label_differs.sum()) / len(first_batch_author)
print(f"Fraction of disagreement about AI-relatedness in the first batch: {ai_disagreement_share}")

# share of rows where annotator and author assign a different hype_level
hype_level_differs = first_batch_annotator['hype_level'] != first_batch_author['hype_level']
hype_disagreement_share = int(hype_level_differs.sum()) / len(first_batch_author)
print(f"Fraction of disagreement at the hype level in the first batch: {hype_disagreement_share}")

Fraction of disagreement about AI-relatedness in the first batch: 0.19
Fraction of disagreement at the hype level in the first batch: 0.32


In [None]:
# resolve the AI-label disagreements between the author's and the annotator's first batch.
# The helper imported from merging_annotations is resolve_label_disagreements_AI; the
# unsuffixed name resolve_label_disagreements is never imported and raised a NameError here.
df_final_first_batch = resolve_label_disagreements_AI(first_batch_author, first_batch_annotator)

In [None]:
# settle the hype-level disagreements, passing the author's frame together with the
# frame produced by the AI-label resolution step
df_final_first_batch = resolve_hype_disagreements(
    first_batch_author,
    df_final_first_batch,
)

# store the resolved first batch as CSV, without the index column
df_final_first_batch.to_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_one_final.csv",
    index=False,
)

Review second batches

In [22]:
# load the annotator's annotation of the second batch
second_batch_annotator = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two_annotator.csv"
)
# load the author's annotation of the second batch
second_batch_author = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_two_author.csv"
)

Inspect the dataframes, ensure compatibility

In [21]:
# column names of the annotator's second batch
annotator_columns_batch_two = second_batch_annotator.columns
print(f"Columns in the annotated second batch: {annotator_columns_batch_two}")

# number of rows (articles) in the annotator's second batch
annotator_rows_batch_two = len(second_batch_annotator)
print(f"Number of articles in the annotated second batch: {annotator_rows_batch_two}")

Columns in the annotated second batch: Index(['article_id', 'title', 'sub_title', 'cleaned_corpus',
       'label_ai_related', 'hype_level'],
      dtype='object')
Number of articles in the annotated second batch: 118


In [23]:
# map the annotator's column names onto the author's naming;
# DataFrame.rename skips mapping keys that are not present in the frame
column_aliases = {"AI_RELEVANT": "label_ai_related", "HYPE_LEVEL": "hype_level"}
second_batch_annotator = second_batch_annotator.rename(columns=column_aliases)

# author's frame: missing hype levels become 0, then the column is stored as float
second_batch_author['hype_level'] = second_batch_author['hype_level'].fillna(0).astype(float)

# annotator's frame: store the hype levels as float as well
second_batch_annotator['hype_level'] = second_batch_annotator['hype_level'].astype(float)

# confirm the dtype of both hype_level columns (annotator first, then author)
for batch_two_frame in (second_batch_annotator, second_batch_author):
    print(batch_two_frame['hype_level'].dtype)

float64
float64


In [24]:
# shorthand handles for the four columns summarised below
annotator_hype = second_batch_annotator['hype_level']
annotator_ai_label = second_batch_annotator['label_ai_related']
author_hype = second_batch_author['hype_level']
author_ai_label = second_batch_author['label_ai_related']

# distinct values per column: annotator first, then author
print(f"Values of the hype level column in the annotated second batch: {annotator_hype.unique()}")
print(f"Values of the label_ai_related column in the annotated second batch: {annotator_ai_label.unique()}")
print(f"Values of the hype level column in the author's second batch: {author_hype.unique()}")
print(f"Values of the label_ai_related column in the author's second batch: {author_ai_label.unique()}")

# sum of the label_ai_related column per annotation
print(f"Number of articles with AI-related annotation in the second batch: {annotator_ai_label.sum()}")
print(f"Number of articles with AI-related annotation in the second batch author: {author_ai_label.sum()}")

# sum of all hype levels per annotation
print(f"Total hype levels in the second batch: {annotator_hype.sum()}")
print(f"Total hype levels in the second batch author: {author_hype.sum()}")

Values of the hype level column in the annotated second batch: [0. 1. 2.]
Values of the label_ai_related column in the annotated second batch: [0 1]
Values of the hype level column in the author's second batch: [0. 1. 2.]
Values of the label_ai_related column in the author's second batch: [0 1]
Number of articles with AI-related annotation in the second batch: 21
Number of articles with AI-related annotation in the second batch author: 7
Total hype levels in the second batch: 24.0
Total hype levels in the second batch author: 8.0


In [25]:
# share of rows where annotator and author assign a different label_ai_related in the second batch
# (the printed label previously said "first batch", a copy-paste leftover from the first-batch cell)
ai_label_differs = second_batch_annotator['label_ai_related'] != second_batch_author['label_ai_related']
ai_disagreement_share = int(ai_label_differs.sum()) / len(second_batch_author)
print(f"Fraction of disagreement about AI-relatedness in the second batch: {ai_disagreement_share}")

# share of rows where annotator and author assign a different hype_level in the second batch
hype_level_differs = second_batch_annotator['hype_level'] != second_batch_author['hype_level']
hype_disagreement_share = int(hype_level_differs.sum()) / len(second_batch_author)
print(f"Fraction of disagreement at the hype level in the second batch: {hype_disagreement_share}")

Fraction of disagreement about AI-relatedness in the first batch: 0.11864406779661017
Fraction of disagreement at the hype level in the first batch: 0.11864406779661017


In [None]:
# settle the AI-label disagreements between the author's and the annotator's second batch
# with resolve_label_disagreements_AI
df_ai_level_merge = resolve_label_disagreements_AI(
    second_batch_author,
    second_batch_annotator,
)

In [None]:
# sanity-check the merged frame: sum of the 'modified' column and of the AI-related labels
modified_total = df_ai_level_merge['modified'].sum()
ai_related_total = df_ai_level_merge['label_ai_related'].sum()
print(f"Number of changes in the merged dataframe: {modified_total}")
print(f"Number of articles with ai-related annotation: {ai_related_total}")


In [None]:
# author's frame: missing hype levels become 0, then the column is stored as float
second_batch_author['hype_level'] = second_batch_author['hype_level'].fillna(0).astype(float)

# merged frame: store the hype levels as float too, so both columns share one dtype
df_ai_level_merge['hype_level'] = df_ai_level_merge['hype_level'].astype(float)

# every check below reports the merged frame first and the author's frame second
hype_columns = (df_ai_level_merge['hype_level'], second_batch_author['hype_level'])

# dtype of each hype_level column
for hype_column in hype_columns:
    print(hype_column.dtype)

# sum of all hype levels per frame
for hype_column in hype_columns:
    print(hype_column.sum())

# distinct hype levels per frame
for hype_column in hype_columns:
    print(hype_column.unique())

In [None]:
# settle the hype-level disagreements, passing the author's frame together with the
# frame produced by the AI-label resolution step
df_final_second_batch = resolve_hype_disagreements(
    second_batch_author,
    df_ai_level_merge,
)

In [None]:
# store the resolved second batch as CSV, without the index column
df_final_second_batch.to_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\second_batch_WSJ_final.csv",
    index=False,
)

Review third batches

In [28]:
# load the annotator's annotation of the third batch
third_batch_annotator = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three_annotator.csv"
)
# load the author's annotation of the third batch
third_batch_author = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\articles_WSJ_batch_three_author.csv"
)

In [29]:
# column names of the annotator's third batch
annotator_columns_batch_three = third_batch_annotator.columns
print(f"Columns in the annotated third batch: {annotator_columns_batch_three}")

# number of rows (articles) in the annotator's third batch
annotator_rows_batch_three = len(third_batch_annotator)
print(f"Number of articles in the annotated third batch: {annotator_rows_batch_three}")

Columns in the annotated third batch: Index(['article_id', 'index_id', 'scanned_time', 'title', 'sub_title',
       'corpus', 'AI_Relevant', 'Hype_Level', 'section', 'date'],
      dtype='object')
Number of articles in the annotated third batch: 100


In [30]:
# map the annotator's column names onto the author's naming
third_batch_annotator = third_batch_annotator.rename(columns={"AI_Relevant": "label_ai_related", "Hype_Level": "hype_level"})

# author's frame: missing hype levels become 0
third_batch_author['hype_level'] = third_batch_author['hype_level'].fillna(0)

# store the hype levels of both frames as float so they share one dtype
third_batch_annotator['hype_level'] = third_batch_annotator['hype_level'].astype(float)
third_batch_author['hype_level'] = third_batch_author['hype_level'].astype(float)

# confirm the dtype of both hype_level columns (annotator first, then author)
print(third_batch_annotator['hype_level'].dtype)
print(third_batch_author['hype_level'].dtype)

# compare the summed hype levels of the two frames
print(f'The total hype levels in the annotator\'s dataframe: {third_batch_annotator["hype_level"].sum()}')
print(f'The total hype levels in the author\'s dataframe: {third_batch_author["hype_level"].sum()}')

# compare the summed AI-related labels of the two frames
print(f'The total ai_reated levels in the annotator\'s dataframe: {third_batch_annotator["label_ai_related"].sum()}')
print(f'The total ai_reated levels in the author\'s dataframe: {third_batch_author["label_ai_related"].sum()}')

# count the rows on which the two annotations differ, per label
# (rows are compared by index label, so both frames must list the articles in the same order)
hype_level_differs = third_batch_annotator["hype_level"] != third_batch_author["hype_level"]
ai_label_differs = third_batch_annotator["label_ai_related"] != third_batch_author["label_ai_related"]
print(f'The total differences in hype classification: {len(third_batch_annotator[hype_level_differs])}')
# this count concerns label_ai_related; it was previously printed under the hype label as well
print(f'The total differences in AI-related classification: {len(third_batch_annotator[ai_label_differs])}')

float64
float64
The total hype levels in the annotator's dataframe: 46.0
The total hype levels in the author's dataframe: 41.0
The total ai_reated levels in the annotator's dataframe: 35
The total ai_reated levels in the author's dataframe: 29
The total differences in hype classification: 15
The total differences in hype classification: 8


In [31]:
# share of rows where annotator and author assign a different label_ai_related in the third batch
# (the printed label previously said "first batch", a copy-paste leftover from the first-batch cell)
ai_label_differs = third_batch_annotator['label_ai_related'] != third_batch_author['label_ai_related']
ai_disagreement_share = int(ai_label_differs.sum()) / len(third_batch_author)
print(f"Fraction of disagreement about AI-relatedness in the third batch: {ai_disagreement_share}")

# share of rows where annotator and author assign a different hype_level in the third batch
hype_level_differs = third_batch_annotator['hype_level'] != third_batch_author['hype_level']
hype_disagreement_share = int(hype_level_differs.sum()) / len(third_batch_author)
print(f"Fraction of disagreement at the hype level in the third batch: {hype_disagreement_share}")

Fraction of disagreement about AI-relatedness in the first batch: 0.08
Fraction of disagreement at the hype level in the first batch: 0.15


Resolve disagreements between the author and the annotator

In [None]:
# step 1: settle the AI-label disagreements between the author's and the annotator's third batch
third_df_ai_level_merge = resolve_label_disagreements_AI(
    third_batch_author,
    third_batch_annotator,
)

# step 2: settle the hype-level disagreements, passing the author's frame and the step-1 result
df_final_third_batch = resolve_hype_disagreements(
    third_batch_author,
    third_df_ai_level_merge,
)

In [None]:
# check the resolved third batch: row count, column names and sum of the AI-related labels
third_batch_rows = len(df_final_third_batch)
third_batch_columns = df_final_third_batch.columns
third_batch_ai_total = df_final_third_batch['label_ai_related'].sum()
print(f"Number of articles in the third sample: {third_batch_rows}")
print(f"columns in the third sample: {third_batch_columns}")
print(f"Number of articles with ai-related annotation: {third_batch_ai_total}")


Number of articles in the third sample: 100
columns in the third sample: Index(['article_id', 'index_id', 'scanned_time', 'title', 'sub_title',
       'corpus', 'label_ai_related', 'hype_level', 'section', 'date',
       'modified', 'hype_level_change'],
      dtype='object')
Number of articles with ai-related annotation: 35


In [10]:
# store the resolved third batch as CSV, without the index column
df_final_third_batch.to_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\third_batch_WSJ_final.csv",
    index=False,
)

In [11]:
# read the written CSV back (replacing the in-memory frame) and report its size and columns
df_final_third_batch = pd.read_csv(
    r"C:\Users\PC\Desktop\Masterarbeit\Code\WSJ\Annotation\third_batch_WSJ_final.csv"
)
reloaded_rows = len(df_final_third_batch)
reloaded_columns = df_final_third_batch.columns
print(f"Number of articles in the third sample: {reloaded_rows}")
print(f"columns in the third sample: {reloaded_columns}")

Number of articles in the third sample: 100
columns in the third sample: Index(['article_id', 'index_id', 'scanned_time', 'title', 'sub_title',
       'corpus', 'label_ai_related', 'hype_level', 'section', 'date',
       'modified', 'hype_level_change'],
      dtype='object')
