In [51]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, confusion_matrix

In [2]:
# Load a JSON file
def load_json(file_path):
    """Read a JSON file and return the parsed Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.
    """
    # The encoding is pinned to UTF-8 so non-ASCII text (e.g. Devanagari)
    # decodes identically on every platform instead of depending on the
    # locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [28]:
# Convert JSON to DataFrame, keeping only the last tag for each annotated span
def json_to_df(data):
    """Flatten annotation records into one row per annotated span.

    Each item is expected to provide ``item['id']``, ``item['data']['text']``
    and ``item['annotations'][i]['result'][j]['value']`` with the keys
    ``start``, ``end``, ``text`` and ``labels``.
    NOTE(review): this looks like a Label Studio export layout - confirm.

    When the same ``(start, end)`` span is tagged more than once within an
    item, only the last tag is kept. Spans are keyed by position rather than
    by word text, so a word that occurs several times in a sentence yields one
    row per occurrence, each with its own offsets and tag.

    Args:
        data: Iterable of task dictionaries as described above.

    Returns:
        DataFrame with columns ``id``, ``text``, ``start``, ``end``, ``word``
        and ``pos_tag``. The columns are present even when ``data`` yields no
        annotations.
    """
    columns = ['id', 'text', 'start', 'end', 'word', 'pos_tag']
    records = []
    for item in data:
        # Re-assigning an existing key keeps its original insertion position,
        # so rows stay in order of first annotation while the stored tag is
        # always the most recent one for that span.
        span_to_record = {}
        for annotation in item['annotations']:
            for result in annotation['result']:
                value = result['value']
                span_to_record[(value['start'], value['end'])] = {
                    'id': item['id'],
                    'text': item['data']['text'],
                    'start': value['start'],
                    'end': value['end'],
                    'word': value['text'],
                    'pos_tag': value['labels'][0],
                }
        records.extend(span_to_record.values())
    return pd.DataFrame(records, columns=columns)

In [53]:
def preprocess_text(text):
    """Return ``text`` with every hyphen replaced by a single space."""
    pieces = text.split('-')
    return ' '.join(pieces)

# Renumber the id column to 1..n in order of first appearance
def reindex_id_column(df):
    """Return a copy of ``df`` whose ``id`` values are renumbered from 1.

    Distinct ids are numbered 1, 2, ... in the order they first appear in the
    ``id`` column. The input frame is not modified; a new frame is returned.

    NOTE(review): when two annotators' frames are renumbered independently,
    their ids only correspond if both frames list the same tasks in the same
    order - confirm before merging on ``id``.

    Args:
        df: DataFrame with an ``id`` column.

    Returns:
        New DataFrame with the same columns and the renumbered ``id``.
    """
    unique_ids = df['id'].unique()
    id_mapping = {old_id: new_id for new_id, old_id in enumerate(unique_ids, start=1)}
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original ids.
    return df.assign(id=df['id'].map(id_mapping))

# Pair up the two annotators' rows that share id, start, and end
def merge_dataframes(df1, df2):
    """Inner-join two annotation frames on ``id``, ``start`` and ``end``.

    Columns that exist in both frames but are not join keys are suffixed with
    ``_annotator1`` (from ``df1``) and ``_annotator2`` (from ``df2``). Rows
    whose key has no match in the other frame are not part of the result.
    """
    join_keys = ['id', 'start', 'end']
    column_suffixes = ('_annotator1', '_annotator2')
    return df1.merge(df2, how='inner', on=join_keys, suffixes=column_suffixes)


In [44]:
# Annotation files, one per annotator
file1 = 'nlp_laksh.json'
file2 = 'nlp_tanish.json'

# Parse both files
data1 = load_json(file1)
data2 = load_json(file2)

# Flatten each file to a per-token frame and renumber its ids from 1
df1 = reindex_id_column(json_to_df(data1))
df2 = reindex_id_column(json_to_df(data2))

In [46]:
# Join the two annotators' frames on (id, start, end) and show the paired rows
merged_df = merge_dataframes(df1, df2)
display(merged_df)


Unnamed: 0,id,text_annotator1,start,end,word_annotator1,pos_tag_annotator1,text_annotator2,word_annotator2,pos_tag_annotator2
0,1,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,0,8,Ayodhya,PROPN,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,Ayodhya,PROPN
1,1,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,8,12,Ram,PROPN,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,Ram,PROPN
2,1,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,12,19,Mandir,NOUN,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,Mandir,NOUN
3,1,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,19,24,Pran,NOUN,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,Pran,NOUN
4,1,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,24,34,Pratishtha,NOUN,Ayodhya Ram Mandir Pran Pratishtha: मोहन भागवत...,Pratishtha,NOUN
...,...,...,...,...,...,...,...,...,...
291,20,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,30,36,प्राण,NOUN,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,प्राण,NOUN
292,20,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,36,46,प्रतिष्ठा,NOUN,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,प्रतिष्ठा,NOUN
293,20,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,46,50,में,ADP,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,में,ADP
294,20,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,50,57,जाएंगे,VERB,Ayodhya Ram Mandir: रामलला की प्राण प्रतिष्ठा ...,जाएंगे,VERB


In [55]:
def calculate_cohens_kappa(merged_df, labels=None):
    """Compute Cohen's kappa between the two annotators' POS tags.

    Prints the confusion matrix, the observed agreement (p_o) and the expected
    agreement (p_e), then returns ``(p_o - p_e) / (1 - p_e)``.

    Args:
        merged_df: Frame with row-aligned ``pos_tag_annotator1`` and
            ``pos_tag_annotator2`` columns.
        labels: Optional ordered list of tags used for the matrix rows and
            columns. Defaults to the 14-tag list below. Tags that occur in the
            data but are missing from the list are appended in order of first
            appearance.

    Returns:
        The kappa value. NaN is returned when kappa is undefined: there are no
        rows to compare, or the expected agreement is exactly 1 (both
        annotators used one identical tag for every row).
    """
    pos1 = merged_df['pos_tag_annotator1']
    pos2 = merged_df['pos_tag_annotator2']

    if labels is None:
        labels = ["NOUN", "PROPN", "VERB", "ADJ", "ADV", "ADP", "PRON", "DET", "CONJ", "PART", "PRON_WH", "PART_NEG", "NUM", "X"]
    else:
        labels = list(labels)

    # With an explicit `labels` list, confusion_matrix leaves out every pair
    # whose tag is not listed, which would silently skew p_o and p_e. Append
    # any observed tag that is missing so every row is counted.
    known = set(labels)
    for tag in list(pos1) + list(pos2):
        if tag not in known:
            known.add(tag)
            labels.append(tag)

    # Create confusion matrix
    conf_matrix = confusion_matrix(pos1, pos2, labels=labels)
    print("Confusion Matrix:")
    print(conf_matrix)

    total_observations = np.sum(conf_matrix)
    if total_observations == 0:
        # Nothing to compare: agreement is undefined (avoids 0/0).
        return float('nan')

    # Observed agreement (p_o): share of rows on the diagonal
    diagonal_sum = np.trace(conf_matrix)
    p_o = diagonal_sum / total_observations
    print(f"Observed Agreement (p_o): {p_o}")

    # Expected agreement (p_e): chance agreement from the marginal totals
    row_totals = np.sum(conf_matrix, axis=1)
    col_totals = np.sum(conf_matrix, axis=0)
    p_e = np.sum(row_totals * col_totals) / (total_observations ** 2)
    print(f"Expected Agreement (p_e): {p_e}")

    if p_e == 1:
        # Both annotators used a single identical tag throughout; the
        # denominator is zero and kappa is undefined.
        return float('nan')

    # Calculating Cohen's Kappa
    kappa = (p_o - p_e) / (1 - p_e)
    return kappa

In [56]:
# Calculate Cohen's Kappa
# (calculate_cohens_kappa also prints the confusion matrix, p_o and p_e)
kappa_score = calculate_cohens_kappa(merged_df)
print(f"Cohen's Kappa: {kappa_score}")

Confusion Matrix:
[[87  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 86  1  0  0  0  0  0  0  0  0  0  0  1]
 [ 0  1 26  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  8  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  4  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 39  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 30]]
Observed Agreement (p_o): 0.9864864864864865
Expected Agreement (p_e): 0.211479638422206
Cohen's Kappa: 0.9828621882553882
