In [None]:
! pip install nltk

In [None]:
! pip install openpyxl

In [None]:
! pip install -U scikit-learn

In [None]:
import ast
import os
import string

import numpy as np
import pandas as pd
from nltk.metrics import masi_distance
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import jaccard_distance
from sklearn.metrics import jaccard_score

In [None]:
# Two-level code book: every fine-grained code (dict key) maps to the
# high-level category (dict value) that modify_list adds alongside it.
# The table is written category-first so each group can be read at a glance;
# the resulting dict has the same keys, values and insertion order as a flat
# literal would.
high_level_mapping = {
    sub_code: category
    for category, sub_codes in [
        ('lack of context', [
            'abbreviation without context',
            'unclear intention',
        ]),
        ('content generation', [
            'write up',
            'generate persona',
            'exam reference',
            'generate question',
            'generate code template',
            'code',
            'generate test',
            'elaboration',
            'project assistance',
        ]),
        ('summarisation', [
            'summarise topic',
            'summarise literature',
        ]),
        ('understand concept', [
            'definition',
            'understand',
            'application',
        ]),
        ('it assistance', [
            'assistance in using application',
            'technical documentation',
            'set up',
        ]),
        ('programming assistance', [
            'debugging',
            'code validation',
            'code suggestion',
        ]),
        ('language assistance', [
            'writing refinement',
            'sensitive writing',
        ]),
        ('search engine', [
            'sutd-based information',
            'retrieve data',
            'find examples',
            'industry knowledge',
            'search engine query',
        ]),
        ('academic dishonesty', [
            'avoid plagiarism detection',
            'attempt to obtain answer for an assignment verbatim',
            'possible academic dishonesty',
        ]),
        ('testing', [
            'sanity check',
            'testing capabilities',
        ]),
        ('clarify', [
            'gptlearn technical specification',
        ]),
        ('prompt engineering', [
            'role-playing',
            'establishing context',
            'structured output format',
        ]),
        ('evaluation', [
            'idea validation',
        ]),
    ]
    for sub_code in sub_codes
}

# Codes that modify_list accepts as-is, without looking up a parent category.
only_level1 = [
    'non-course related',
    'follow up conversation',
    'idea generation',
    'malformed query',
    'correcting model response',
    'hallucination',
    'oracle',
]

In [None]:
def modify_list(codes_list, code_dependency, level1_only=None):
    """Normalise one annotation's codes and add the parent of every sub-code.

    Each code is stripped and lower-cased; empty entries are dropped. A code
    that is a key of ``code_dependency`` gets its mapped parent category added
    to the result. Codes that are already parent categories, or that appear in
    ``level1_only``, are kept unchanged. Any other code is reported with a
    printed message and still kept in the result.

    Args:
        codes_list: Iterable of raw code strings (may contain empty strings or
            ``None``).
        code_dependency: Mapping from sub-code to its parent category.
        level1_only: Codes that are valid without a parent. Defaults to the
            notebook-level ``only_level1`` list.

    Returns:
        A sorted list of the unique normalised codes plus the added parents.
        The caller's ``codes_list`` is not modified.
    """
    if level1_only is None:
        # Resolved at call time so the notebook-level list is only required
        # when the caller does not supply one.
        level1_only = only_level1

    cleaned = [item.strip().lower() for item in codes_list if item and item.strip()]
    parent_categories = set(code_dependency.values())

    # Collect into a separate set instead of appending to the list that is
    # being iterated over.
    expanded = set(cleaned)
    for code in cleaned:
        if code in parent_categories or code in level1_only:
            continue
        if code in code_dependency:
            expanded.add(code_dependency[code])
        else:
            print(f'Error: Unknown code -> {code} in the list {cleaned}')
    # Sorted so the output does not depend on set iteration order.
    return sorted(expanded)

# Load one DataFrame per coder spreadsheet found in ENCODED/.
all_data = []
# sorted(): os.listdir returns names in arbitrary order, and the position of a
# file in all_data decides which coder number it is given when the frames are
# merged, so the order must be reproducible.
for file in sorted(os.listdir('ENCODED')):
    # Names starting with '~' are skipped together with anything that is not .xlsx.
    if file.endswith('.xlsx') and not file.startswith('~'):
        print(file)
        df = pd.read_excel(os.path.join('ENCODED', file))

        # Normalise the quotation text so the same quotation matches across
        # files: remove punctuation, collapse every whitespace run (newlines
        # included) into a single space, then trim the ends.
        df['quotation'] = df['quotation'].str.replace(f'[{string.punctuation}]', '', regex=True)
        df['quotation'] = df['quotation'].str.replace(r'\s+', ' ', regex=True)
        df['quotation'] = df['quotation'].str.strip()
        df = df.sort_values(by=['document', 'quotation'])
        # Drop the unnecessary columns (tolerate files that have no 'comment' column)
        df = df.drop(columns=['comment'], errors='ignore')

        # Turn the comma-separated codes cell into a sorted list of codes that
        # also contains the parent category of every sub-code. Splitting on
        # ',' and stripping each piece accepts both "a, b" and "a,b".
        df['codes'] = (
            df['codes']
            .fillna('')
            .astype(str)
            .str.split(',')
            .apply(lambda codes: sorted(modify_list([c.strip() for c in codes], high_level_mapping)))
        )
        all_data.append(df)

In [None]:
# Outer-join the three coders' frames on (document, quotation) so a quotation
# coded by only some of the coders is still kept.
join_keys = ['document', 'quotation']
merged_df = (
    all_data[0]
    .merge(all_data[1], on=join_keys, how='outer', suffixes=('_coder1', '_coder2'))
    .merge(all_data[2], on=join_keys, how='outer')
)
# The third frame's column arrives unsuffixed; move it to 'codes_coder3'.
merged_df['codes_coder3'] = merged_df.pop('codes')

# NOTE: after this loop every codes_coderN cell is a *string*: the repr of the
# code list (e.g. "['a', 'b']"), with '[]' standing in for a missing value.
for coder_column in ('codes_coder1', 'codes_coder2', 'codes_coder3'):
    merged_df[coder_column] = merged_df[coder_column].fillna('[]').astype(str)


In [None]:
# Display only: the sorted copy is not assigned, so merged_df keeps its order.
merged_df.sort_values(['document', 'quotation'])

In [None]:
# Build the (coder, item, labels) triples expected by AnnotationTask.load_array.
# The codes_coderN cells hold the *string* repr of each code list (the merge
# cell applies .astype(str)), so they are parsed back with ast.literal_eval;
# calling frozenset() on the raw string would compare sets of characters
# instead of sets of codes.
# An empty list (no codes from that coder) is encoded as frozenset(['']), the
# same blank marker used by the sanity-check cells at the end of the notebook:
# NLTK's masi_distance divides by the size of the union, so two empty sets
# would raise ZeroDivisionError.
coder_columns = [f'codes_coder{i+1}' for i in range(len(all_data))]

task_data = []
for idx, row in merged_df.iterrows():
    try:
        # Decode every coder first so an unparsable row is skipped as a whole
        # instead of leaving a partial set of annotations for this item.
        row_labels = [frozenset(ast.literal_eval(row[person]) or ['']) for person in coder_columns]
    except (ValueError, SyntaxError, TypeError):
        # Report the offending row and carry on with the remaining items.
        for person in coder_columns:
            print(row[person], end=" ")
        print()
        continue
    task_data.extend((person, idx, labels) for person, labels in zip(coder_columns, row_labels))

# Initialize AnnotationTask with masi_distance
task = AnnotationTask(distance=masi_distance)

# Load the task data
task.load_array(task_data)

# Compute Krippendorff's alpha
alpha = task.alpha()
print(f"Krippendorff's alpha: {alpha}")

In [None]:
# MASI distance threshold: a pair of coders below it counts as agreeing.
criteria = 0.7

# Keep the rows on which every pair of coders is closer than `criteria`.
selected_rows = []
for idx, row in merged_df.iterrows():
    # The cells hold the string repr of the code list, so parse them back into
    # real lists; frozenset() on the raw string would give a set of characters.
    # An empty list becomes frozenset(['']) because NLTK's masi_distance
    # divides by the union size and would fail on two empty sets.
    coder1_labels = frozenset(ast.literal_eval(row['codes_coder1']) or [''])
    coder2_labels = frozenset(ast.literal_eval(row['codes_coder2']) or [''])
    coder3_labels = frozenset(ast.literal_eval(row['codes_coder3']) or [''])

    # All three pairs are checked (coder2 vs coder3 was previously never
    # compared; coder1 vs coder3 was computed twice instead).
    pairwise_distances = [
        masi_distance(coder1_labels, coder2_labels),
        masi_distance(coder1_labels, coder3_labels),
        masi_distance(coder2_labels, coder3_labels),
    ]
    if all(distance < criteria for distance in pairwise_distances):
        selected_rows.append(row)
low_distance_df = pd.DataFrame(selected_rows)

In [None]:
# Same agreement computation as for the full table, restricted to the rows in
# low_distance_df.
# The codes_coderN cells hold the *string* repr of each code list, so they are
# parsed back with ast.literal_eval; calling frozenset() on the raw string
# would compare sets of characters instead of sets of codes.
# An empty list is encoded as frozenset(['']) because NLTK's masi_distance
# divides by the size of the union and would fail on two empty sets.
coder_columns = [f'codes_coder{i+1}' for i in range(len(all_data))]

task_data = []
for idx, row in low_distance_df.iterrows():
    try:
        # Decode every coder first so an unparsable row is skipped as a whole
        # instead of leaving a partial set of annotations for this item.
        row_labels = [frozenset(ast.literal_eval(row[person]) or ['']) for person in coder_columns]
    except (ValueError, SyntaxError, TypeError):
        # Report the offending row and carry on with the remaining items.
        for person in coder_columns:
            print(row[person], end=" ")
        print()
        continue
    task_data.extend((person, idx, labels) for person, labels in zip(coder_columns, row_labels))

# Initialize AnnotationTask with masi_distance
task = AnnotationTask(distance=masi_distance)

# Load the task data
task.load_array(task_data)

# Compute Krippendorff's alpha
alpha = task.alpha()
print(f"Krippendorff's alpha: {alpha}")

In [None]:
# Display the rows that were selected into low_distance_df.
low_distance_df

In [None]:
# Write the selected rows to low_distance_df.xlsx in the working directory.
low_distance_df.to_excel("low_distance_df.xlsx")  

In [None]:
# Keep the rows on which every pair of coders is at least `criteria` apart
# (`criteria` is the threshold defined in the low-distance cell).
selected_rows = []
for idx, row in merged_df.iterrows():
    # The cells hold the string repr of the code list, so parse them back into
    # real lists; frozenset() on the raw string would give a set of characters.
    # An empty list becomes frozenset(['']) because NLTK's masi_distance
    # divides by the union size and would fail on two empty sets.
    coder1_labels = frozenset(ast.literal_eval(row['codes_coder1']) or [''])
    coder2_labels = frozenset(ast.literal_eval(row['codes_coder2']) or [''])
    coder3_labels = frozenset(ast.literal_eval(row['codes_coder3']) or [''])

    # All three pairs are checked (coder2 vs coder3 was previously never
    # compared; coder1 vs coder3 was computed twice instead).
    pairwise_distances = [
        masi_distance(coder1_labels, coder2_labels),
        masi_distance(coder1_labels, coder3_labels),
        masi_distance(coder2_labels, coder3_labels),
    ]
    if all(distance >= criteria for distance in pairwise_distances):
        # 'distance' records the last comparison, i.e. coder2 vs coder3.
        row['distance'] = pairwise_distances[-1]
        selected_rows.append(row)
high_distance_df = pd.DataFrame(selected_rows)

In [None]:
# Display the rows that were selected into high_distance_df.
high_distance_df

In [None]:
# Write the selected rows to high_distance_df.xlsx in the working directory.
high_distance_df.to_excel("high_distance_df.xlsx")  

In [None]:
# Collect every row in which at least one coder column holds the placeholder
# string '[]' (the value the merge cell fills in for a missing annotation and
# also the repr of an empty code list).
selected_rows = [
    row
    for _, row in merged_df.iterrows()
    if '[]' in (row['codes_coder1'], row['codes_coder2'], row['codes_coder3'])
]
missing_distance_df = pd.DataFrame(selected_rows)

In [None]:
# Display the rows that were selected into missing_distance_df.
missing_distance_df

In [None]:
# Write the selected rows to missing_distance_df.xlsx in the working directory.
missing_distance_df.to_excel("missing_distance_df.xlsx")  

In [None]:
# Sorted list of the distinct documents that have at least one row in
# missing_distance_df.
unique = sorted(set(missing_distance_df['document']))

In [None]:
import nltk
from nltk.metrics import agreement
from nltk.metrics.distance import masi_distance
from nltk.metrics.distance import jaccard_distance

# Toy data set as (coder, item, label) triples: two items, three coders.
# 'inky' and 'sue' give identical label sets; 'blinky' gives the blank marker
# frozenset(['']) on both items.
labelled_items = [
    ('text01', [('inky', ['love', 'gifts']), ('blinky', ['']), ('sue', ['love', 'gifts'])]),
    ('text02', [('inky', ['slime', 'gaming']), ('blinky', ['']), ('sue', ['slime', 'gaming'])]),
]
data = [
    (coder, item, frozenset(labels))
    for item, annotations in labelled_items
    for coder, labels in annotations
]

jaccard_task = nltk.AnnotationTask(distance=jaccard_distance)
masi_task = nltk.AnnotationTask(distance=masi_distance)
tasks = [jaccard_task, masi_task]
for task in tasks:
    task.load_array(data)
    print("Statistics for dataset using {}".format(task.distance))
    print("C: {}\nI: {}\nK: {}".format(task.C, task.I, task.K))
    # Print each agreement coefficient in turn, in the order Pi, Kappa,
    # Multi-Kappa, Alpha.
    for statistic_name, compute in (
        ("Pi", task.pi),
        ("Kappa", task.kappa),
        ("Multi-Kappa", task.multi_kappa),
        ("Alpha", task.alpha),
    ):
        print("{}: {}".format(statistic_name, compute()))
    print()

In [None]:
import nltk
from nltk.metrics import agreement
from nltk.metrics.distance import masi_distance
from nltk.metrics.distance import jaccard_distance

# Toy data set as (coder, item, label) triples: four items, three coders.
# Each tuple lists the label sets in coder order (inky, blinky, sue). All
# coders agree everywhere except text02, where 'blinky' gives the blank marker.
coders = ('inky', 'blinky', 'sue')
labels_by_item = {
    'text01': (['love', 'gifts'], ['love', 'gifts'], ['love', 'gifts']),
    'text02': (['slime', 'gaming'], [""], ['slime', 'gaming']),
    'text03': (['love', 'gifts'], ['love', 'gifts'], ['love', 'gifts']),
    'text04': (['love', 'gifts'], ['love', 'gifts'], ['love', 'gifts']),
}
data = [
    (coder, item, frozenset(labels))
    for item, labels_per_coder in labels_by_item.items()
    for coder, labels in zip(coders, labels_per_coder)
]

jaccard_task = nltk.AnnotationTask(distance=jaccard_distance)
masi_task = nltk.AnnotationTask(distance=masi_distance)
tasks = [jaccard_task, masi_task]
for task in tasks:
    task.load_array(data)
    print("Statistics for dataset using {}".format(task.distance))
    print("C: {}\nI: {}\nK: {}".format(task.C, task.I, task.K))
    # Print each agreement coefficient in turn, in the order Pi, Kappa,
    # Multi-Kappa, Alpha.
    for statistic_name, compute in (
        ("Pi", task.pi),
        ("Kappa", task.kappa),
        ("Multi-Kappa", task.multi_kappa),
        ("Alpha", task.alpha),
    ):
        print("{}: {}".format(statistic_name, compute()))
    print()