In [None]:
! pip install nltk

In [None]:
! pip install openpyxl

In [None]:
! pip install -U scikit-learn

In [None]:
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance
from nltk.metrics.distance import jaccard_distance
import pandas as pd
import os
from sklearn.metrics import jaccard_score
import numpy as np
import string

In [None]:
# Maps every fine-grained code to the high-level category it belongs to.
# The table is written category-first (category -> its sub-codes) and then
# flattened into the sub-code -> category lookup that modify_list() consumes.
high_level_mapping = {
    sub_code: category
    for category, sub_codes in {
        'lack of context': (
            'abbreviation without context',
            'unclear intention',
        ),
        'content generation': (
            'write up',
            'generate persona',
            'exam reference',
            'generate question',
            'generate code template',
            'code',
            'generate test',
            'elaboration',
            'project assistance',
        ),
        'summarisation': (
            'summarise topic',
            'summarise literature',
        ),
        'understand concept': (
            'definition',
            'understand',
            'application',
        ),
        'it assistance': (
            'assistance in using application',
            'technical documentation',
            'set up',
        ),
        'programming assistance': (
            'debugging',
            'code validation',
            'code suggestion',
        ),
        'language assistance': (
            'writing refinement',
            'sensitive writing',
        ),
        'search engine': (
            'sutd-based information',
            'retrieve data',
            'find examples',
            'industry knowledge',
            'search engine query',
        ),
        'academic dishonesty': (
            'avoid plagiarism detection',
            'attempt to obtain answer for an assignment verbatim',
            'possible academic dishonesty',
        ),
        'testing': (
            'sanity check',
            'testing capabilities',
        ),
        'clarify': (
            'gptlearn technical specification',
        ),
        'prompt engineering': (
            'role-playing',
            'establishing context',
            'structured output format',
        ),
        'evaluation': (
            'idea validation',
        ),
    }.items()
    for sub_code in sub_codes
}

# Codes that modify_list() accepts on their own: they have no entry in
# high_level_mapping, neither as a sub-code nor as a category.
only_level1 = [
    'non-course related',
    'follow up conversation',
    'idea generation',
    'malformed query',
    'correcting model response',
    'hallucination',
    'oracle',
]

In [None]:
def modify_list(codes_list, code_dependency):
    """Normalise one quotation's codes and add the parent category of each sub-code.

    Args:
        codes_list: Iterable of code strings. Empty or whitespace-only entries
            are dropped; the rest are stripped and lower-cased.
        code_dependency: Mapping from sub-code to its parent category.

    Returns:
        A list (in no particular order, without duplicates) containing every
        normalised code plus the parent category of each recognised sub-code.
        Unrecognised codes are kept and reported on stdout.
    """
    cleaned = []
    for item in codes_list:
        if not item:
            continue
        label = item.strip().lower()
        if label:
            cleaned.append(label)

    # Computed once instead of rescanning dict.values() for every code.
    parent_codes = set(code_dependency.values())

    # Parents are collected in a separate set so the sequence being iterated
    # is never modified while the loop is running.
    result = set(cleaned)
    for code in cleaned:
        if code in parent_codes or code in only_level1:
            continue
        if code in code_dependency:
            result.add(code_dependency[code])
        else:
            print(f'Error: Unknown code -> {code} in the list {cleaned}')
    return list(result)

# Load every coder's workbook from ENCODED/ into all_data, one frame per file.
# Later cells address the frames by position (all_data[0] becomes coder1, ...),
# and os.listdir() returns names in arbitrary order, so the names are sorted to
# keep the file -> coder assignment identical on every run.
all_data = []
# Deletes every ASCII punctuation character, backslash included.
punctuation_table = str.maketrans('', '', string.punctuation)
for file in sorted(os.listdir('ENCODED')):
    # Names starting with '~' are skipped (presumably Excel lock/temp files).
    if file.endswith('.xlsx') and not file.startswith('~'):
        print(file)
        df = pd.read_excel(os.path.join('ENCODED', file))

        # Normalise quotations so the same passage produces the same merge key
        # for every coder: no punctuation, no whitespace at all, lower case.
        df['quotation'] = (
            df['quotation']
            .str.translate(punctuation_table)
            .str.replace(r'\s+', '', regex=True)
            .str.lower()
        )
        df = df.sort_values(by=['document', 'quotation'])
        # Drop the unnecessary columns (tolerating files without a comment column)
        df = df.drop(columns=['comment'], errors='ignore')

        # Turn the comma-separated codes cell into a sorted list of normalised
        # codes plus their parent categories. Splitting on ',' and stripping
        # each piece accepts "a,b" and "a,  b" as well as "a, b".
        df['codes'] = (
            df['codes']
            .fillna('')
            .astype(str)
            .str.split(',')
            .apply(lambda codes: modify_list([code.strip() for code in codes], high_level_mapping))
            .apply(sorted)
        )
        all_data.append(df)

In [None]:
# Align the three coders' annotations on (document, quotation). The outer joins
# keep quotations that only some coders annotated; the missing coder's cell is
# NaN after the merge.
merged_df = all_data[0].merge(all_data[1], on=['document', 'quotation'], how='outer', suffixes=('_coder1', '_coder2'))
merged_df = merged_df.merge(all_data[2], on=['document', 'quotation'], how='outer')
merged_df = merged_df.rename(columns={'codes': 'codes_coder3'})

# Label used when a coder assigned no code to a quotation (missing row or empty
# codes cell). A real label is needed instead of an empty collection because
# masi_distance() divides by the size of the union of the two label sets, which
# is zero when both sets are empty.
NO_CODE_LABEL = '<no code>'


def _as_code_list(value):
    """Return value as a non-empty list of codes, using NO_CODE_LABEL if there are none."""
    if isinstance(value, (list, tuple)) and len(value) > 0:
        return list(value)
    return [NO_CODE_LABEL]


# Keep the cells as real lists of codes. Converting them to their string repr
# (e.g. "['testing']") would make every later frozenset(row[...]) a set of
# single characters, so agreement would be measured on characters, not codes.
for column in ('codes_coder1', 'codes_coder2', 'codes_coder3'):
    merged_df[column] = merged_df[column].apply(_as_code_list)


In [None]:
# Preview the merged annotations ordered by document, then quotation.
# The sorted result is not assigned, so merged_df itself is left as it is.
merged_df.sort_values(by=['document', 'quotation'])

In [None]:
# Build the (coder, item, labels) triples that AnnotationTask expects: one
# triple per coder for every row of merged_df, with the row index as the item.
# NOTE(review): frozenset() over a str yields a set of single characters, so
# each codes_coder* cell must hold a collection of code labels rather than
# its string representation. Confirm against the cell that builds merged_df.
coder_columns = [f'codes_coder{i+1}' for i in range(len(all_data))]
task_data = []
for idx, row in merged_df.iterrows():
    try:
        # Collect the whole row first so a failure on one coder cannot leave
        # a partially recorded item behind.
        row_entries = [(person, idx, frozenset(row[person])) for person in coder_columns]
    except (KeyError, TypeError):
        # Missing coder column or a non-iterable cell (e.g. NaN): show the
        # offending row and leave it out of the agreement computation.
        print(*(row.get(person) for person in coder_columns))
        continue
    task_data.extend(row_entries)

# Initialize AnnotationTask with masi_distance
task = AnnotationTask(distance=masi_distance)

# Load the task data
task.load_array(task_data)

# Compute Krippendorff's alpha
alpha = task.alpha()
print(f"Krippendorff's alpha: {alpha}")

In [None]:
def calculate_alpha(df, coder_columns=None):
    """Compute and print Krippendorff's alpha (MASI distance) for the rows of df.

    Args:
        df: DataFrame with one column of code labels per coder. Each cell is
            passed to frozenset(), so it must be a collection of labels.
            NOTE(review): a plain string cell would be split into single
            characters. Confirm what the caller stores in these columns.
        coder_columns: Names of the per-coder columns. Defaults to
            'codes_coder1' .. 'codes_coderN', where N is len(all_data).

    Returns:
        The alpha value reported by AnnotationTask.alpha(), or float('nan')
        when df contributes no usable rows.
    """
    if coder_columns is None:
        coder_columns = [f'codes_coder{i+1}' for i in range(len(all_data))]

    task_data = []
    for idx, row in df.iterrows():
        try:
            # Collect the whole row first so a failure on one coder cannot
            # leave a partially recorded item behind.
            row_entries = [(person, idx, frozenset(row[person])) for person in coder_columns]
        except (KeyError, TypeError):
            # Missing coder column or a non-iterable cell (e.g. NaN): show the
            # offending row and leave it out of the agreement computation.
            print(*(row.get(person) for person in coder_columns))
            continue
        task_data.extend(row_entries)

    # Nothing to compare (for example an empty selection): report that
    # explicitly instead of handing an empty task to the alpha computation.
    if not task_data:
        print("Krippendorff's alpha: nan (no annotations to compare)")
        return float('nan')

    # Initialize AnnotationTask with masi_distance
    task = AnnotationTask(distance=masi_distance)

    # Load the task data
    task.load_array(task_data)

    # Compute Krippendorff's alpha
    alpha = task.alpha()
    print(f"Krippendorff's alpha: {alpha}")
    return alpha

In [None]:
# Split merged_df into rows where coder 2 is far from both other coders while
# coders 1 and 3 are close to each other (selected), and all remaining rows
# (rejected). Each distance is computed only if the previous check passed.
criteria = 0.5
selected_rows = []
rejected_rows = []
for idx, row in merged_df.iterrows():
    coder1_labels, coder2_labels, coder3_labels = (
        frozenset(row[column])
        for column in ('codes_coder1', 'codes_coder2', 'codes_coder3')
    )

    # Coder 2 must be further than `criteria` from coder 1 ...
    distance1 = masi_distance(coder2_labels, coder1_labels)
    if distance1 <= criteria:
        rejected_rows.append(row)
        continue

    # ... and further than `criteria` from coder 3 ...
    distance2 = masi_distance(coder2_labels, coder3_labels)
    if distance2 <= criteria:
        rejected_rows.append(row)
        continue

    # ... while coders 1 and 3 stay within the tighter bound criteria - 0.1.
    distance3 = masi_distance(coder1_labels, coder3_labels)
    if distance3 >= criteria - 0.1:
        rejected_rows.append(row)
        continue

    # Store the mean distance of coder 2 to the other two coders on the row.
    row['distance'] = (distance1 + distance2) / 2
    selected_rows.append(row)

high_distance_df = pd.DataFrame(selected_rows)
low_distance_df = pd.DataFrame(rejected_rows)
# Every row must land in exactly one of the two frames.
assert len(high_distance_df) + len(low_distance_df) == len(merged_df)
calculate_alpha(high_distance_df)
calculate_alpha(low_distance_df)
calculate_alpha(merged_df)

In [None]:
# Save high_distance_df to an Excel workbook; the path is relative, so the file
# is created in the notebook's current working directory.
high_distance_df.to_excel("high_distance_df_daniel.xlsx")