In [37]:
! pip install nltk



In [49]:
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance
import pandas as pd
import os
from sklearn.metrics import jaccard_score
import numpy as np

In [39]:
high_level_mapping = {
    'abbreviation without context':'lack of context',
    'unclear intention': 'lack of context',
    'write up': 'content generation',
    'generate persona': 'content generation',
    'exam reference': 'content generation',
    'generate question': 'content generation',
    'generate code template': 'content generation',
    'code': 'content generation',
    'generate test': 'content generation',
    'elaboration': 'content generation',
    'project assistance': 'content generation',
    'summarise topic': 'summarisation',
    'summarise literature': 'summarisation',
    'summarize topic': 'summarisation',
    'summarize literature': 'summarisation',
    'summarization': 'summarisation',
    'definition': 'understand concept',
    'understand': 'understand concept',
    'application': 'understand concept',
    'assistance in using application': 'it assistance',
    'technical documentation': 'it assistance',
    'set up': 'it assistance',
    'debugging': 'programming assistance',
    'code validation': 'programming assistance',
    'code suggestion': 'programming assistance',
    'writing refinement': 'language assistance',
    'sensitive writing': 'language assistance',
    'sutd-based information': 'search engine',
    'retrieve data': 'search engine',
    'find examples': 'search engine',
    'industry knowledge': 'search engine',
    'search engine query': 'search engine',
    'avoid plagiarism detection': 'academic dishonesty',
    'attempt to obtain answer for an assignment verbatim': 'academic dishonesty',
    'possible academic dishonesty': 'academic dishonesty',
    'sanity check': 'testing',
    'testing capabilities': 'testing',
    'gptlearn technical specification': 'clarify',
    'role-playing': 'prompt engineering',
    'establishing context': 'prompt engineering',
    'structured output format': 'prompt engineering',
    'idea validation': 'evaluation'
}

In [40]:
def modify_list(codes_list, code_dependency):
    codes_list = [code_dependency.get(item.lower(), item.lower()) for item in codes_list]
    codes_list = list(set(codes_list))
    # for key, value in code_dependency.items():
    #     if key in codes_list and value not in codes_list:
    #         codes_list.append(value)
    return codes_list

all_data = []
for file in os.listdir('ENCODED'):
    if file.endswith('.xlsx') and not file.startswith('~'):
        print(file)
        df = pd.read_excel(os.path.join('ENCODED', file))
        df = df.sort_values(by=['document', 'quotation'])
        df = df.drop(columns=['comment'])
        df['codes'] = df['codes'].fillna('')
        df['codes'] = df['codes'].astype(str)
        df = df.groupby('document').agg({
            'codes': ', '.join
        }).reset_index()
        df['codes'] = df['codes'].str.split(', ')
        df['codes'] = df['codes'].apply(lambda x: modify_list(x, high_level_mapping))
        df['codes'] = df['codes'].apply(sorted)
        all_data.append(df)

GPTLearn Coding - Daniel.xlsx
GPTLearn Coding - Ivan.xlsx
GPTLearn Coding - Keith.xlsx


In [41]:
for row in range(len(all_data[0]['document'])):
    for i in range(len(all_data)):
        for j in range(len(all_data)):
            assert all_data[i]['document'][row] == all_data[j]['document'][row]

In [42]:
merged_df = all_data[0].rename(columns={'codes': 'coder1'})

for idx, df in enumerate(all_data[1:], 2):  # Start from the second DataFrame and the second coder
    merged_df = merged_df.merge(df.rename(columns={'codes': f'coder{idx}'}), on='document', how='outer')

merged_df['coder1'] = merged_df['coder1'].fillna('[]')
merged_df = merged_df.set_index('document')
merged_df.head()

Unnamed: 0_level_0,coder1,coder2,coder3
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.docx,"[idea generation, prompt engineering, understa...",[content generation],[understand concept]
10.docx,"[academic dishonesty, non-course related]","[academic dishonesty, non-course related]","[academic dishonesty, non-course related]"
100.docx,"[it assistance, prompt engineering, understand...",[it assistance],"[it assistance, understand concept]"
1000.docx,"[non-course related, understand concept]",[testing],"[non-course related, understand concept]"
1001.docx,"[academic dishonesty, content generation, non-...","[academic dishonesty, content generation, non-...","[content generation, prompt engineering]"


In [43]:
# From CHAT GPT
def compute_agreement(codes1, codes2):
    all_labels = list(set(codes1) | set(codes2))
    bin_codes1 = [1 if label in codes1 else 0 for label in all_labels]
    bin_codes2 = [1 if label in codes2 else 0 for label in all_labels]
    return jaccard_score(bin_codes1, bin_codes2)

n_coders = len(all_data)
pairwise_agreement = np.zeros((n_coders, n_coders))
for i in range(n_coders):
    for j in range(n_coders):
        if i != j:
            total_agreement = 0
            for k in range(len(merged_df['coder1'])):
                total_agreement += compute_agreement(merged_df[f'coder{i+1}'][k], merged_df[f'coder{j+1}'][k])
            pairwise_agreement[i][j] = total_agreement / len(all_data[0])

alpha = np.mean(pairwise_agreement)

print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.42098977739562454


In [44]:
task_data = []
highest = 0
for idx, row in merged_df.iterrows():
    try:
        for i in range(len(all_data)):
            person = f'coder{i+1}'
            task_data.append((person, idx, frozenset(row[person])))
    except:
        for i in range(len(all_data)):
            person = f'coder{i+1}'
            print(row[person], end=" ")
        print()
    
# Initialize AnnotationTask with masi_distance
task = AnnotationTask(distance=masi_distance)

# Load the task data
task.load_array(task_data)

# Compute Krippendorff's alpha
alpha = task.alpha()
print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.5113675305345662


In [45]:
"""
Ivan&Keith
Krippendorff's alpha: 0.5131970923071444
Ivan&Daniel
Krippendorff's alpha: 0.5181746986583765
Keith&Daniel 
Krippendorff's alpha: 0.5410682126704474
"""

"\nIvan&Keith\nKrippendorff's alpha: 0.5131970923071444\nIvan&Daniel\nKrippendorff's alpha: 0.5181746986583765\nKeith&Daniel \nKrippendorff's alpha: 0.5410682126704474\n"

In [46]:
(0.5131970923071444 + 0.5181746986583765 + 0.5410682126704474)/3

0.524146667878656

In [47]:
# Continue from your previous code

# Compute MASI distance for each row and store them in a dictionary
masi_distances = {}

for idx, row in merged_df.iterrows():

    coder1_labels = frozenset(row['coder1'])
    coder2_labels = frozenset(row['coder2'])
    coder3_labels = frozenset(row['coder3'])
    
    distance = masi_distance(coder1_labels, coder2_labels)
    if distance == 1:
        print(idx)
        continue

    distance = masi_distance(coder1_labels, coder3_labels)
    if distance == 1:
        print(idx)
        continue

    distance = masi_distance(coder1_labels, coder3_labels)
    if distance == 1:
        print(idx)
        continue

1.docx
1000.docx
1005.docx
1008.docx
1009.docx
1010.docx
1011.docx
1012.docx
1013.docx
1014.docx
1015.docx
1016.docx
1017.docx
1018.docx
1019.docx
1020.docx
1021.docx
1022.docx
1023.docx
1024.docx
1025.docx
1026.docx
1027.docx
1028.docx
1029.docx
1031.docx
1032.docx
1033.docx
1034.docx
1035.docx
1036.docx
1038.docx
1039.docx
1040.docx
1041.docx
1042.docx
1043.docx
1044.docx
1045.docx
1046.docx
1047.docx
1048.docx
1053.docx
1056.docx
1057.docx
1058.docx
1071.docx
108.docx
1091.docx
1093.docx
1094.docx
1100.docx
1106.docx
111.docx
1118.docx
1120.docx
1121.docx
1123.docx
1141.docx
1142.docx
1145.docx
1146.docx
1158.docx
118.docx
119.docx
122.docx
123.docx
124.docx
125.docx
126.docx
129.docx
130.docx
131.docx
135.docx
137.docx
140.docx
146.docx
147.docx
148.docx
166.docx
167.docx
170.docx
171.docx
186.docx
187.docx
2.docx
224.docx
25.docx
255.docx
257.docx
258.docx
281.docx
300.docx
304.docx
311.docx
312.docx
313.docx
315.docx
321.docx
324.docx
325.docx
343.docx
344.docx
35.docx
364.docx
3

In [48]:
# Continue from your previous code

# Compute MASI distance for each row and store them in a dictionary
masi_distances = {}

for idx, row in merged_df.iterrows():

    coder1_labels = frozenset(row['coder1'])
    coder2_labels = frozenset(row['coder2'])
    coder3_labels = frozenset(row['coder3'])
    
    distance = masi_distance(coder1_labels, coder2_labels)
    if distance == 1:
        print(idx, row['coder1'], row['coder2'])

    distance = masi_distance(coder1_labels, coder3_labels)
    if distance == 1:
        print(idx, row['coder1'], row['coder3'])

    distance = masi_distance(coder1_labels, coder3_labels)
    if distance == 1:
        print(idx, row['coder2'], row['coder3'])

1.docx ['idea generation', 'prompt engineering', 'understand concept'] ['content generation']
1000.docx ['non-course related', 'understand concept'] ['testing']
1005.docx ['programming assistance', 'understand concept'] ['content generation']
1008.docx ['idea generation', 'lack of context'] ['content generation']
1009.docx ['idea generation', 'lack of context'] ['content generation']
1010.docx ['idea generation', 'lack of context'] ['content generation']
1011.docx ['idea generation', 'lack of context'] ['content generation']
1012.docx ['idea generation', 'lack of context'] ['content generation']
1013.docx ['idea generation', 'lack of context'] ['content generation']
1014.docx ['idea generation', 'lack of context'] ['content generation']
1015.docx ['idea generation', 'lack of context'] ['content generation']
1016.docx ['idea generation', 'lack of context'] ['content generation']
1017.docx ['idea generation', 'lack of context'] ['content generation']
1018.docx ['idea generation', 'lack o