<a href="https://colab.research.google.com/github/GregoryG3/Thesis/blob/main/INITIALIZATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INITIALIZATION

# Importing necessary libraries

In [1]:
!pip install python-docx
!pip install krippendorff

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Collecting krippendorff
  Downloading krippendorff-0.6.1-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.6.1


In [43]:
import pandas as pd
import numpy as np
from datetime import datetime

from google.colab import drive
from os.path import join

import re
from scipy.stats import describe
import krippendorff
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

from matplotlib import pyplot as plt
import seaborn as sns

from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

# import random
# import os


# from transformers import (AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification,
#                           BertTokenizer, AdamW, Trainer, TrainingArguments, XLNetTokenizer, XLNetForSequenceClassification,
#                           RobertaTokenizer, RobertaForSequenceClassification, AutoModelForSeq2SeqLM, pipeline)
# import torch
# from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from tqdm import tqdm

# from sklearn.metrics import (f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,
                            #  accuracy_score, precision_score, recall_score)
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder

# import string
# import nltk
# from nltk.util import ngrams
# from nltk.corpus import stopwords
# from collections import Counter
# from bs4 import BeautifulSoup


# from matplotlib.lines import Line2D
# from wordcloud import WordCloud
# # from PIL import Image

# Load data

In [4]:
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/Thesis/Data"
FEEDBACK_DIR = join(PROJECT_DIR, "Feedback data")
PROCESSED_DIR = join(PROJECT_DIR, "Processed")

Mounted at /content/drive


Loading the dataset with all transaction data:


In [None]:
sr_full_feedback = pd.read_csv(join(FEEDBACK_DIR,"sr_full_feedback_data.csv"), encoding='latin1')
ab_full_feedback = pd.read_csv(join(FEEDBACK_DIR,"ab_full_feedback_data.csv"), encoding='latin1')
ha_full_feedback = pd.read_csv(join(FEEDBACK_DIR,"ha_full_feedback_data.csv"), encoding='latin1')

Loading the dataset with selected transaction data:

In [4]:
sr_text_feedback = pd.read_csv(join(FEEDBACK_DIR,"sr_selected_feedback_data.csv"), encoding='latin1')
ab_text_feedback = pd.read_csv(join(FEEDBACK_DIR,"ab_selected_feedback_data.csv"), encoding='latin1')
ha_text_feedback = pd.read_csv(join(FEEDBACK_DIR,"ha_selected_feedback_data.csv"), encoding='latin1')

Loading the dataset with manually coded
 data:



In [5]:
manually_coded = pd.read_csv(join(PROJECT_DIR,"Annotation data/manual_coding_majority_vote_2000_texts.csv"), encoding='latin1')
annotation_records = pd.read_csv(join(PROJECT_DIR,"Annotation data/manual_coding_6000_texts.csv"), encoding='latin1')

# 1 DESCRIPTIVE STATISTICS

## FULL FEEDBACK

**Share of feedback with textual data**

Explore whether the feedback is default text or have NaN **values**:


In [None]:
# Define default text lists
sr_default_text = [
    "Leave feedback here.",
    "Leave feedback here",
    "Leave Feedback Here",
    "Leave Feedback here",
    "leave feedback here",
    "leave feedback here.",
    "Leave feedback here,",
    "Leave feedback here!",
    "Leave feedback"
]
ab_default_text = ["No comment"]
ha_default_text = ["---", "--", "-"]

# Count NaN values in 'comment'
sr_nan_count = sr_full_feedback['comment'].isna().sum()
ab_nan_count = ab_full_feedback['comment'].isna().sum()
ha_nan_count = ha_full_feedback['comment'].isna().sum()

print(f"""-----------------------
Number of NaN values in Silk Road comments: {sr_nan_count}
Number of NaN values in AlphaBay comments: {ab_nan_count}
Number of NaN values in Hansa comments: {ha_nan_count}
-----------------------""")

# Apply default text conditions
sr_full_feedback['default_text'] = np.where(sr_full_feedback['comment'].isin(sr_default_text), True, False)
ab_full_feedback['default_text'] = np.where(ab_full_feedback['comment'].isin(ab_default_text), True, False)
ha_full_feedback['default_text'] = np.where(ha_full_feedback['comment'].isin(ha_default_text), True, False)

sr_default_com_count = sr_full_feedback['default_text'].sum()
ab_default_com_count = ab_full_feedback['default_text'].sum()
ha_default_com_count = ha_full_feedback['default_text'].sum()

print(f"""-----------------------
Number of default comments in Silk Road: {sr_default_com_count}
Number of default comments in AlphaBay: {ab_default_com_count}
Number of default comments in Hansa: {ha_default_com_count}
-----------------------""")

# Calculate proportions of non-NaN and False default_text
sr_total_count = len(sr_full_feedback)
ab_total_count = len(ab_full_feedback)
ha_total_count = len(ha_full_feedback)

prop_valid_non_default_sr = round(((sr_full_feedback['comment'].notna()) & (sr_full_feedback['default_text'] == False)).sum() / sr_total_count * 100, 2)
prop_valid_non_default_ab = round(((ab_full_feedback['comment'].notna()) & (ab_full_feedback['default_text'] == False)).sum() / ab_total_count * 100, 2)
prop_valid_non_default_ha = round(((ha_full_feedback['comment'].notna()) & (ha_full_feedback['default_text'] == False)).sum() / ha_total_count * 100, 2)

print(f"""-----------------------
Proportion of valid non-default comments in Silk Road: {prop_valid_non_default_sr}%
Proportion of valid non-default comments in AlphaBay: {prop_valid_non_default_ab}%
Proportion of valid non-default comments in Hansa: {prop_valid_non_default_ha}%
-----------------------""")

-----------------------
Number of NaN values in Silk Road comments: 1700
Number of NaN values in AlphaBay comments: 9
Number of NaN values in Hansa comments: 0
-----------------------
-----------------------
Number of default comments in Silk Road: 3458
Number of default comments in AlphaBay: 439714
Number of default comments in Hansa: 16246
-----------------------
-----------------------
Proportion of valid non-default comments in Silk Road: 97.21%
Proportion of valid non-default comments in AlphaBay: 81.38%
Proportion of valid non-default comments in Hansa: 62.08%
-----------------------


**Share of pos/neu/neg transactions**

In [None]:
# Recode ratings
sr_full_feedback['rating_recoded'] = np.where(sr_full_feedback['rating'] == 5, 'Positive',
                                              np.where(sr_full_feedback['rating'] == 1, 'Negative', 'Neutral'))
ha_full_feedback['rating_recoded'] = np.where(ha_full_feedback['rating'] == 'success', 'Positive',
                                              np.where(ha_full_feedback['rating'] == 'danger', 'Negative', 'Neutral'))

# Calculate proportions of ratings
prop_rating_sr = round(sr_full_feedback['rating_recoded'].value_counts(normalize=True) * 100, 2)
prop_rating_ab = round(ab_full_feedback['rating'].value_counts(normalize=True) * 100, 2)
prop_rating_ha = round(ha_full_feedback['rating_recoded'].value_counts(normalize=True) * 100, 2)

# Prepare table
colnames_table = ["SilkRoad", "AlphaBay", "Hansa"]
rownames_table = ["Positive", "Neutral", "Negative", "With text", "(N =)"]

rows1_3 = pd.concat([prop_rating_sr, prop_rating_ab, prop_rating_ha], axis=1)
rows1_3.columns = colnames_table
rows1_3 = rows1_3.loc[['Negative', 'Neutral', 'Positive'], :]

row4 = pd.DataFrame({
    "SilkRoad": [prop_valid_non_default_sr],
    "AlphaBay": [prop_valid_non_default_ab],
    "Hansa": [prop_valid_non_default_ha]
}, index=["With text"])

count_row5 = pd.DataFrame({
    "SilkRoad": [((sr_full_feedback['comment'].notna()) & (sr_full_feedback['default_text'] == False)).sum()],
    "AlphaBay": [((ab_full_feedback['comment'].notna()) & (ab_full_feedback['default_text'] == False)).sum()],
    "Hansa": [((ha_full_feedback['comment'].notna()) & (ha_full_feedback['default_text'] == False)).sum()]
}, index=["(N =)"])

table = pd.concat([rows1_3, row4,count_row5])

# Apply the percentage format to the first four rows
table.iloc[:4] = table.iloc[:4].applymap(lambda x: f"{x}%" if pd.notnull(x) else x)

# Format the fifth row with thousand separators
table.iloc[4] = table.iloc[4].apply(lambda x: f"{x:,.0f}".replace(",", " ") if pd.notnull(x) else x)

table.insert(0, 'Feedbacks', rownames_table)

table

Unnamed: 0,Feedbacks,SilkRoad,AlphaBay,Hansa
Negative,Positive,1.11%,2.56%,1.07%
Neutral,Neutral,2.39%,1.21%,2.42%
Positive,Negative,96.5%,96.23%,96.52%
With text,With text,97.21%,81.38%,62.08%
(N =),(N =),179 593,1 921 591,26 597


Creating table

In [None]:
# Create a Word document
doc = Document()
table_doc = doc.add_table(rows=table.shape[0]+1, cols=table.shape[1])

# Add headers
hdr_cells = table_doc.rows[0].cells
for i, column_name in enumerate(table.columns):
    hdr_cells[i].text = column_name

# Add the data
for i, row in enumerate(table.itertuples(), start=1):
    row_cells = table_doc.rows[i].cells
    for j, value in enumerate(row[1:], start=0):  # start=1 to skip the Index column
        row_cells[j].text = str(value)

# Style the table
for row in table_doc.rows:
    for cell in row.cells:
        cell.text = cell.text

# Save the document
doc.save(join(PROJECT_DIR, "Results/feedback_proportions.docx"))

## SELECTED FEEDBACK

Check if we have Null values in commnents:

In [6]:
# Count NaN values in 'comment'
sr_nan_c = sr_text_feedback['comment'].isna().sum()
ab_nan_c = ab_text_feedback['comment'].isna().sum()
ha_nan_c = ha_text_feedback['comment'].isna().sum()

print(f"""-----------------------
Number of NaN values in Silk Road comments: {sr_nan_c}
Number of NaN values in AlphaBay comments: {ab_nan_c}
Number of NaN values in Hansa comments: {ha_nan_c}
-----------------------""")

-----------------------
Number of NaN values in Silk Road comments: 2
Number of NaN values in AlphaBay comments: 382
Number of NaN values in Hansa comments: 0
-----------------------


In [7]:
sr_text_feedback = sr_text_feedback.loc[sr_text_feedback['comment'].notna()]
ab_text_feedback = ab_text_feedback.loc[ab_text_feedback['comment'].notna()]

Determine observations containing symbols only:


In [8]:
# function that checks whether a comment contains letters
def symbols_only(comment):
    if isinstance(comment, str) and re.search("[A-Za-z]+", comment):
        return "no"
    else:
        return "yes"

sr_text_feedback['symbols_only'] = sr_text_feedback['comment'].apply(symbols_only)
ab_text_feedback['symbols_only'] = ab_text_feedback['comment'].apply(symbols_only)
ha_text_feedback['symbols_only'] = ha_text_feedback['comment'].apply(symbols_only)

For SilkRoad, removing 3 feedbacks of extreme length that just repeatedly
state "SCAMMER SCAMMER":

Ana delete only indices: (65322, 65316) [minus 1 since R count from 1], but  decided to delete also this text: "SCAMMER SCAMMER SCAMMER"


In [9]:
indices_to_remove = ['sr67783', 'sr67776', 'sr67782']
sr_text_feedback = sr_text_feedback.drop(sr_text_feedback[sr_text_feedback['global_id'].isin(indices_to_remove)].index)

Get the statistics:


In [10]:
datasets = ["sr_text_feedback", "ab_text_feedback", "ha_text_feedback"]
abb = ["sr", "ab", "ha"]

# Function to check if the comment contains letters
def symbols_only(comment):
    if isinstance(comment, str) and re.search("[A-Za-z]+", comment):
        return "no"
    else:
        return "yes"

# Processing datasets
for i, dataset_name in enumerate(datasets):
    dataset = locals()[dataset_name]
    filtered_dataset = dataset.loc[dataset['symbols_only'] == "no"].copy()

    # Number of words in the comment
    filtered_dataset['n_words'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r'\b\w+\b', x)))

    # len(text.split())

    # Number of characters with punctuation
    filtered_dataset['char_punct'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r"[^ ]", x)))

    # Number of characters without punctuation
    filtered_dataset['char_no_punct'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r"[a-zA-Z0-9]", x)))

    # Number of punctuation characters
    filtered_dataset['punct'] = filtered_dataset['char_punct'] - filtered_dataset['char_no_punct']

    text_stats = filtered_dataset[['global_id', 'comment', 'rating', 'n_words', 'char_punct', 'char_no_punct', 'punct']]

    locals()[f"{abb[i]}_text_statistics"] = text_stats


In [11]:
def calculate_descriptives(dataframe, columns_to_exclude):
    columns = [col for col in dataframe.columns if col not in columns_to_exclude]
    data = dataframe[columns]
    results = {
        "Mean": data.mean(),
        "SD": data.std(),
        "Median": data.median(),
        "Min": data.min(),
        "Max": data.max()
    }
    desc_df = pd.DataFrame(results).T.round(2)
    desc_df.columns = columns
    return desc_df.T

columns_to_exclude = ['char_no_punct']


sr_descriptives = calculate_descriptives(sr_text_statistics.iloc[:, 3:7], columns_to_exclude)
ab_descriptives = calculate_descriptives(ab_text_statistics.iloc[:, 3:7], columns_to_exclude)
ha_descriptives = calculate_descriptives(ha_text_statistics.iloc[:, 3:7], columns_to_exclude)

Determin number of rows in each dataset:

In [12]:
sr_nrows = sr_text_feedback.shape[0]
ab_nrows = ab_text_feedback.shape[0]
ha_nrows = ha_text_feedback.shape[0]

print(f"""-----------------------
Number of values in Silk Road: {sr_nrows}
Number of values in AlphaBay: {ab_nrows}
Number of  values in Hansa: {ha_nrows}
-----------------------""")

-----------------------
Number of values in Silk Road: 178101
Number of values in AlphaBay: 1703887
Number of  values in Hansa: 25738
-----------------------


In [13]:
# Define the column names for descriptives
colnames_descriptives = ["Mean", "SD", "Median", "Min", "Max"]

# Create a DataFrame for the colnames_descriptives
descriptives_header = pd.DataFrame([colnames_descriptives],
                                   columns=colnames_descriptives)

# Define the first column data
first_col = [
    "N words",
    "N char (no punct.)",
    "N punct. char",
    "AlphaBay (n = 1 703 887)",
    "N words",
    "N char (no punct.)",
    "N punct. char",
    "Hansa (n = 25 738)",
    "N words",
    "N char (no punct.)",
    "N punct. char"
]

# Combine the data frames row-wise
combined_data = pd.concat([
    sr_descriptives,
    descriptives_header,
    ab_descriptives,
    descriptives_header,
    ha_descriptives
], ignore_index=True)

# Add the first column to the combined data frame
combined_data.insert(0, "First Col", first_col)

# Define the column names for the final table
colnames_table_2 = ["SilkRoad (n = 178 101)", "Mean", "SD",
                    "Median", "Min", "Max"]

# Assign the column names to the data frame
combined_data.columns = colnames_table_2

In [14]:
combined_data

Unnamed: 0,SilkRoad (n = 178 101),Mean,SD,Median,Min,Max
0,N words,12.66,16.64,8.0,1.0,992.0
1,N char (no punct.),61.4,72.79,43.0,1.0,4237.0
2,N punct. char,3.31,4.42,2.0,0.0,548.0
3,AlphaBay (n = 1 703 887),Mean,SD,Median,Min,Max
4,N words,9.97,9.27,7.0,1.0,533.0
5,N char (no punct.),47.19,40.91,36.0,1.0,1129.0
6,N punct. char,2.64,3.29,2.0,0.0,1056.0
7,Hansa (n = 25 738),Mean,SD,Median,Min,Max
8,N words,9.65,12.68,6.0,1.0,268.0
9,N char (no punct.),46.88,57.28,30.0,1.0,1848.0


Create the table:


In [15]:
# Create a new Word document
doc = Document()

# Add a table to the document
table = doc.add_table(rows=1, cols=len(combined_data.columns))

# Add the header row
hdr_cells = table.rows[0].cells
headers = combined_data.columns

for i, header in enumerate(headers):
    hdr_cells[i].text = header
    paragraph = hdr_cells[i].paragraphs[0]
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    run = paragraph.runs[0]
    run.font.bold = True
    run.font.name = 'Times New Roman'
    run.font.size = Pt(12)

# Add the data rows
for idx, row in combined_data.iterrows():
    cells = table.add_row().cells
    for i, header in enumerate(headers):
        value = row[header]
        cells[i].text = str(value) if pd.notna(value) else ""
        paragraph = cells[i].paragraphs[0]
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        run = paragraph.runs[0]
        run.font.name = 'Times New Roman'
        run.font.size = Pt(12)

# Apply bold to specific rows
bold_rows = [4, 8]
for row_idx in bold_rows:
    if row_idx < len(table.rows):
        row = table.rows[row_idx]
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.bold = True

# Apply border to specific rows
border_rows = [4, 8]
for row_idx in border_rows:
    if row_idx < len(table.rows):
        row = table.rows[row_idx]
        for cell in row.cells:
            tc = cell._element
            tcPr = tc.get_or_add_tcPr()
            for border in ["top", "bottom"]:
                el = OxmlElement(f'w:{border}')
                el.set(qn('w:val'), 'single')
                el.set(qn('w:sz'), '8')
                el.set(qn('w:space'), '0')
                el.set(qn('w:color'), '000000')
                tcPr.append(el)

# Add footnote
doc.add_paragraph().add_run().add_break()
footnote = doc.add_paragraph(
    "The n of transactions includes texts with punctuation only, but these are excluded from the descriptive statistics for clarity."
)
footnote.add_run("1").superscript = True

# Save the document
doc.save("text_descriptives.docx")

### Dataset for prediction

Get the dataset for the final prediction (exclude manually coded sample):

In [16]:
# Anti-join
ab_for_prediction = ab_text_feedback[~ab_text_feedback['global_id'].isin(manually_coded['global_id'])]

# Tidy up the rating terminology
sr_text_feedback['rating'] = sr_text_feedback['rating'].apply(lambda x: 'Negative' if x == 1 else ('Positive' if x == 5 else 'Neutral'))

ha_text_feedback['rating'] = ha_text_feedback['rating'].apply(lambda x: 'Negative' if x == 'danger' else ('Positive' if x == 'success' else 'Neutral'))

# # Tidy up the dates
ab_for_prediction.loc[:, 'date_left'] = pd.to_datetime(ab_for_prediction['date_left'].str.slice(stop=-6), format='%b %d, %Y', errors='coerce').dt.date

# Combine datasets
prediction_dataset = pd.concat([sr_text_feedback, ab_for_prediction, ha_text_feedback], ignore_index=True)

Save the file

In [17]:
prediction_dataset.to_csv(join(PROCESSED_DIR, "data_for_prediction.csv"), index=False)

## TOPICS OF MANUAL CODED DATA

Let's check the size of our topics in manual dataset

In [5]:
filter_col = [col for col in manually_coded if col.startswith('t_')]
filter_col2 = [col for col in manually_coded if col.startswith('m_')]

In [7]:
manually_coded[filter_col2].sum()

m_help_other_buyer     297
m_avoid_harm_seler      89
m_help_seller           78
m_rew_pun_seller       299
m_reach_seller          74
m_express_emo          530
m_share_facts         1396
m_other                128
dtype: int64

In [8]:
manually_coded[filter_col].sum()

t_communication     354
t_payment            75
t_refund             96
t_price              52
t_value             112
t_shipping         1016
t_product           889
t_feedback           60
t_vendor            343
t_generic           126
t_overall           258
t_other              66
dtype: int64

# 2 Calculating the IRR


## Analyzing the distribution of raters per text

In [49]:
print(f'Number of unique participant label:', annotation_records['participant.label'].nunique())
print(f'Number of unique participant code:', annotation_records['participant.code'].nunique())
print(f'Number of unique player email:', annotation_records['player.email'].nunique())

Number of unique participant label: 22
Number of unique participant code: 99
Number of unique player email: 35


In [61]:
sorted(annotation_records['participant.label'].unique())

['annotator_a1',
 'annotator_a10',
 'annotator_a2',
 'annotator_a3',
 'annotator_a4',
 'annotator_a5',
 'annotator_a6',
 'annotator_a7',
 'annotator_a8',
 'annotator_a9',
 'annotator_ana',
 'annotator_b1',
 'annotator_b10',
 'annotator_b11',
 'annotator_b2',
 'annotator_b3',
 'annotator_b4',
 'annotator_b5',
 'annotator_b6',
 'annotator_b7',
 'annotator_b9',
 'annotator_wojtek']

In [84]:
sorted(annotation_records['player.email'].unique())
# I guess that '07554973136' and '7554973136'is the same person

['07554973136',
 '7554973136',
 'AIperson',
 'DEITY',
 'F3456',
 'Haasje',
 'Iri',
 'JensKlooster',
 'Joy',
 'Kiran_SU31',
 'Mijke',
 'Nscarlet',
 'OH.',
 'Ossie',
 'SofieL',
 'SpilkoDinkov',
 'ana',
 'annotator_a1',
 'annotator_a10',
 'annotator_a7',
 'blowfish',
 'cvdm',
 'dolphin',
 'egel',
 'gadgetnr6',
 'itsme',
 'jeansinnott',
 'jessicaserena',
 'kreaseus',
 'marijana',
 'onlineresearch1',
 'phoenix23',
 'poalaaj',
 'relaxedtree',
 'wojtek']

Let's check overlapping:

In [62]:
unique_assignments = annotation_records[['participant.label', 'player.email']].drop_duplicates().sort_values(by = 'participant.label')
unique_assignments

Unnamed: 0,participant.label,player.email
4556,annotator_a1,JensKlooster
58,annotator_a1,egel
3773,annotator_a1,itsme
4739,annotator_a1,marijana
3029,annotator_a1,DEITY
...,...,...
1931,annotator_b7,jeansinnott
467,annotator_b7,Kiran_SU31
891,annotator_b7,egel
453,annotator_b9,Joy


No charts were generated by quickchart


In [82]:
unique_assignments_email = annotation_records[['player.email', 'participant.label']].drop_duplicates().sort_values(by = 'player.email')
unique_assignments_email

Unnamed: 0,player.email,participant.label
5615,07554973136,annotator_a2
890,7554973136,annotator_b6
889,7554973136,annotator_b5
886,7554973136,annotator_b1
2081,AIperson,annotator_a5
...,...,...
515,onlineresearch1,annotator_b1
1419,phoenix23,annotator_a4
2080,poalaaj,annotator_a3
530,relaxedtree,annotator_b3


 Let's check overlapping raters and emails for texts with exactly two unique raters:

In [7]:
# Filter texts rated by exactly two unique raters
text_rater_count = annotation_records.groupby('player.global_id')['participant.label'].nunique()
overlapping_texts = text_rater_count[text_rater_count == 2].index

# Get records with overlapping raters
overlapping_records = annotation_records[annotation_records['player.global_id'].isin(overlapping_texts)]

# Create lists of raters and emails
overlapping_raters = overlapping_records.groupby('player.global_id')['participant.label'].apply(list)
overlapping_emails = overlapping_records.groupby('player.global_id')['player.email'].apply(list)

# Get unique text values for each 'player.global_id'
text_data = annotation_records[['player.global_id', 'player.text']].drop_duplicates().set_index('player.global_id')

# Merge the lists of raters and emails with the text data
result_df = overlapping_raters.to_frame(name='raters').join(overlapping_emails.to_frame(name='emails')).join(text_data)
result_df.reset_index(inplace=True)
result_df

Unnamed: 0,player.global_id,raters,emails,player.text
0,ao1008851,"[annotator_ana, annotator_a4, annotator_a4]","[ana, OH., Joy]",arrived in 3 days! Top quality product. this...
1,ao1021586,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",Perfect as usual.
2,ao1151864,"[annotator_ana, annotator_a3, annotator_a3]","[ana, poalaaj, SpilkoDinkov]",A++++++
3,ao1226263,"[annotator_ana, annotator_a2, annotator_a2]","[ana, Ossie, Kiran_SU31]","First Class all the way, cheers"
4,ao1369676,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",def better batch then last week. i fault narc...
5,ao1398709,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",So glad she's back!
6,ao1518297,"[annotator_ana, annotator_a2, annotator_a2]","[ana, Ossie, Kiran_SU31]",great deal 1DD
7,ao1521647,"[annotator_ana, annotator_a4, annotator_a4]","[ana, OH., Joy]",only took 2 days to arrive on pt great seller
8,ao1600017,"[annotator_ana, annotator_a1, annotator_a1]","[ana, DEITY, JensKlooster]",Awesome stuff :)
9,ao1943615,"[annotator_ana, annotator_a2, annotator_a2]","[ana, SpilkoDinkov, Kiran_SU31]",Always the best. Fast shipping and consistent...


Verification of ratings comment per each of unique participan.email:

In [87]:
annotation_records.groupby('player.email')['player.text'].count()

player.email
07554973136           1
7554973136          170
AIperson            144
DEITY                60
F3456                58
Haasje              334
Iri                   1
JensKlooster         47
Joy                 234
Kiran_SU31          286
Mijke                56
Nscarlet             44
OH.                  92
Ossie                30
SofieL               78
SpilkoDinkov        106
ana                1976
annotator_a1         56
annotator_a10        93
annotator_a7         86
blowfish             36
cvdm                 51
dolphin              84
egel                294
gadgetnr6            45
itsme                54
jeansinnott         255
jessicaserena       125
kreaseus             41
marijana             82
onlineresearch1     109
phoenix23            27
poalaaj              66
relaxedtree          34
wojtek              742
Name: player.text, dtype: int64

Change '07554973136' to '7554973136' becuase I think is the same person:

In [8]:
annotation_records['player.email'] = annotation_records['player.email'].replace('07554973136', '7554973136')

## Krippendorff alpha

### Motive

In [33]:
# Prepare the data for Krippendorff's Alpha calculation
def prepare_data_for_krippendorff(data, category):
    pivoted = data.pivot_table(index='player.global_id', columns='participant.label', values=category)
    return pivoted.values


# Function to calculate Krippendorff's Alpha
def nominal_distance(x, y):
    return 0 if x == y else 1

def ordinal_distance(x, y):
    return (x - y) ** 2

def calculate_krippendorff_alpha(data, level_of_measurement):
    """
    Calculate Krippendorff's Alpha for given data and level of measurement.

    Args:
        data (numpy array): Matrix of shape (num_cases, num_raters)
        level_of_measurement (str): 'nominal' or 'ordinal'

    Returns:
        float: Krippendorff's Alpha
    """
    if level_of_measurement == 'nominal':
        distance_function = nominal_distance
    elif level_of_measurement == 'ordinal':
        distance_function = ordinal_distance
    else:
        raise ValueError("Invalid level_of_measurement. Must be 'nominal' or 'ordinal'.")

    n, k = data.shape
    assert n > 1, "Need at least two cases"
    assert k > 1, "Need at least two raters"

    # Handle missing values
    mask = np.isnan(data)
    observed_disagreement = 0
    total_ratings = 0

    # Calculate observed disagreement
    for i in range(n):
        for j in range(k):
            if not mask[i, j]:
                for l in range(j + 1, k):
                    if not mask[i, l]:
                        observed_disagreement += distance_function(data[i, j], data[i, l])
                        total_ratings += 1

    observed_disagreement /= total_ratings

    # Calculate expected disagreement
    all_ratings = data[~mask].flatten()
    unique, counts = np.unique(all_ratings, return_counts=True)
    probabilities = counts / len(all_ratings)

    expected_disagreement = 0
    for i, p_i in enumerate(probabilities):
        for j, p_j in enumerate(probabilities):
            expected_disagreement += p_i * p_j * distance_function(unique[i], unique[j])

    alpha = 1 - observed_disagreement / expected_disagreement
    return alpha


In [38]:
# Extract unique categories
motives = [
    "player.positive_negative",
    "player.help_others",
    "player.avoid_harm",
    "player.help_unrelated",
    "player.reciprocate_seller",
    "player.reach_seller",
    "player.vent_feelings",
    "player.say_facts"
]

# Corresponding labels
labels = [
  "Text polarity",
  "(12) Help other buyers...",
  "(11) Avoid harming the seller...",
  "(10) Help the seller...",
  "(14) Reward of punish the seller...",
  "(4) Reach out to the seller...",
  "(7) Express their feelings...",
  "(6) Share facts..."
]

# Calculate Krippendorff's Alpha for each motive:
alphas = []

for category in motives:
    prepared_data = prepare_data_for_krippendorff(annotation_records, category)
    prepared_data= prepared_data.transpose()
    measurement = 'ordinal' if category == "player.positive_negative" else 'nominal'
    alpha_value = krippendorff.alpha(reliability_data=prepared_data, level_of_measurement=measurement)
    # alpha_value = calculate_krippendorff_alpha(prepared_data, measurement)

    alphas.append(alpha_value)

# DataFrame with the results
irr_results_df = pd.DataFrame({
    'Category': motives,
    'Krippendorff_alpha': alphas,
    'Labels': labels
})

# Sort the DataFrame according to the specified order
sorting_indices = [0, 5, 7, 6, 3, 2, 1, 4]
irr_results_df = irr_results_df.iloc[sorting_indices]

irr_results_df

Unnamed: 0,Category,Krippendorff_alpha,Labels
0,player.positive_negative,0.892071,Text polarity
5,player.reach_seller,0.479104,(4) Reach out to the seller...
7,player.say_facts,0.472878,(6) Share facts...
6,player.vent_feelings,0.297104,(7) Express their feelings...
3,player.help_unrelated,0.241239,(10) Help the seller...
2,player.avoid_harm,0.381382,(11) Avoid harming the seller...
1,player.help_others,0.531204,(12) Help other buyers...
4,player.reciprocate_seller,0.396286,(14) Reward of punish the seller...


Verification my transformed data set for krippendorff:

In [20]:
sample_help_other = prepare_data_for_krippendorff(annotation_records, "player.help_others")
sample_help_other

participant.label,annotator_a1,annotator_a10,annotator_a2,annotator_a3,annotator_a4,annotator_a5,annotator_a6,annotator_a7,annotator_a8,annotator_a9,annotator_ana,annotator_b1,annotator_b10,annotator_b11,annotator_b2,annotator_b3,annotator_b4,annotator_b5,annotator_b6,annotator_b7,annotator_b9,annotator_wojtek
player.global_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ao1003730,,,,,,,,,,,0.0,0.0,0.0,,,,,,,,,
ao1005428,0.0,,,,0.0,,,,,,0.0,,,,,,,,,,,
ao1007033,,,,,,,,,,,0.0,,,,,,,0.0,,,0.0,
ao1008851,,,,,0.0,,,,,,0.0,,,,,,,,,,,
ao1009776,,,0.0,,,,,,,,0.0,,,,,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
as93168,,,,,,,,,,,0.0,0.0,,,,0.0,,,,,,
as93967,0.0,,,,,,,,,,0.0,,0.0,,,,,,,,,
as94848,,,,,,,,,,,0.0,0.0,,,,0.0,,,,,,
as95413,,,,0.0,,,,0.0,,,0.0,,,,,,,,,,,


### Topic

In [42]:
topics =["player.topic_communication",
        "player.topic_payment",
        "player.topic_refund",
        "player.topic_extras",
        "player.topic_value",
        "player.topic_shipping",
        "player.topic_product",
        "player.topic_feedback",
        "player.topic_vendor",
        "player.topic_generic",
        "player.topic_overall",
        "player.topic_other"]

# Calculate Krippendorff's Alpha for each topic:
alphas_topic = []

for category in topics:
    prepared_data = prepare_data_for_krippendorff(annotation_records, category)
    # alpha_value = krippendorff.alpha(reliability_data=prepared_data.transpose(), level_of_measurement=measurement)
    alpha_value = calculate_krippendorff_alpha(prepared_data, level_of_measurement='nominal')
    alphas_topic.append(alpha_value)

# DataFrame with the results
irr_topics = pd.DataFrame({
    'Category': topics,
    'Krippendorff_alpha': alphas_topic
})

irr_topics

Unnamed: 0,Category,Krippendorff_alpha
0,player.topic_communication,0.747162
1,player.topic_payment,0.635743
2,player.topic_refund,0.648458
3,player.topic_extras,0.642262
4,player.topic_value,0.598367
5,player.topic_shipping,0.824105
6,player.topic_product,0.735674
7,player.topic_feedback,0.586143
8,player.topic_vendor,0.554887
9,player.topic_generic,0.765333



Calculating the IRR for each label.

Text polarity evaluated using the ordinal Krippendorff's alpha, the rest
using the nominal variant.

# Check NULL

In [None]:
df_predict[df_predict['category'].isna()== True]

Unnamed: 0,seller,date_left,buyer,comment,itemid,price,rating,category,global_id,dataset
178139,HumboldtFarms,2016-03-21,d...g,"5/5 everything was perfect. Great product, gr...",40256,75.0,Positive,,ao988317,ab
178214,HumboldtFarms,2016-02-11,t...e,"Very nice stealth, amazing product, extremely ...",67535,32.0,Positive,,ao991751,ab
178228,HumboldtFarms,2016-09-16,l...g,Fast shipping and great quality. Thanks Humbolt.,70404,800.0,Positive,,ao970315,ab
178239,HumboldtFarms,2016-06-18,i...n,"took 6 days but well worth the wait, this stuf...",67546,242.0,Positive,,ao979637,ab
178282,HumboldtFarms,2016-09-01,d...n,"Twice as long to pack and ship out, not like o...",70404,807.0,Positive,,ao971902,ab
...,...,...,...,...,...,...,...,...,...,...
1880325,Telek0m,2017-02-22,n...m,"Original Medikamente, annehmbare Lieferzeit (3...",293365,3.2,Positive,,ao1831950,ab
1880328,the-original-pharmacypowder,,d...s,12 day no show with no response for the last c...,168444,10.0,Negative,,ao2243180,ab
1880332,TheCEO,2017-05-04,d...2,Received 4dd. Was 2.1 grams short. A lot more ...,339283,410.0,Neutral,,ao2210479,ab
1880358,weedim,2016-05-15,p...9,FE for my trusted supplier,150199,6.9,Positive,,ao2117649,ab


In [None]:
 df_predict['category'].isna().sum()

In [None]:
 (df_predict['category'].isna().sum()/ df_predict['category'].sum())*100