# INITIALIZATION

# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from google.colab import drive
from os.path import join

import re
from scipy.stats import describe
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

from matplotlib import pyplot as plt
import seaborn as sns

import random

# Load data

In [2]:
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/Thesis/Data"
FEEDBACK_DIR = join(PROJECT_DIR, "Feedback data")
PROCESSED_DIR = join(PROJECT_DIR, "Processed")

Mounted at /content/drive


Loading the dataset with all transaction data:


In [3]:
ab_full_feedback = pd.read_csv(join(FEEDBACK_DIR,"ab_full_feedback_data.csv"), encoding='latin1')

Loading the dataset with selected transaction data:

In [4]:
ab_text_feedback = pd.read_csv(join(FEEDBACK_DIR,"ab_selected_feedback_data.csv"), encoding='latin1')

Loading the dataset with manually coded
 data:



In [5]:
manually_coded = pd.read_csv(join(PROJECT_DIR,"Annotation data/manual_coding_majority_vote_2000_texts.csv"), encoding='latin1')
annotation_records = pd.read_csv(join(PROJECT_DIR,"Annotation data/manual_coding_6000_texts.csv"), encoding='latin1')

# 1 Calculating the IRR


## Analyzing the distribution of raters per text

In [None]:
print(f'Number of unique participant label:', annotation_records['participant.label'].nunique())
print(f'Number of unique participant code:', annotation_records['participant.code'].nunique())
print(f'Number of unique player email:', annotation_records['player.email'].nunique())

Number of unique participant label: 22
Number of unique participant code: 99
Number of unique player email: 35


In [None]:
sorted(annotation_records['participant.label'].unique())

['annotator_a1',
 'annotator_a10',
 'annotator_a2',
 'annotator_a3',
 'annotator_a4',
 'annotator_a5',
 'annotator_a6',
 'annotator_a7',
 'annotator_a8',
 'annotator_a9',
 'annotator_ana',
 'annotator_b1',
 'annotator_b10',
 'annotator_b11',
 'annotator_b2',
 'annotator_b3',
 'annotator_b4',
 'annotator_b5',
 'annotator_b6',
 'annotator_b7',
 'annotator_b9',
 'annotator_wojtek']

In [None]:
sorted(annotation_records['player.email'].unique())
# I guess that '07554973136' and '7554973136'is the same person

['07554973136',
 '7554973136',
 'AIperson',
 'DEITY',
 'F3456',
 'Haasje',
 'Iri',
 'JensKlooster',
 'Joy',
 'Kiran_SU31',
 'Mijke',
 'Nscarlet',
 'OH.',
 'Ossie',
 'SofieL',
 'SpilkoDinkov',
 'ana',
 'annotator_a1',
 'annotator_a10',
 'annotator_a7',
 'blowfish',
 'cvdm',
 'dolphin',
 'egel',
 'gadgetnr6',
 'itsme',
 'jeansinnott',
 'jessicaserena',
 'kreaseus',
 'marijana',
 'onlineresearch1',
 'phoenix23',
 'poalaaj',
 'relaxedtree',
 'wojtek']

Let's check overlapping:

In [None]:
unique_assignments = annotation_records[['participant.label', 'player.email']].drop_duplicates().sort_values(by = 'participant.label')
unique_assignments

Unnamed: 0,participant.label,player.email
4556,annotator_a1,JensKlooster
58,annotator_a1,egel
3773,annotator_a1,itsme
4739,annotator_a1,marijana
3029,annotator_a1,DEITY
...,...,...
1931,annotator_b7,jeansinnott
467,annotator_b7,Kiran_SU31
891,annotator_b7,egel
453,annotator_b9,Joy


In [None]:
unique_assignments_email = annotation_records[['player.email', 'participant.label']].drop_duplicates().sort_values(by = 'player.email')
unique_assignments_email

Unnamed: 0,player.email,participant.label
5615,07554973136,annotator_a2
890,7554973136,annotator_b6
889,7554973136,annotator_b5
886,7554973136,annotator_b1
2081,AIperson,annotator_a5
...,...,...
515,onlineresearch1,annotator_b1
1419,phoenix23,annotator_a4
2080,poalaaj,annotator_a3
530,relaxedtree,annotator_b3


 Let's check overlapping raters and emails for texts with exactly two unique raters:

In [None]:
# Filter texts rated by exactly two unique raters
text_rater_count = annotation_records.groupby('player.global_id')['participant.label'].nunique()
overlapping_texts = text_rater_count[text_rater_count == 2].index

# Get records with overlapping raters
overlapping_records = annotation_records[annotation_records['player.global_id'].isin(overlapping_texts)]

# Create lists of raters and emails
overlapping_raters = overlapping_records.groupby('player.global_id')['participant.label'].apply(list)
overlapping_emails = overlapping_records.groupby('player.global_id')['player.email'].apply(list)

# Get unique text values for each 'player.global_id'
text_data = annotation_records[['player.global_id', 'player.text']].drop_duplicates().set_index('player.global_id')

# Merge the lists of raters and emails with the text data
result_df = overlapping_raters.to_frame(name='raters').join(overlapping_emails.to_frame(name='emails')).join(text_data)
result_df.reset_index(inplace=True)
result_df

Unnamed: 0,player.global_id,raters,emails,player.text
0,ao1008851,"[annotator_ana, annotator_a4, annotator_a4]","[ana, OH., Joy]",arrived in 3 days! Top quality product. this...
1,ao1021586,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",Perfect as usual.
2,ao1151864,"[annotator_ana, annotator_a3, annotator_a3]","[ana, poalaaj, SpilkoDinkov]",A++++++
3,ao1226263,"[annotator_ana, annotator_a2, annotator_a2]","[ana, Ossie, Kiran_SU31]","First Class all the way, cheers"
4,ao1369676,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",def better batch then last week. i fault narc...
5,ao1398709,"[annotator_b1, annotator_ana, annotator_b1]","[cvdm, ana, Haasje]",So glad she's back!
6,ao1518297,"[annotator_ana, annotator_a2, annotator_a2]","[ana, Ossie, Kiran_SU31]",great deal 1DD
7,ao1521647,"[annotator_ana, annotator_a4, annotator_a4]","[ana, OH., Joy]",only took 2 days to arrive on pt great seller
8,ao1600017,"[annotator_ana, annotator_a1, annotator_a1]","[ana, DEITY, JensKlooster]",Awesome stuff :)
9,ao1943615,"[annotator_ana, annotator_a2, annotator_a2]","[ana, SpilkoDinkov, Kiran_SU31]",Always the best. Fast shipping and consistent...


Verification of ratings comment per each of unique participan.email:

In [None]:
annotation_records.groupby('player.email')['player.text'].count()

player.email
07554973136           1
7554973136          170
AIperson            144
DEITY                60
F3456                58
Haasje              334
Iri                   1
JensKlooster         47
Joy                 234
Kiran_SU31          286
Mijke                56
Nscarlet             44
OH.                  92
Ossie                30
SofieL               78
SpilkoDinkov        106
ana                1976
annotator_a1         56
annotator_a10        93
annotator_a7         86
blowfish             36
cvdm                 51
dolphin              84
egel                294
gadgetnr6            45
itsme                54
jeansinnott         255
jessicaserena       125
kreaseus             41
marijana             82
onlineresearch1     109
phoenix23            27
poalaaj              66
relaxedtree          34
wojtek              742
Name: player.text, dtype: int64

Change '07554973136' to '7554973136':

In [None]:
annotation_records['player.email'] = annotation_records['player.email'].replace('07554973136', '7554973136')

## Krippendorff alpha, Fleiss Kappas and PABAKS

### Motives & sentiment
First I check if my code works correctly and I receive the same results as in the article:

In [None]:
# Prepare the data for Krippendorff's Alpha calculation
def prepare_data_for_krippendorff(data, category):
    pivoted = data.pivot_table(index='player.global_id', columns='participant.label', values=category)
    return pivoted.values

# Function to calculate Krippendorff's Alpha
def nominal_distance(x, y):
    return 0 if x == y else 1

def ordinal_distance(x, y):
    return (x - y) ** 2

def calculate_krippendorff_alpha(data, level_of_measurement):
    """
    Calculate Krippendorff's Alpha for given data and level of measurement.

    Args:
        data (numpy array): Matrix of shape (num_cases, num_raters)
        level_of_measurement (str): 'nominal' or 'ordinal'

    Returns:
        float: Krippendorff's Alpha
    """
    if level_of_measurement == 'nominal':
        distance_function = nominal_distance
    elif level_of_measurement == 'ordinal':
        distance_function = ordinal_distance
    else:
        raise ValueError("Invalid level_of_measurement. Must be 'nominal' or 'ordinal'.")

    n, k = data.shape
    assert n > 1, "Need at least two cases"
    assert k > 1, "Need at least two raters"

    # Handle missing values
    mask = np.isnan(data)
    observed_disagreement = 0
    total_ratings = 0

    # Calculate observed disagreement
    for i in range(n):
        for j in range(k):
            if not mask[i, j]:
                for l in range(j + 1, k):
                    if not mask[i, l]:
                        observed_disagreement += distance_function(data[i, j], data[i, l])
                        total_ratings += 1

    observed_disagreement /= total_ratings

    # Calculate expected disagreement
    all_ratings = data[~mask].flatten()
    unique, counts = np.unique(all_ratings, return_counts=True)
    probabilities = counts / len(all_ratings)

    expected_disagreement = 0
    for i, p_i in enumerate(probabilities):
        for j, p_j in enumerate(probabilities):
            expected_disagreement += p_i * p_j * distance_function(unique[i], unique[j])

    alpha = 1 - observed_disagreement / expected_disagreement
    return alpha

# Prepare the data for Fleiss Kappa calculation
def prepare_data_for_fleiss_kappa(data, category):
    pivoted = data.pivot_table(index='player.global_id', columns='participant.label', values=category, aggfunc='first')

    # Remove duplicate ratings by keeping the first occurrence
    pivoted = pivoted.groupby(level=0).first()

    # Ensure consistent number of raters
    mode_raters = pivoted.notna().sum(axis=1).mode()[0]

    # Create an array to hold the counts
    num_categories = 5  # Assuming ratings are integers from 0 to 4
    ratings = []

    for row in pivoted.itertuples(index=False):
        valid_ratings = [int(x) for x in row if not np.isnan(x)]
        if len(valid_ratings) != mode_raters:
            continue  # Skip rows that do not have the consistent number of raters
        counts = np.bincount(valid_ratings, minlength=num_categories)
        ratings.append(counts)

    return np.array(ratings)

# PABAK
def calculate_pabak(data):
    n, k = data.shape
    agreement_count = 0
    total_pairs = 0

    for i in range(n):
        ratings = data[i, :]
        non_nan_ratings = ratings[~np.isnan(ratings)]
        if len(non_nan_ratings) > 1:
            unique, counts = np.unique(non_nan_ratings, return_counts=True)
            agreement_count += sum(count * (count - 1) for count in counts)
            total_pairs += len(non_nan_ratings) * (len(non_nan_ratings) - 1)

    po = agreement_count / total_pairs if total_pairs > 0 else 0
    pabak = 2 * po - 1
    return pabak

In [None]:
# Extract unique categories
motives = [
    "player.positive_negative",
    "player.help_others",
    "player.avoid_harm",
    "player.help_unrelated",
    "player.reciprocate_seller",
    "player.reach_seller",
    "player.vent_feelings",
    "player.say_facts"
]

# Corresponding labels
labels = [
  "Text polarity",
  "(12) Help other buyers...",
  "(11) Avoid harming the seller...",
  "(10) Help the seller...",
  "(14) Reward of punish the seller...",
  "(4) Reach out to the seller...",
  "(7) Express their feelings...",
  "(6) Share facts..."
]

# Calculate Krippendorff's Alpha for each motive:
alphas = []
kappas = []
pabaks = []

for category in motives:
    prepared_data = prepare_data_for_krippendorff(annotation_records, category)
    measurement = 'ordinal' if category == "player.positive_negative" else 'nominal'
    alpha_value = calculate_krippendorff_alpha(prepared_data, measurement) # my code
    alphas.append(round(alpha_value, 2))

    fleiss_data = prepare_data_for_fleiss_kappa(annotation_records, category)
    if fleiss_data.shape[0] > 0:  # Ensure there's data to process
        kappa_value = fleiss_kappa(fleiss_data)
        kappas.append(round(kappa_value, 2))
    else:
        kappas.append(np.nan)  # Append NaN if no data is available

    pabak_value = calculate_pabak(prepared_data)
    pabaks.append(round(pabak_value, 2))

# DataFrame with the results
irr_results_df = pd.DataFrame({
    'Category': motives,
    'Krippendorff_alpha': alphas,
    'Fleiss_kappa': kappas,
    'PABAK': pabaks,
    'Labels': labels
})

# Sort the DataFrame according to the specified order
sorting_indices = [0, 5, 7, 6, 3, 2, 1, 4]
irr_results_df = irr_results_df.iloc[sorting_indices]

irr_results_df

Unnamed: 0,Category,Krippendorff_alpha,Fleiss_kappa,PABAK,Labels
0,player.positive_negative,0.89,0.73,0.66,Text polarity
5,player.reach_seller,0.48,0.48,0.9,(4) Reach out to the seller...
7,player.say_facts,0.48,0.48,0.53,(6) Share facts...
6,player.vent_feelings,0.3,0.3,0.42,(7) Express their feelings...
3,player.help_unrelated,0.25,0.24,0.84,(10) Help the seller...
2,player.avoid_harm,0.38,0.39,0.86,(11) Avoid harming the seller...
1,player.help_others,0.53,0.53,0.71,(12) Help other buyers...
4,player.reciprocate_seller,0.4,0.4,0.65,(14) Reward of punish the seller...


### Topics & emotional

In [None]:
topics =[
        'player.unemotional_emotional',
        "player.topic_communication",
        "player.topic_payment",
        "player.topic_refund",
        "player.topic_extras",
        "player.topic_value",
        "player.topic_shipping",
        "player.topic_product",
        "player.topic_feedback",
        "player.topic_vendor",
        "player.topic_generic",
        "player.topic_overall",
        "player.topic_other"]

# Calculate Krippendorff's Alpha for each topic:
alphas_topic = []
kappas_topc = []
pabaks_topic = []

for category in topics:
    prepared_data = prepare_data_for_krippendorff(annotation_records, category)
    measurement = 'ordinal' if category == "player.positive_negative" else 'nominal'
    alpha_value = calculate_krippendorff_alpha(prepared_data, measurement)
    alphas_topic.append(round(alpha_value, 3))

    fleiss_data = prepare_data_for_fleiss_kappa(annotation_records, category)
    if fleiss_data.shape[0] > 0:  # Ensure there's data to process
        kappa_value = fleiss_kappa(fleiss_data)
        kappas_topc.append(round(kappa_value, 3))
    else:
        kappas_topc.append(np.nan)  # Append NaN if no data is available

    pabak_value = calculate_pabak(prepared_data)
    pabaks_topic.append(round(pabak_value, 3))

irr_topics = pd.DataFrame({
    'Category': topics,
    'Krippendorff_alpha': alphas_topic,
    'Fleiss_kappa': kappas_topc,
    'PABAK': pabaks_topic
})

irr_topics

Unnamed: 0,Category,Krippendorff_alpha,Fleiss_kappa,PABAK
0,player.unemotional_emotional,0.29,0.289,0.026
1,player.topic_communication,0.747,0.748,0.848
2,player.topic_payment,0.636,0.635,0.943
3,player.topic_refund,0.648,0.649,0.934
4,player.topic_extras,0.642,0.645,0.962
5,player.topic_value,0.598,0.6,0.903
6,player.topic_shipping,0.824,0.824,0.824
7,player.topic_product,0.736,0.736,0.739
8,player.topic_feedback,0.586,0.585,0.949
9,player.topic_vendor,0.555,0.556,0.719


#### Emotion - changing category

The lowest PABAK coefficient was found in the text emotionality category (0.026), underscoring the challenges associated with assessing the emotional content of text. The low agreement in this category may stem from the nuanced and subjective nature of interpreting emotions, which can vary greatly among individual.

We can merge the 4-categories into 2:<br>
Text originally coded as very emotional or rather emotional will be reclassified as emotional, while the rest is marked as unemotiona


In [18]:
annotation_records['player.unemotional_emotional'].value_counts()

player.unemotional_emotional
1    2455
2    1965
3    1116
4     464
Name: count, dtype: int64

In [19]:
annotation_records['player.unemotional_emotional'] = annotation_records['player.unemotional_emotional'].apply(lambda x: 1 if x in [3, 4] else 0)

In [20]:
annotation_records['player.unemotional_emotional'].value_counts()

player.unemotional_emotional
0    4420
1    1580
Name: count, dtype: int64

Now we can check again IRR:

In [None]:
# Calculate Krippendorff's Alpha for each topic:
alphas_topic = []
kappas_topc = []
pabaks_topic = []

for category in topics:
    prepared_data = prepare_data_for_krippendorff(annotation_records, category)
    measurement = 'ordinal' if category == "player.positive_negative" else 'nominal'
    alpha_value = calculate_krippendorff_alpha(prepared_data, measurement)
    alphas_topic.append(round(alpha_value, 3))

    fleiss_data = prepare_data_for_fleiss_kappa(annotation_records, category)
    if fleiss_data.shape[0] > 0:  # Ensure there's data to process
        kappa_value = fleiss_kappa(fleiss_data)
        kappas_topc.append(round(kappa_value, 3))
    else:
        kappas_topc.append(np.nan)  # Append NaN if no data is available

    pabak_value = calculate_pabak(prepared_data)
    pabaks_topic.append(round(pabak_value, 3))

# DataFrame with the results
irr_topics = pd.DataFrame({
    'Category': topics,
    'Krippendorff_alpha': alphas_topic,
    'Fleiss_kappa': kappas_topc,
    'PABAK': pabaks_topic
})

irr_topics

Unnamed: 0,Category,Krippendorff_alpha,Fleiss_kappa,PABAK
0,player.unemotional_emotional,0.402,0.4,0.534
1,player.topic_communication,0.747,0.748,0.848
2,player.topic_payment,0.636,0.635,0.943
3,player.topic_refund,0.648,0.649,0.934
4,player.topic_extras,0.642,0.645,0.962
5,player.topic_value,0.598,0.6,0.903
6,player.topic_shipping,0.824,0.824,0.824
7,player.topic_product,0.736,0.736,0.739
8,player.topic_feedback,0.586,0.585,0.949
9,player.topic_vendor,0.555,0.556,0.719


# 2 DESCRIPTIVE STATISTICS

## FULL FEEDBACK

**Share of feedback with textual data**

Explore whether the feedback is default text or have NaN **values**:


In [None]:
# Define default text lists
ab_default_text = ["No comment"]

# Count NaN values in 'comment'
ab_nan_count = ab_full_feedback['comment'].isna().sum()

# Apply default text conditions
ab_full_feedback['default_text'] = np.where(ab_full_feedback['comment'].isin(ab_default_text), True, False)
ab_default_com_count = ab_full_feedback['default_text'].sum()

# Calculate proportions of non-NaN and False default_text
ab_total_count = len(ab_full_feedback)

prop_valid_non_default_ab = round(((ab_full_feedback['comment'].notna()) & (ab_full_feedback['default_text'] == False)).sum() / ab_total_count * 100, 2)

print(f"""-----------------------
Number of NaN values in AlphaBay comments: {ab_nan_count}
Number of default comments in AlphaBay: {ab_default_com_count}
Proportion of valid non-default comments in AlphaBay: {prop_valid_non_default_ab}%
-----------------------""")

-----------------------
Number of NaN values in AlphaBay comments: 9
Number of default comments in AlphaBay: 439714
Proportion of valid non-default comments in AlphaBay: 81.38%
-----------------------


**Share of pos/neu/neg transactions**

In [None]:
# Calculate proportions of ratings for AlphaBay
prop_rating_ab = round(ab_full_feedback['rating'].value_counts(normalize=True) * 100, 2)

# Prepare the table with necessary row names
rownames_table = ["Positive", "Neutral", "Negative", "With text", "(N =)"]

# Create DataFrame for Negative, Neutral, Positive ratings
rows1_3 = pd.DataFrame({
    "AlphaBay": prop_rating_ab.loc[['Negative', 'Neutral', 'Positive']]
})

# Calculate proportion of feedbacks with text for AlphaBay
prop_valid_non_default_ab = ((ab_full_feedback['comment'].notna()) & (ab_full_feedback['default_text'] == False)).mean() * 100

# Create DataFrame for 'With text' row
row4 = pd.DataFrame({
    "AlphaBay": [prop_valid_non_default_ab]
}, index=["With text"])

# Calculate count of non-default feedbacks
count_row5 = pd.DataFrame({
    "AlphaBay": [((ab_full_feedback['comment'].notna()) & (ab_full_feedback['default_text'] == False)).sum()]
}, index=["(N =)"])

# Combine all rows to form the final table
table = pd.concat([rows1_3, row4, count_row5])

# Apply the percentage format to the first four rows
table.iloc[:4] = table.iloc[:4].applymap(lambda x: f"{x:.2f}%" if pd.notnull(x) else x)

# Format the fifth row with thousand separators
table.iloc[4] = table.iloc[4].apply(lambda x: f"{x:,.0f}".replace(",", " ") if pd.notnull(x) else x)

# Insert the row labels column at the beginning
table.insert(0, 'Feedbacks', rownames_table)

table

Unnamed: 0,Feedbacks,AlphaBay
Negative,Positive,2.56%
Neutral,Neutral,1.21%
Positive,Negative,96.23%
With text,With text,81.38%
(N =),(N =),1 921 591


## SELECTED FEEDBACK

Check if we have Null values in commnents:

In [None]:
# Count NaN values in 'comment'
ab_nan_c = ab_text_feedback['comment'].isna().sum()

print(f"""-----------------------
Number of NaN values in AlphaBay comments: {ab_nan_c}
-----------------------""")

-----------------------
Number of NaN values in AlphaBay comments: 382
-----------------------


In [None]:
ab_text_feedback = ab_text_feedback.loc[ab_text_feedback['comment'].notna()]

Determine observations containing symbols only:


In [None]:
# function that checks whether a comment contains letters
def symbols_only(comment):
    if isinstance(comment, str) and re.search("[A-Za-z]+", comment):
        return "no"
    else:
        return "yes"

ab_text_feedback['symbols_only'] = ab_text_feedback['comment'].apply(symbols_only)

Get the statistics:


Determin number of rows:

In [None]:
ab_nrows = ab_text_feedback.shape[0]

print(f"""-----------------------
Number of values in AlphaBay: {ab_nrows}
-----------------------""")

-----------------------
Number of values in AlphaBay: 1703887
-----------------------


In [None]:
# Filtering comments that contain letters
filtered_dataset = ab_text_feedback[ab_text_feedback['symbols_only'] == "no"].copy()

# Calculate the number of words in the comment
filtered_dataset['n_words'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r'\b\w+\b', x)))

# Number of characters with punctuation
filtered_dataset['char_punct'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r"[^ ]", x)))

# Number of characters without punctuation
filtered_dataset['char_no_punct'] = filtered_dataset['comment'].apply(lambda x: len(re.findall(r"[a-zA-Z0-9]", x)))

# Number of punctuation characters
filtered_dataset['punct'] = filtered_dataset['char_punct'] - filtered_dataset['char_no_punct']

ab_text_statistics = filtered_dataset[['global_id', 'comment', 'rating', 'n_words', 'char_punct', 'char_no_punct', 'punct']]
ab_text_statistics.head()

Unnamed: 0,global_id,comment,rating,n_words,char_punct,char_no_punct,punct
0,ao978412,Best vendor. Amazing Quality. Stealth on point...,Positive,10,58,51,7
1,ao977151,"Amazing product, awesome stealth; 5dd; 2 pills...",Positive,11,51,48,3
2,ao972046,Fast Shipping!!!,Positive,2,15,12,3
3,ao2200726,"ordered 5 carts, got only one full one all the...",Neutral,35,140,135,5
4,ao1004619,FE'd,Positive,2,4,3,1


In [None]:
def calculate_descriptives(dataframe, columns_to_exclude):
    columns = [col for col in dataframe.columns if col not in columns_to_exclude]
    data = dataframe[columns]
    results = {
        "Mean": data.mean(),
        "SD": data.std(),
        "Median": data.median(),
        "Min": data.min(),
        "Max": data.max()
    }
    desc_df = pd.DataFrame(results).T.round(2)
    desc_df.columns = columns
    return desc_df.T

columns_to_exclude = ['char_no_punct']

ab_descriptives = calculate_descriptives(ab_text_statistics.iloc[:, 3:7], columns_to_exclude)

ab_descriptives = ab_descriptives.reset_index()
ab_descriptives = ab_descriptives.rename(columns={'index': "AlphaBay (n = 1 703 887)"})

first_col = [
    "N words",
    "N char (no punct.)",
    "N punct. char",
]
ab_descriptives.iloc[:, 0] = first_col
ab_descriptives

Unnamed: 0,AlphaBay (n = 1 703 887),Mean,SD,Median,Min,Max
0,N words,9.97,9.27,7.0,1.0,533.0
1,N char (no punct.),47.19,40.91,36.0,1.0,1129.0
2,N punct. char,2.64,3.29,2.0,0.0,1056.0


# 3 Dataset for prediction

Get the dataset for the final prediction (exclude manually coded sample):

In [None]:
# Anti-join
ab_for_prediction = ab_text_feedback[~ab_text_feedback['global_id'].isin(manually_coded['global_id'])]

# # Tidy up the dates
ab_for_prediction.loc[:, 'date_left'] = pd.to_datetime(ab_for_prediction['date_left'].str.slice(stop=-6), format='%b %d, %Y', errors='coerce').dt.date

# Combine datasets
prediction_dataset = pd.concat([ab_for_prediction], ignore_index=True)

Save the file

In [None]:
prediction_dataset.to_csv(join(PROCESSED_DIR, "data_for_prediction.csv"), index=False)

# 4 Manual Coding with Majority Vote Dataset (topic, emotion)
Since I change categories in variable 'emotionality' I need to create this dataset again:

In [21]:
manually_coded.shape[1]

30

In [22]:
selected_col = ["participant.code",
                "participant.label",
                "player.question_id",
                "player.text",
                "player.global_id",
                'player.topic_communication',
                'player.topic_payment',
                'player.topic_refund',
                'player.topic_extras',
                'player.topic_value',
                'player.topic_shipping',
                'player.topic_product',
                'player.topic_feedback',
                'player.topic_vendor',
                'player.topic_generic',
                'player.topic_overall',
                'player.topic_other',
                'player.unemotional_emotional']

annotation_records_filter = annotation_records[selected_col]
annotation_records_filter = annotation_records_filter.sort_values(by=['player.global_id', 'participant.label'])

In [23]:
# Initialize variables
i = 1
j = 1
n_annot = 3
n_labels = 13

# Create an empty DataFrame with specified column names
colnames_majority_vote = ['comment', 'global_id', 't_communication', 't_payment', 't_refund',
       't_price', 't_value', 't_shipping', 't_product', 't_feedback',
       't_vendor', 't_generic', 't_overall', 't_other', 'emo_une']

majority_vote_output = pd.DataFrame(columns=colnames_majority_vote)

# Function to compute majority vote
def majority_vote(scores):
    return np.bincount(scores).argmax()

# Loop over the data in steps of n_annot
for i in range(0, len(annotation_records_filter), n_annot):
    get_text = annotation_records_filter.iloc[i, 3]
    get_global_id = annotation_records_filter.iloc[i, 4]
    labels = []

    # Compute the majority vote for each label
    for j in range(n_labels):
        get_label_scores = annotation_records_filter.iloc[i:i+n_annot, 5+j].astype(int)
        get_majority = majority_vote(get_label_scores)
        labels.append(get_majority)

    # Create a new row and append it to the DataFrame
    row = [get_text, get_global_id] + labels
    temp_df = pd.DataFrame([row], columns=colnames_majority_vote)
    majority_vote_output = pd.concat([majority_vote_output, temp_df], ignore_index=True)


In [24]:
majority_vote_output['emo_une'].value_counts()

emo_une
0    1529
1     471
Name: count, dtype: int64

Save file:

In [25]:
majority_vote_output.to_csv(join(PROJECT_DIR, "Annotation data/manual_coding_majority_vote_2000_(emotionality_recoding).csv"), index=False)