## Job Crafting: Evaluating Intrusion Task

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# output directory; the figure cells below save their images under this prefix
PATH = 'intrusion/figures/'

In [None]:
# create the figure directory if it is missing; exist_ok=True replaces the
# separate existence check (no check-then-create race, safe to re-run)
os.makedirs(PATH, exist_ok=True)

### Evaluate survey results (Prolific, n = 30)

In [None]:
# load the three cleaned survey exports (one CSV file per batch)
batch_files = (
    "intrusion/data/tarea-de-intrusion-1-clean.csv",
    "intrusion/data/tarea-de-intrusion-2-clean.csv",
    "intrusion/data/tarea-de-intrusion-3-clean.csv",
)
data1, data2, data3 = map(pd.read_csv, batch_files)

In [None]:
# plot survey duration in minutes
fig, ax = plt.subplots(figsize=(8, 4))

# pool the Time column of all three batches
time = pd.concat([data1.Time, data2.Time, data3.Time])

# Time is divided by 60 to obtain minutes
# (presumably it is recorded in seconds -- TODO confirm against the export)
duration_min = [x/60 for x in time]

# 2.5-minute wide bins covering 0 to 60 minutes
ax.hist(duration_min,
        bins=np.arange(0, 61, 2.5),
        color='cornflowerblue')

ax.set_xticks(np.arange(0, 61, 5))
ax.set_yticks(np.arange(0, 8, 1))

# hide the frame around the plot
for pos in ('top', 'right', 'bottom', 'left'):
    ax.spines[pos].set_visible(False)

ax.set_facecolor('ghostwhite')
ax.grid(axis='y', color='lightgrey', linewidth=1)
ax.set_axisbelow(True)  # draw the grid lines behind the bars
ax.set_xlabel('Duration (in minutes)', fontsize=12)
# label typo fixed: 'paticipants' -> 'participants'
ax.set_ylabel('Number of participants', fontsize=12)

plt.savefig(PATH + 'response_times.jpeg', dpi=300)

In [None]:
# calculate median survey duration and the 25th/75th percentiles (in minutes)
# convert once instead of rebuilding the list for every statistic
duration_min = [x/60 for x in time]

median_dur = np.median(duration_min)
p25_dur, p75_dur = np.percentile(duration_min, [25, 75])

# backward-compatible alias: this name has always held the median, not the mean
mean_dur = median_dur

print(f"Median duration: {median_dur:.2f} min")
print(f"25th percentile: {p25_dur:.2f} min")
print(f"75th percentile: {p75_dur:.2f} min")

In [None]:
# check attention checks
print("Attention checks")

# accepted answers per attention-check column; a participant fails a check
# when the given answer is not one of the three listed options
accepted_answers = {
    "A1": ("Las rosas son rojas", "El agua está caliente", "Los pájaros nadan"),
    "A2": ("Tengo una naranja", "Los trenes son rápidos", "llevo un reloj en la muñeca"),
    "A3": ("Los gatos maúllan", "Voy a cocinar arroz", "Los niños juegan en el parque"),
    "A4": ("Hoy es martes", "La luna brilla", "Las abejas hacen queso"),
    "A5": ("Voy al cine", "Hace frío", "Los árboles son pequeños"),
    "A6": ("Me gusta leer revistas", "Estoy escribiendo un poema", "Hoy es jueves"),
}

# one entry per participant: the number of failed attention checks
check_list = []
for batch in (data1, data2, data3):
    for _, answers in batch.iterrows():
        n_failed = sum(answers[column] not in accepted
                       for column, accepted in accepted_answers.items())
        check_list.append(n_failed)

# tally how many participants failed 0, 1, ..., 6 checks
for n_failed in range(7):
    print(f"Participants failing {n_failed} attention checks: {check_list.count(n_failed)}")

In [None]:
# inspect feedback
print("Feedback:")

# pool the feedback column of all batches and drop missing entries
feedback = pd.concat([data1.Feedback, data2.Feedback, data3.Feedback]).dropna()

if not feedback.empty:
    for comment in feedback:
        print(comment)
else:
    # nobody left a comment
    print("N/A")

In [None]:
# get gender distribution of sample
print("Gender distribution")

# Gender column of every participant, pooled over the three batches
gender = pd.concat([batch.Gender for batch in (data1, data2, data3)])

def count_gender(code='Mujer'):
    """Count how often `code` occurs in the pooled gender answers.

    Reads the module-level `gender` collection.

    Args:
        code: Answer option to count, spelled exactly as stored in the data.

    Returns:
        Tuple ``(count, percentage)``; the percentage is the share of all
        answers in percent, rounded to two decimals. Returns ``(0, 0.0)``
        when there are no answers at all.
    """
    total = len(gender)
    if total == 0:
        # an empty sample would otherwise raise ZeroDivisionError
        return (0, 0.0)
    # count once and reuse the value for both tuple entries
    count = list(gender).count(code)
    return (count, round((count / total) * 100, 2))

# printed label -> answer option as it is stored in the survey data
gender_options = (
    ("Woman", 'Mujer'),
    ("Man", 'Hombre'),
    ("Non-binary", 'No binario'),
    ("Self-describe", 'Prefiero autodescribirme (ver abajo)'),
    ("Prefer not to say", 'Prefiero no decirlo'),
)

# report absolute count and percentage for every option
for label, code in gender_options:
    n, pct = count_gender(code)
    print(f"{label}: n = {n}, {pct} %")

In [None]:
# get age distribution of sample
print("Age distribution")

# pool the Age column of all three batches
age = pd.concat([data1.Age, data2.Age, data3.Age])

# convert once so that all four statistics are computed on the same integer
# values (the mean previously skipped the int() conversion used by the others)
ages = [int(x) for x in age]

print(f"Average age: {round(np.mean(ages), 2)}")
print(f"STD: {round(np.std(ages), 2)}")
print(f"Minimum: {np.min(ages)}")
print(f"Maximum: {np.max(ages)}")

### Analyze intrusion results

In [None]:
# create dataframe with results
# number of item columns taken from each batch: 52, 52 and 50
batch_items = ((data1, 52), (data2, 52), (data3, 50))

# place the item columns of the three batches side by side
results = pd.concat(
    [batch[[str(item) for item in range(1, n_items + 1)]]
     for batch, n_items in batch_items],
    axis=1,
)

# renumber the columns consecutively across batches: "1" .. "154"
results.columns = [str(item) for item in range(1, 155)]

print(f"Sanity check // Amount of answers per item: {len(results)}")

In [None]:
# read 'right' answers to survey questions; the column names found in the
# file are replaced by fixed names used in the cells below
solution = pd.read_csv('intrusion/survey_solution.txt').set_axis(
    ['Item', 'Cluster', 'Solution'], axis=1)
solution.head()

In [None]:
# construct dataframe with 0 for false and 1 for right answers
array = np.zeros(results.shape)

answer_options = (0, 1, 2, 3)
for col in range(results.shape[1]):
    # score table for this item: every option scores 0 except the right one
    score = dict.fromkeys(answer_options, 0)
    score[solution.Solution.iloc[col]] = 1
    # result columns are labelled "1", "2", ... while `col` starts at 0
    array[:, col] = [score[answer] for answer in results[f'{col+1}']]

# rows follow `results`; columns are labelled with the item numbers 1..154
survey_yn = pd.DataFrame(array.astype(int), columns=list(range(1, 155)))
survey_yn.head()

In [None]:
# calculate fraction of found intruders per item
n_answers = len(survey_yn)
frac = [survey_yn[item].sum() / n_answers for item in range(1, 155)]

# per cluster id (0 .. highest id): [sum of item fractions, number of items]
sum_dict = {cluster: [0, 0] for cluster in range(max(solution.Cluster) + 1)}

# the i-th row of `solution` is matched with the i-th entry of `frac`
for idx, cluster in enumerate(solution.Cluster):
    totals = sum_dict[cluster]
    totals[0] += frac[idx]
    totals[1] += 1

In [None]:
# calculate final results: average fraction of found intruders per cluster
# (sum of the item fractions divided by the number of items in the cluster)
print('INTRUSION SURVEY RESULTS')
print('Cluster, Fraction')
for cluster, (found_sum, n_items) in sum_dict.items():
    print(f'{cluster}, {(found_sum / n_items):.2f}')

# store results in file, using the same layout as the printed table
with open('intrusion_survey_results.txt', 'w') as file:
    file.write('Cluster, Fraction\n')
    for cluster, (found_sum, n_items) in sum_dict.items():
        file.write(f'{cluster}, {(found_sum / n_items):.2f}\n')

In [None]:
# plot fraction of found intruders per cluster (bar plot)
frac_overall = [found_sum / n_items for found_sum, n_items in sum_dict.values()]
n_clusters = len(frac_overall)

fig, ax = plt.subplots(figsize=(18, 6))
ax.set_facecolor('ghostwhite')

# hide the frame around the plot
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

x_pos = np.arange(n_clusters)
# one colour per bar; overwrite single entries (e.g. indices 0, 8, 22 with
# 'coral') to highlight individual clusters
bar_colors = ['cornflowerblue'] * n_clusters

ax.bar(x_pos, frac_overall, align='center', color=bar_colors)
# dashed horizontal reference line at 0.67
ax.axhline(y=0.67, color='coral', linestyle='--')
ax.grid(axis='y', color='lightgrey', linewidth=1)
ax.set_axisbelow(True)
# bars sit at positions 0..n-1 but are labelled 1..n
ax.set_xticks(x_pos, labels=np.arange(1, n_clusters + 1))
ax.set_xlabel('Cluster number', fontsize=14)
ax.set_ylabel('Fraction of found intruders', fontsize=14)

plt.savefig(PATH + 'fraction_intruders_per_cluster.jpeg', dpi=300)

In [None]:
# visualize distribution of fraction of found intruders in clusters
survey_analysis = pd.read_csv('intrusion_survey_results.txt')
# the header written to this file is 'Cluster, Fraction', so the second
# column name is read with a leading space; assign clean names instead
survey_analysis.columns = ['Cluster', 'Fraction']

# bins of width 0.05 spanning 0 to 1
fraction_bins = np.arange(0, 1.01, 0.05)

plt.figure(figsize=(10,6))
plt.hist(survey_analysis.Fraction, bins=fraction_bins, rwidth=0.9, color='teal')
plt.title('Frequency of fractions of found intruders in clusters')
plt.xlabel('Average fraction of intruders found')
plt.ylabel('Frequency')
plt.grid(axis='y', color='lightgrey', linewidth=0.5)

plt.savefig(PATH + 'fraction_found_intruders_hist.jpeg', dpi=300)

### Setting the cut-off score

In [None]:
# cut-off for keeping a cluster: the 2/3 majority, written as 0.67
CUTOFF = 0.67

# derive the cluster total from the data instead of hard-coding 40
n_clusters = len(survey_analysis)
n_discarded = sum(1 for fraction in survey_analysis.Fraction if fraction < CUTOFF)
print(f"With a 2/3 majority vote, we discard {n_discarded}/{n_clusters} clusters.")

print("Remaining clusters:")
for cluster, fraction in zip(survey_analysis.Cluster, survey_analysis.Fraction):
    if fraction >= CUTOFF:
        # stored cluster ids start at 0; print them 1-based
        print(cluster + 1)
# reminder that the printed numbers are the stored ids shifted by one
print("Beware: Python 0=1, 1=2, ...")