In [8]:
import ast

import pandas as pd

In [25]:
# Load the two input tables: the comma-separated responses file and the
# tab-separated coded-headlines file (read_table defaults to a tab separator).
responses = pd.read_csv("../data/responses_full.csv")
codes = pd.read_table("../data/coded_headlines.tsv")

In [26]:
def _parse_literal(value):
    """Turn a string holding a Python literal into the object it describes.

    Only ``str`` values are handed to ``ast.literal_eval``; anything else is
    returned unchanged. That makes this cell safe to re-run: on a second run
    the columns already hold parsed objects, which ``literal_eval`` would
    otherwise reject with ``ValueError``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The 'history' and 'choices' columns are read from CSV as text; convert them
# into the Python objects they encode.
responses['history'] = responses['history'].apply(_parse_literal)
responses['choices'] = responses['choices'].apply(_parse_literal)
# Keep only the first row seen for each user_id.
responses = responses.drop_duplicates(subset=["user_id"])

In [28]:
def cleanup_item(item):
    """Strip markdown markup from a headline string.

    Removes every ``"#### "`` heading prefix and every ``"**"`` bold marker,
    then turns each newline into ``". "``. The replacements are applied in
    that order and the resulting string is returned.
    """
    for old, new in (("#### ", ""), ("**", ""), ("\n", ". ")):
        item = item.replace(old, new)
    return item

In [43]:
def expand_choices(row):
    """Collect the generated headlines offered in one response row.

    ``row["choices"]`` is a mapping whose values each hold an ``"options"``
    mapping (headline text -> source label) and a ``"selected"`` headline.
    ``row["history"]`` is only measured with ``len``.

    Returns a list with one dict per option whose source label equals
    "generated" (case-insensitively), holding:
      - "headline": the option text passed through ``cleanup_item``
      - "was_selected": whether the raw option text equals the selected one
      - "num_selected": ``len(row["history"])``
    """
    all_choices = row["choices"]
    history_length = len(row["history"])
    records = []
    for choice in all_choices.values():
        shown, picked = choice["options"], choice["selected"]
        records.extend(
            {
                "headline": cleanup_item(text),
                # Compared on the raw text, before any markup clean-up.
                "was_selected": text == picked,
                "num_selected": history_length,
            }
            for text, origin in shown.items()
            if origin.lower() == "generated"
        )
    return records


In [44]:
# Run expand_choices on every row (axis=1) and keep each row's result in a new
# "gen_headlines" column.
responses["gen_headlines"] = responses.apply(expand_choices, axis=1)

In [46]:
# Flatten the per-row lists in "gen_headlines" into a single table with one
# row per generated-headline record.
gen_headlines = pd.DataFrame(
    [
        record
        for row_records in responses["gen_headlines"]
        for record in row_records
    ]
)

In [47]:
# Attach the columns of `codes` to each generated headline by matching the
# "headline" text against the "Headline" column of `codes`. The left join
# keeps headlines that have no match.
gen_headlines_with_codes = pd.merge(
    gen_headlines,
    codes,
    how='left',
    left_on='headline',
    right_on='Headline',
)

In [51]:
def euclidian_distance(row, base_coords=(1, 2)):
    """Return the Euclidean distance between a row's item codes and a reference point.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            numeric "Item number 1" and "Item number 2" entries.
        base_coords: ``(x, y)`` reference point. Defaults to ``(1, 2)``, the
            value that used to be hard-coded, so existing calls such as
            ``df.apply(euclidian_distance, axis=1)`` behave as before.

    Returns:
        The straight-line distance between ``base_coords`` and
        ``(row["Item number 1"], row["Item number 2"])``. A NaN in either
        entry propagates to a NaN result.
    """
    base_x, base_y = base_coords
    delta_x = base_x - row["Item number 1"]
    delta_y = base_y - row["Item number 2"]
    return (delta_x ** 2 + delta_y ** 2) ** 0.5

In [52]:
# Score every merged row with euclidian_distance (row-wise, axis=1) and store
# the result in a new "distance" column.
# NOTE(review): rows left unmatched by the left merge presumably carry NaN in
# "Item number 1"/"Item number 2" and so get a NaN distance — confirm.
gen_headlines_with_codes["distance"] = gen_headlines_with_codes.apply(euclidian_distance, axis=1)

In [55]:
# Pairwise Spearman rank correlations between whether a headline was selected,
# the row's num_selected value, and the computed distance.
correlations = gen_headlines_with_codes[
    ['was_selected', 'num_selected', 'distance']
].corr(method='spearman')
print(correlations)

              was_selected  num_selected  distance
was_selected      1.000000     -0.015658  0.044761
num_selected     -0.015658      1.000000 -0.229848
distance          0.044761     -0.229848  1.000000
