# Group level stimulus effect in triangle model

## Purpose:
- Examine the group-level stimulus effects during the pretraining (oral) phase and the reading phase in the triangle model
- With reference to naming data in ELP 

## Methods:
- Run the full-training-set evaluation as a batch run
- Extract item level eval data
- Subset (or not)
    - Full training set
    - Subset to Strain items (n=160)
- Run LME
    - DV: ACC, RT, cond_RT
    - Non-interaction model: freq + OP surprisal + Length + … (remaining terms not listed here)

### Replicate OSC table 3 with Train set and Strain set

In [None]:
%load_ext lab_black
import pandas as pd
import numpy as np
import os

In [None]:
# # Script to parse new ELP pull on March 12, 2021
# df_elp = pd.DataFrame()

# for i in range(18):
#     file = os.path.join("data/elp_210312_naming/", f"namingData_{i+1}.csv")
#     data = pd.read_csv(file, thousands=",")
#     min_id = data.Sub_ID.min()
#     max_id = data.Sub_ID.max()
#     print(f"{i+1}: min ID = {min_id}, max ID = {max_id}")

#     df_elp = pd.concat([df_elp, data])

# df_elp.to_csv("data/ELP_naming_210321.csv.gz", compression="gzip")

### Import newly pulled ELP naming data 

In [None]:
# Load the trial-level ELP naming data and reduce it to the columns used by
# the LME analysis: sub_id, word, acc, rt, log_rt.
df_elp = pd.read_csv("data/ELP_naming_210321.csv.gz")

# Remove trials flagged as outliers.
# `.copy()` gives an independent frame, so the column assignments below do not
# operate on a slice (avoids SettingWithCopyWarning).
df_elp = df_elp.loc[df_elp.Outlier == False].copy()

# Simplified accuracy: 1 when D_Accuracy == 1, 0 for every other code.
df_elp["acc"] = (df_elp.D_Accuracy == 1).astype(int)

# Conditional RT: D_RT on correct trials, NaN otherwise.
# Vectorised with `where` instead of a row-wise apply; this also avoids the
# `np.NaN` alias, which was removed in NumPy 2.0.
df_elp["rt"] = df_elp.D_RT.where(df_elp.D_Accuracy == 1)

# Rename useful columns
df_elp = df_elp.rename(columns={"Sub_ID": "sub_id", "D_Word": "word"})

# Keep only the analysis columns (copied so that adding log_rt is safe).
df_elp = df_elp[["sub_id", "word", "acc", "rt"]].copy()
df_elp["log_rt"] = np.log10(df_elp.rt)
df_elp.to_csv("data/df_elp.csv.gz", compression="gzip")

### Item properties

In [None]:
# Build the item-level property table by joining the training-set word list
# with the OSC and OP-surprisal files from OSF.

# Training items: keep only the `word`, `wf` and `img` columns.
df_train = pd.read_csv("../../dataset/df_train.csv")[["word", "wf", "img"]]

# OSC file, source: https://osf.io/d72q4/?view_only=5f5f49f4cda14c3cacea81d295903aac
# Long column headers are shortened to the names used downstream.
df_osc = pd.read_csv("data/osc.psc.OSF.csv").rename(
    columns={
        "D_word": "word",
        "O-S consistency": "os",
        "P-S consistency": "ps",
        "orth neighborhood size": "ons",
        "phon neighborhood size": "pns",
    }
)

# OP surprisal file, source: https://osf.io/w8gjq/download
df_op = pd.read_csv("data/supplementary_material.csv").rename(
    columns={
        "uncond.surprisal": "op_uncond",
        "coda.cond.surprisal": "op_coda",
        "onset.cond.surprisal": "op_onset",
    }
)

# Left-join both property tables onto the training items, so every training
# word is kept even when a property file has no entry for it.
df = df_train.merge(
    df_osc[["word", "os", "ps", "ons", "pns"]], on="word", how="left"
).merge(df_op[["word", "op_uncond", "op_coda", "op_onset"]], on="word", how="left")

# Derived predictors: log10(x + 1) transforms and word length in characters.
df = df.assign(
    log_wf=lambda d: np.log10(d["wf"] + 1),
    log_ons=lambda d: np.log10(d["ons"] + 1),
    log_pns=lambda d: np.log10(d["pns"] + 1),
    word_length=lambda d: d["word"].str.len(),
)

df.to_csv("data/item_properties.csv")

### Merge lme data files and export

In [None]:
# Train set: keep the ELP trials whose word appears in the item-property
# table, then attach the item properties to each trial.
in_train = df_elp["word"].isin(df["word"])
df1 = df_elp[in_train].merge(df, on="word", how="left")
df1.to_csv("data/lme_df_train.csv.gz", compression="gzip")

In [None]:
# Strain set: restrict the train-set LME data to words listed in df_strain.csv.
strain = pd.read_csv("../../dataset/df_strain.csv")
df2 = df1.loc[df1.word.isin(strain.word)]

# Report how many distinct Strain words remain after the filter.
# A bare expression is only displayed when it is the last statement of a cell,
# so the count is printed explicitly.
print(f"Unique Strain words in df2: {len(df2.word.unique())}")

df2.to_csv("data/lme_df_strain.csv.gz", compression="gzip")