# Import dependencies

In [76]:
import pathlib
import json, csv
from pprint import pprint
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import anova
from statsmodels.graphics.factorplots import interaction_plot

from scipy import stats

%matplotlib inline

In [10]:
col_types = np.empty((0,), dtype=[
    ('trial', np.uint8),
    ('critical_target', np.unicode), ('critical_target_syllable', np.unicode),
    ('distractor', np.unicode), ('distractor_syllable', np.unicode),
    ('distractor_label', np.unicode),
    ('phon_sim', np.bool_), ('orth_sim', np.bool_),
    ('phon_distance', np.float64), ('orth_similarity', np.float64),
    ('phon_distance_normalized', np.float64), ('orth_similarity_normalized', np.float64),
    ('phon_distance_z', np.float64), ('orth_similarity_z', np.float64)
])

df = pd.DataFrame(col_types)

print(df.dtypes)

trial                           uint8
critical_target                object
critical_target_syllable       object
distractor                     object
distractor_syllable            object
distractor_label               object
phon_sim                         bool
orth_sim                         bool
phon_distance                 float64
orth_similarity               float64
phon_distance_normalized      float64
orth_similarity_normalized    float64
phon_distance_z               float64
orth_similarity_z             float64
dtype: object


# Load syllables

Open the original raw, unfiltered dataset and extract the syllables that correspond with each character.

In [11]:
syllables = dict()

raw_file = pathlib.Path('stimuli/stimulus_set-01.csv')
with raw_file.open('r') as f:
    csvreader = csv.reader(f, delimiter='\t')
    
    for row in csvreader:
        if not row:
            continue
        
        if row[0] in syllables:
            continue
            
        syllables[row[0]] = row[1]
        
syllables_set = set(syllables.values())

print('Number unique characters:', len(syllables))
print('Number unique syllables:', len(syllables_set))

Number unique characters: 430
Number unique syllables: 258


# Load trials

Load trials from the actual trials.json file used in the experiment. Extract the stimulus sets and create a set of all characters used.

In [12]:
char_sets = []
all_chars = set()

row_count = 1

trials_file = pathlib.Path('stimuli/trials.json')
with trials_file.open('r') as f:
    trials = json.load(f)['sentences']
    
    for trial in trials:
        if 'original_distractors' in trial:
            key = 'original_distractors'
        else:
            key = 'distractors'
        
        # The actual character sets, indexed by the critical character
        values = trial[key]
        for label, character in values.items():
            phon_sim = np.bool(False)
            orth_sim = np.bool(False)
            
            if label == 'both_sim' or label == 'phon_sim':
                phon_sim = np.bool(True)
            if label == 'both_sim' or label == 'orth_sim':
                orth_sim = np.bool(True)
                
            
            df.loc[row_count] = [trial['sentence_number'], trial['critical_target'], "", character, "", label, phon_sim, orth_sim, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
            row_count += 1
            
        values['critical_target'] = trial['critical_target']
        char_sets.append(values)
        
        # A set containing all unique characters
        all_chars.update(trial[key].values())
        
    print('Number of stimulus sets loaded:    ', len(trials))
    print('Number of stimulus sets extracted: ', len(char_sets))
print('Number of uniqe characters:', len(all_chars))

Number of stimulus sets loaded:     96
Number of stimulus sets extracted:  96
Number of uniqe characters: 413


# Parse character comparisons

In [5]:
distances = dict()
max_orth_similarity = 0

distances_file = pathlib.Path('/Users/nick/github/orthophonology/data/char_comparisons.csv')
with distances_file.open('r') as f:
    csvreader = csv.reader(f)
    
    count = 0
    distances_count = 0
    for row in csvreader:
        if float(row[2]) > max_orth_similarity:
            max_orth_similarity = float(row[2])
        
        if row[0] in all_chars and row[1] in all_chars:
            chars = sorted(row[0:2])
            
            if chars[0] not in distances:
                distances[chars[0]] = {
                    chars[1]: row[2]
                }
                distances_count += 1
            else:
                distances[chars[0]][chars[1]] = row[2]
                distances_count += 1
                
        count += 1
        
    print('Rows parsed: ', count)
    print('Distances loaded: ', distances_count)

Rows parsed:  14669236
Distances loaded:  81810


# Load syllable distances

In [6]:
syllable_distances = dict()
max_phon_distance = 0

syllable_distances_file = pathlib.Path('stimuli/distances.csv')
with syllable_distances_file.open('r') as f:
    csvreader = csv.reader(f)
    
    count = 0
    distances_count = 0
    for row in csvreader:
        if float(row[2]) > max_phon_distance:
            max_phon_distance = float(row[2])
        syls = sorted(row[:2])
        
        if syls[0] in syllables_set and syls[1] in syllables_set:
            if syls[0] not in syllable_distances:
                syllable_distances[syls[0]] = {
                    syls[1]: row[2]
                }
                distances_count += 1
            else:
                syllable_distances[syls[0]][syls[1]] = row[2]
                distances_count += 1
        count += 1
    print('Rows parsed: ', count)
    print('Distances loaded: ', distances_count)

Rows parsed:  2011015
Distances loaded:  33411


# Join trials

TODO: Investigate why there are a handful of missing comparisons

In [13]:
for i in range(1, len(df)+1):
    df.loc[i, 'critical_target_syllable'] = syllables[df.loc[i, 'critical_target']]
    df.loc[i, 'distractor_syllable'] = syllables[df.loc[i, 'distractor']]
    
    chars = sorted([df.loc[i, 'critical_target'], df.loc[i, 'distractor']])
    syls = sorted([df.loc[i, 'critical_target_syllable'], df.loc[i, 'distractor_syllable']])
    
    if chars[0] not in distances or chars[1] not in distances[chars[0]]:
        df.loc[i, 'orth_similarity'] = np.nan
        continue
    
    df.loc[i, 'orth_similarity'] = np.float64(distances[chars[0]][chars[1]])
    df.loc[i, 'phon_distance'] = np.float64(syllable_distances[syls[0]][syls[1]])
    
df

Unnamed: 0,trial,critical_target,critical_target_syllable,distractor,distractor_syllable,distractor_label,phon_sim,orth_sim,phon_distance,orth_similarity,phon_distance_normalized,orth_similarity_normalized,phon_distance_z,orth_similarity_z
1,1,柱,zhu4,炷,zhu4,both_sim,True,True,0.000000,0.833333,,,,
2,1,柱,zhu4,枉,wang3,orth_sim,False,True,1.593100,0.804444,,,,
3,1,柱,zhu4,淹,yan1,both_dif,False,False,1.634290,0.069589,,,,
4,1,柱,zhu4,述,shu4,phon_sim,True,False,0.044500,0.001111,,,,
5,2,文,wen2,汶,wen4,both_sim,True,True,0.150000,0.800000,,,,
6,2,文,wen2,这,zhe4,orth_sim,False,True,1.434195,0.800000,,,,
7,2,文,wen2,钙,gai4,both_dif,False,False,1.591810,0.037037,,,,
8,2,文,wen2,烷,wan2,phon_sim,True,False,0.129500,0.034632,,,,
9,3,殃,yang1,秧,yang1,both_sim,True,True,0.000000,0.716667,,,,
10,3,殃,yang1,映,ying4,orth_sim,False,True,0.585855,0.833333,,,,


## Normalize similarity and distancevalues

In [14]:
df['orth_similarity_normalized'] = df['orth_similarity'] / max_orth_similarity
df['phon_distance_normalized'] = df['phon_distance'] / max_phon_distance

# Compute z-scores

In [21]:
df['orth_similarity_z'] = (df['orth_similarity_normalized']-df['orth_similarity_normalized'].mean())/df['orth_similarity_normalized'].std()
df['phon_distance_z'] = (df['phon_distance_normalized']-df['phon_distance_normalized'].mean())/df['phon_distance_normalized'].std()

df

Unnamed: 0,trial,critical_target,critical_target_syllable,distractor,distractor_syllable,distractor_label,phon_sim,orth_sim,phon_distance,orth_similarity,phon_distance_normalized,orth_similarity_normalized,phon_distance_z,orth_similarity_z
1,1,柱,zhu4,炷,zhu4,both_sim,True,True,0.000000,0.833333,0.000000,0.882353,-1.062884,1.476122
2,1,柱,zhu4,枉,wang3,orth_sim,False,True,1.593100,0.804444,0.764350,0.851765,1.077205,1.390421
3,1,柱,zhu4,淹,yan1,both_dif,False,False,1.634290,0.069589,0.784112,0.073683,1.132537,-0.789602
4,1,柱,zhu4,述,shu4,phon_sim,True,False,0.044500,0.001111,0.021351,0.001176,-1.003105,-0.992748
5,2,文,wen2,汶,wen4,both_sim,True,True,0.150000,0.800000,0.071968,0.847059,-0.861381,1.377236
6,2,文,wen2,这,zhe4,orth_sim,False,True,1.434195,0.800000,0.688109,0.847059,0.863740,1.377236
7,2,文,wen2,钙,gai4,both_dif,False,False,1.591810,0.037037,0.763731,0.039216,1.075472,-0.886171
8,2,文,wen2,烷,wan2,phon_sim,True,False,0.129500,0.034632,0.062133,0.036669,-0.888920,-0.893305
9,3,殃,yang1,秧,yang1,both_sim,True,True,0.000000,0.716667,0.000000,0.758824,-1.062884,1.130019
10,3,殃,yang1,映,ying4,orth_sim,False,True,0.585855,0.833333,0.281086,0.882353,-0.275876,1.476122


# Compute aggregate score

In [53]:
for phon_sim in {True, False}:
    for orth_sim in {True, False}:
        orth_multiplier =  1 if orth_sim else -1
        phon_multiplier = -1 if phon_sim else 1
        
        df.loc[(df2['phon_sim'] == phon_sim) & (df['orth_sim'] == orth_sim), 'agg_score'] = \
            ((phon_multiplier * df['phon_distance_z']) + ( orth_multiplier * df['orth_similarity_z'])) / 2

In [106]:
phon_ttest = stats.ttest_ind(
    df.loc[(df['phon_sim'] == False) & np.isfinite(df['phon_distance_z'])]['phon_distance_z'],
    df.loc[(df['phon_sim'] == True) & np.isfinite(df['phon_distance_z'])]['phon_distance_z']
)

phon_df = len(df.loc[np.isfinite(df['phon_distance_z'])]['phon_distance_z'])

phon_sim_mean = df.loc[(df['phon_sim'] == True) & np.isfinite(df['phon_distance_z'])]['phon_distance_z'].mean()
phon_dif_mean = df.loc[(df['phon_sim'] == False) & np.isfinite(df['phon_distance_z'])]['phon_distance_z'].mean()

phon_sim_std = df.loc[(df['phon_sim'] == True) & np.isfinite(df['phon_distance_z'])]['phon_distance_z'].std()
phon_dif_std = df.loc[(df['phon_sim'] == False) & np.isfinite(df['phon_distance_z'])]['phon_distance_z'].std()
    
print(
f'There was a significant difference in the phonological distance from the critical target '
f'for phonologically similar (M={phon_sim_mean:.3}, SD={phon_sim_std:.2}) '
f'and phonologically dissimilar (M={phon_dif_mean:.3}, SD={phon_dif_std:.2}) '
f'distractors; t({phon_df})={phon_ttest.statistic:.3}, p<{phon_ttest.pvalue:.3f}.'
)

There was a significant difference in the phonological distance from the critical target for phonologically similar (M=-0.937, SD=0.13) and phonologically dissimilar (M=0.917, SD=0.51) distractors; t(370)=47.7, p<0.000.


In [107]:
orth_ttest = stats.ttest_ind(
    df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'],
    df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z']
)

orth_df = len(df.loc[np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'])

orth_sim_mean = df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'].mean()
orth_dif_mean = df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'].mean()

orth_sim_std = df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'].std()
orth_dif_std = df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_z'])]['orth_similarity_z'].std()

print(
f'There was a significant difference in the orthographic distance from the critical target '
f'for orthographically similar (M={orth_sim_mean:.3}, SD={orth_sim_std:.2}) '
f'and orthographically dissimilar (M={orth_dif_mean:.3}, SD={orth_dif_std:.2}) '
f'distractors; t({orth_df})={orth_ttest.statistic:.3}, p<{orth_ttest.pvalue:.3f}.'
)

There was a significant difference in the orthographic distance from the critical target for orthographically similar (M=0.855, SD=0.71) and orthographically dissimilar (M=-0.845, SD=0.23) distractors; t(370)=-31.1, p<0.000.


In [105]:
orth_ttest = stats.ttest_ind(
    df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'],
    df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized']
)

orth_df = len(df.loc[np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'])

orth_sim_mean = df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'].mean()
orth_dif_mean = df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'].mean()

orth_sim_std = df.loc[(df['orth_sim'] == True) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'].std()
orth_dif_std = df.loc[(df['orth_sim'] == False) & np.isfinite(df['orth_similarity_normalized'])]['orth_similarity_normalized'].std()

print(
f'There was a significant difference in the orthographic distance from the critical target '
f'for orthographically similar (M={orth_sim_mean:.3}, SD={orth_sim_std:.3}) '
f'and orthographically dissimilar (M={orth_dif_mean:.3}, SD={orth_dif_std:.3}) '
f'distractors; t({orth_df})={orth_ttest.statistic:.3}, p<{orth_ttest.pvalue:.3f}.'
)

There was a significant difference in the orthographic distance from the critical target for orthographically similar (M=0.661, SD=0.253) and orthographically dissimilar (M=0.0538, SD=0.0816) distractors; t(370)=-31.1, p<0.000.


# Load SUBTLEX

In [154]:
subtlex_chr_file = pathlib.Path('/Users/nick/github/orthophonology/data/subtlex-ch/SUBTLEX-CH-CHR.csv')
with subtlex_chr_file.open('r') as f:
    subtlex_total_chr = next(f).split(': ')[1].replace(',', '').strip()
#     print(subtlex_total_chr)
    subtlex_total_chr_contexts = next(f).split(': ')[1].replace(',', '').strip()
#     print(subtlex_total_chr_contexts)
    
    subtlex = pd.read_csv(f, sep='\t')
subtlex.columns = ['character', 'count', 'per_million', 'log_count', 'context_count', 'context_percent', 'log_context_count']
subtlex

Unnamed: 0,character,count,per_million,log_count,context_count,context_percent,log_context_count
0,我,2058980,43956.70,6.3137,6242,99.98,3.7953
1,的,1762079,37618.23,6.2460,6243,100.00,3.7954
2,你,1444831,30845.37,6.1598,6242,99.98,3.7953
3,是,1172098,25022.86,6.0690,6243,100.00,3.7954
4,了,993990,21220.47,5.9974,6243,100.00,3.7954
5,不,904675,19313.70,5.9565,6243,100.00,3.7954
6,们,701754,14981.59,5.8462,6243,100.00,3.7954
7,这,633842,13531.75,5.8020,6243,100.00,3.7954
8,一,604056,12895.86,5.7811,6240,99.95,3.7952
9,他,564477,12050.89,5.7516,6242,99.98,3.7953


# Add frequency data to DataFrame

In [157]:
for i in range(1, len(df)+1):
    if len(subtlex.loc[subtlex['character'] == df.loc[i, 'distractor']]['log_count']):
            df.loc[i, 'subtlex_log_count'] = np.float(subtlex.loc[subtlex['character'] == df.loc[i, 'distractor']]['log_count'])
            df.loc[i, 'subtlex_per_million'] = np.float(subtlex.loc[subtlex['character'] == df.loc[i, 'distractor']]['per_million'])
    else:
        df.loc[i, 'subtlex_log_count'] = np.float(0)
        df.loc[i, 'subtlex_per_million'] = np.float(0)
        
df['subtlex_log_per_million'] = df['subtlex_per_million'].apply(np.log10)
            
df.groupby('distractor_label').describe()[['subtlex_log_count', 'subtlex_per_million', 'subtlex_log_per_million']]

Unnamed: 0_level_0,Unnamed: 1_level_0,subtlex_log_count,subtlex_per_million,subtlex_log_per_million
distractor_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
both_dif,count,96.0,96.0,96.0
both_dif,mean,2.329747,103.276979,0.656941
both_dif,std,1.220735,340.217519,1.224251
both_dif,min,0.301,0.04,-1.39794
both_dif,25%,1.434875,0.5825,-0.235318
both_dif,50%,2.2135,3.49,0.542824
both_dif,75%,3.2254,35.87,1.554727
both_dif,max,5.0228,2250.2,3.352221
both_sim,count,96.0,96.0,96.0
both_sim,mean,2.199019,51.817917,0.527132


In [159]:
for i in range(1, len(df)+1):
    df.loc[i, 'critical_subtlex_log_count'] = np.float(subtlex.loc[subtlex['character'] == df.loc[i, 'critical_target']]['log_count'])
    df.loc[i, 'critical_subtlex_per_million'] = np.float(subtlex.loc[subtlex['character'] == df.loc[i, 'critical_target']]['per_million'])

df['critical_subtlex_log_per_million'] = df['critical_subtlex_per_million'].apply(np.log10)
df.loc[df['distractor_label'] == 'both_sim'].describe()[['critical_subtlex_log_count', 'critical_subtlex_per_million', 'critical_subtlex_log_per_million']]



Unnamed: 0,critical_subtlex_log_count,critical_subtlex_per_million,critical_subtlex_log_per_million
count,96.0,96.0,96.0
mean,3.075847,133.221875,1.405275
std,0.856883,275.378292,0.856707
min,0.8451,0.15,-0.823909
25%,2.4829,6.49,0.812245
50%,2.94645,18.87,1.275758
75%,3.690025,104.5925,2.019365
max,4.8435,1488.97,3.172886


In [161]:
df.groupby(['phon_sim', 'orth_sim']).describe()[['orth_similarity', 'phon_distance', 'orth_similarity_z', 'phon_distance_z']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,orth_similarity,phon_distance,orth_similarity_z,phon_distance_z
phon_sim,orth_sim,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,False,count,94.0,94.0,94.0,94.0
False,False,mean,0.047558,1.633147,-0.85496,1.131001
False,False,std,0.051278,0.130766,0.152122,0.175664
False,False,min,0.0,1.228185,-0.996045,0.586997
False,False,25%,0.018689,1.559906,-0.940602,1.032614
False,False,50%,0.037037,1.63461,-0.886171,1.132967
False,False,75%,0.059062,1.7133,-0.82083,1.238675
False,False,max,0.287139,1.889805,-0.144219,1.475783
False,True,count,93.0,93.0,93.0,93.0
False,True,mean,0.614272,1.312163,0.826254,0.699808
