In [None]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned evaluations, keeping 'Child ID' as text, and index the rows by 'Index'.
data = (
    pd.read_csv('../data/all_evals_clean.csv', dtype = {'Child ID': object})
      .set_index('Index')
)

# Preview the first 20 rows.
data.head(20)

In [None]:
# Summary statistics from DataFrame.describe(), kept in a variable and displayed.
data_desc = data.describe()
data_desc

### We need to make the date columns datetime64 instead of objects

In [None]:
# Collect, in column order, every column whose name contains 'Date of'.
pattern = r'Date of'
date_cols = [col for col in data.columns if re.search(pattern, col)]

In [None]:
# How many date columns were found.
len(date_cols)

In [None]:
# Parse every 'Date of ...' column with pd.to_datetime, then list all column
# dtypes so the conversion can be checked by eye.
data[date_cols] = data[date_cols].apply(pd.to_datetime)
list(data.dtypes)

### Let's define a slice function to make sub-tables based on domain

In [None]:
def slice_data(data, domain):
    """Return the sub-table of ``data`` whose column names match ``domain``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to select columns from.
    domain : str
        Regular expression searched for anywhere in each column name.

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``data``, in their original order.
    """
    matcher = re.compile(domain)
    matching_cols = [name for name in data.columns if matcher.search(name)]
    return data[matching_cols]

In [None]:
# Domain keywords; each one is passed to slice_data to pick columns by name.
domains = ['Adaptive', 'Social', 'Communication', 'Motor', 'Cognitive', 'Total']

In [None]:
# Map each domain keyword to the sub-table of columns whose names match it.
df_dict = {domain: slice_data(data, domain) for domain in domains}

In [None]:
# Short aliases for the per-domain sub-tables.
adapt, soc, comm, motor, cogn, total = (
    df_dict[key]
    for key in ('Adaptive', 'Social', 'Communication', 'Motor', 'Cognitive', 'Total')
)

In [None]:
# Summary statistics for the Adaptive sub-table.
adapt.describe()

In [None]:
# Display the Adaptive sub-table.
adapt

# 4. Does the child's age seem to impact their scoring?

We need to determine how to get the child's age at the time of testing. To do this, we need to subtract their birthday from the date of testing. However, this can be tricky because some testing was done on multiple days.

For now, let's just get a simple case to work (that is, not worry about the different test dates just yet).

### First, let's get everyone's ages

In [None]:
# Age of every child, in days, at the time of testing.
#
# The preferred testing date is the Adaptive-Self Care one. When it is missing,
# the first available date among the fallback columns (the same candidates the
# row-by-row version scanned: date_cols[2:14]) is used instead.
#
# This replaces a row-by-row try/except loop that had three problems:
#   * after a successful fallback it appended the age AND a NaN, so ages_list
#     could end up longer than `data`;
#   * the fallback only fired for `str` values, which cannot occur once the
#     date columns have been converted to datetimes;
#   * the bare `except:` hid any genuine error (e.g. a misspelled column).
PRIMARY_TEST_DATE = 'Adaptive-Self Care Date of Testing'
BIRTH_DATE = 'Date of Birth'

fallback_cols = [
    col for col in date_cols[2:14]
    if col not in (PRIMARY_TEST_DATE, BIRTH_DATE)
]

# Start from the preferred date and fill the gaps from each fallback in turn.
testing_date = data[PRIMARY_TEST_DATE]
for col in fallback_cols:
    testing_date = testing_date.fillna(data[col])

# Missing birthdays or testing dates propagate as NaN ages.
age_in_days = (testing_date - data[BIRTH_DATE]).dt.days

# Exactly one entry per row of `data`, in row order.
ages_list = age_in_days.tolist()

In [None]:
# Turn the list of ages into a NumPy array so it can be used in vectorised arithmetic.
ages = np.asarray(ages_list)
ages

In [None]:
# Divide by 365 to express the ages in years, rounded to one decimal place.
ages_year = np.round(ages/365,1)
ages_year

In [None]:
# Attach the ages as a new 'Age' column; ages_year must have one entry per row of data.
data['Age'] = ages_year
data

### Next, let's get some stats in there!

In [None]:
# Number of rows for each distinct Age value.
data.value_counts('Age')

In [None]:
# Mean of every numeric column for each distinct Age value (the index of age_desc is Age).
age_desc = data.groupby('Age').mean(numeric_only = True)
age_desc

In [None]:
# Custom colour palette: five hex colours.
colors = ["#C37681", "#5BAC82", "#838385","#89C1DF","#EAE086"]

# Make it the default seaborn palette for the plots that follow.
sns.set_palette(sns.color_palette(colors))

### Sum of Scaled Scores

In [None]:
sns.set_style('whitegrid')

plt.figure(figsize=(20,12))

# One dashed line per domain: the mean 'Sum of Scaled Scores' at each age.
for domain_label in ['Adaptive', 'Social-Emotional', 'Communication', 'Motor', 'Cognitive']:
    sns.lineplot(x = age_desc.index, y = f'{domain_label} Sum of Scaled Scores', data = age_desc,
                 linestyle = 'dashed', label = domain_label);

plt.ylabel('Average Sum of Scaled Scores', size=20)
plt.xlabel('Age',size=20)
plt.title('Average Sum of Scaled Scores by Age', size=25)

plt.legend(fontsize = 15)

plt.show()

### Percentile Rank

In [None]:
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(20,12))

# One solid line per domain: the mean 'Percentile Rank' at each age.
# The 'BDI-3 Total Percentile Rank' line is deliberately left out of this figure.
for domain_label in ['Adaptive', 'Social-Emotional', 'Communication', 'Motor', 'Cognitive']:
    sns.lineplot(x = age_desc.index, y = f'{domain_label} Percentile Rank', data = age_desc,
                 linestyle = 'solid', linewidth = 5, label = domain_label, ax = ax);

ax.set_ylabel('Percentile Ranks', size=20)
ax.set_xlabel('Age (Years)', size=20)
ax.set_title('Percentile Ranks by Age', size=25)
ax.tick_params(axis = 'both', labelsize = 20)

ax.legend(fontsize = 25)

# Save before showing so the written file contains the finished figure.
fig.savefig('../images/pr_by_age_all')

plt.show();

Between the ages of 1 and 4, Adaptive and Cognitive seem to be highly correlated. Let's look into that.

In [None]:
# Columns of the per-age means whose names contain 'Percentile Rank'.
pattern = r'Percentile Rank'
per_cols = [col for col in age_desc.columns if re.search(pattern, col)]

# Pairwise correlation between those columns, computed across ages.
age_corr = age_desc[per_cols].corr()
age_corr

It seems that those two aren't correlated that much. The highest correlations are between the Total Percentile Rank and the other columns (which makes sense because the Total depends on those other columns), except for Motor. Motor Percentile Rank appears to be the least correlated with the Total. I'm not sure why, but it could be something to look into.

### Let's make a heatmap!

In [None]:
sns.set_style('white')

fig, ax = plt.subplots(figsize=(6, 6))

# Diverging colormap so positive and negative correlations get different hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# would only repeat the lower triangle.
mask = np.triu(np.ones_like(age_corr, dtype=bool))

# Short tick labels derived from the actual columns, so they always match the
# column order. (Assigning to plt.xlabels / plt.ylabels only creates attributes
# on the pyplot module and does not relabel anything.)
tick_labels = [col.replace(' Percentile Rank', '') for col in age_corr.columns]

# Draw the heatmap with the mask and a square aspect ratio.
sns.heatmap(age_corr, cmap=cmap, mask=mask, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

ax.set_title('Domain PR Correlation Heatmap');

Now let's investigate separate domains

In [None]:
df = df_dict['Adaptive']

# Sub-domain percentile-rank columns are the ones whose names contain 'PR'.
pattern = r'PR'
pr_cols = [col for col in df.columns if re.search(pattern, col)]

# .assign builds a new frame with the extra 'Age' column (aligned on the index),
# instead of writing through .loc into a column subset of another frame, which
# triggers pandas' chained-assignment warning.
domain_pr = df[pr_cols].assign(Age=data['Age'])

domain_pr

In [None]:
# Rows that have a Personal Responsibility percentile rank AND an age below 2.
has_pr = domain_pr['Adaptive-Personal Responsibility PR'].notna()
under_two = domain_pr['Age'] < 2
domain_pr.loc[has_pr & under_two]

### There is not a single child less than 2 years old who took the Personal Responsibility Test. This is evident from the fact that there is no Percentile Rank for this age group.

Let's see what other domains are like

In [None]:
def age_sub_domain(domain, frames=None, ages=None):
    """Mean sub-domain percentile rank ('PR' columns) for each age.

    Parameters
    ----------
    domain : str
        Key of the sub-table to summarise.
    frames : mapping of str -> pandas.DataFrame, optional
        Per-domain sub-tables. Defaults to the notebook-level ``df_dict``.
    ages : pandas.Series, optional
        Age of each row, aligned on the sub-table's index. Defaults to the
        notebook-level ``data['Age']``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct age (the index) and one column per 'PR' column of
        the sub-table, holding the mean of that column at that age.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    frames = df_dict if frames is None else frames
    ages = data['Age'] if ages is None else ages

    domain_df = frames[domain]
    pr_cols = [col for col in domain_df.columns if re.search(r'PR', col)]

    # .assign returns a new frame, so the stored sub-table is never modified and
    # no chained-assignment warning is raised.
    domain_pr = domain_df[pr_cols].assign(Age=ages)

    return domain_pr.groupby('Age').mean()

In [None]:
# Mean Adaptive sub-domain percentile ranks for each age.
age_sub_domain('Adaptive')

### Again, we can see that anyone under 2 does not take the Personal Responsibility Test.

In [None]:
# Per-age mean sub-domain percentile ranks, keyed by domain.
age_pr_subs = {domain: age_sub_domain(domain) for domain in domains}

In [None]:
# Check which domains were summarised.
age_pr_subs.keys()

In [None]:
# Short aliases for the per-age summaries of the five individual domains.
adapt_age, social_age, comm_age, motor_age, cogn_age = (
    age_pr_subs[key]
    for key in ('Adaptive', 'Social', 'Communication', 'Motor', 'Cognitive')
)

In [None]:
# Per-age mean percentile ranks for the Social sub-domains.
social_age

In [None]:
# Per-age mean percentile ranks for the Communication sub-domains.
comm_age

In [None]:
# Per-age mean percentile ranks for the Motor sub-domains.
motor_age

In [None]:
# Per-age mean percentile ranks for the Cognitive sub-domains.
cogn_age

## Note: Communication is the only domain where *all* ages are tested in all sub-domains.

This explains why the Sum of Scaled Scores jumps up for all domains at Age 2 *except* communication. 

In [None]:
# Line colours for each domain's sub-domain plots; the i-th colour is used for
# the i-th sub-domain column of that domain.
domain_colors = {
    'Adaptive':      ['#D59FA6', '#9C5E67'],
    'Social':        ['#9CCDB4', '#5bac82', '#3F785B'],
    'Communication': ['#B4B4B5', '#4E4E4F'],
    'Motor':         ['#C4DFEE', '#89c1df', '#527385'],
    'Cognitive':     ['#EAE086', '#BBB36B', '#8C8650'],
}

In [None]:
# Number of rows whose Age is exactly 4.0.
data.value_counts('Age').loc[4.0]

In [None]:
# Age grid from 0.0 to 4.0 in steps of 0.1 (41 points).
np.arange(41)/10

In [None]:
# Prototype of the band drawn around a per-age mean, for one sub-domain.
pr_col = 'Adaptive-Self Care PR'

# Mean percentile rank at each observed age.
mean_by_age = data.groupby('Age')[pr_col].mean()

# Take x from the ages actually present rather than a hard-coded 0.0-4.0 grid,
# so x and y always have the same length.
x = mean_by_age.index.to_numpy()
y = mean_by_age.to_numpy()

# Half-width of the band: 1 / (number of children at that age), in age order.
# Working on the counts Series directly avoids depending on the column name
# value_counts() produces, which differs between pandas versions.
diff = (1 / data['Age'].value_counts().sort_index()).to_numpy()

y1 = y - diff
y2 = y + diff

In [None]:
# Number of children at each age, in age order. Computed once: it is the same
# for every domain and every column.
age_counts = data['Age'].value_counts().sort_index()

# Fill colours for the bands, one per sub-domain line.
grays = ['#d3d3d3', '#e0e0e0', '#ededed']

sns.set_style('whitegrid')

for domain in domains:

    # 'Total' has no sub-domain colours in domain_colors, so it is not plotted.
    if domain == 'Total':
        continue

    age_df = age_pr_subs[domain]
    line_colors = domain_colors[domain]

    # x positions are the ages actually present in the summary, so x always has
    # the same length as the per-age means (a fixed 0.0-4.0 grid need not).
    ages_axis = age_df.index.to_numpy()

    # Band half-width: 50 / (children at that age), so sparsely sampled ages get
    # a wider band. NOTE(review): this is a visual heuristic, not a standard error.
    half_width = (50 / age_counts).reindex(age_df.index).to_numpy()

    fig, ax = plt.subplots(figsize=(20,12))

    for i, col in enumerate(age_df.columns):

        # Cycle through the colour lists so an extra column cannot raise IndexError.
        sns.lineplot(x = age_df.index, y = col, data = age_df, linestyle = 'solid', linewidth = 5,
                     color = line_colors[i % len(line_colors)], label = col.split('-')[1], ax = ax);

        mean_pr = age_df[col].to_numpy()
        ax.fill_between(ages_axis, mean_pr - half_width, mean_pr + half_width,
                        color = grays[i % len(grays)])

    ax.set_ylabel('Sub-Domain PR', size=20)
    ax.set_xlabel('Age (Years)', size=20)
    ax.tick_params(axis = 'both', labelsize = 20)

    # The 'Social' key stands for the Social-Emotional domain.
    if domain != 'Social':
        ax.set_title(f'{domain} Percentile Ranks by Age', size=25)
    else:
        ax.set_title('Social-Emotional Percentile Ranks by Age', size=25)

    ax.legend(fontsize = 20)

    fig.savefig(f'../images/pr_by_age_{domain}')

    plt.show();
    # Release the figure so one figure per domain does not pile up in memory.
    plt.close(fig)
    print('\n\n')

### I'm now taking a detour to Question 1 so that we can get a visualization for it.

In [None]:
# Every column of the full table whose name contains 'PR'.
pattern = r'PR'
pr_cols = [col for col in data.columns if re.search(pattern, col)]

pr_data = data[pr_cols]

In [None]:
# Display the table of 'PR' columns.
pr_data

In [None]:
# One box plot per 'PR' column, each on its own axes.
# The grid is sized from the number of columns, and every plot is given its
# axes explicitly; without ax=... seaborn draws every box on the current
# (last-created) axes and leaves the rest of the grid empty.
n_grid_cols = 3
n_plots = len(pr_data.columns)
n_grid_rows = max(1, (n_plots + n_grid_cols - 1) // n_grid_cols)

fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(4 * n_grid_cols, 4 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, pr_data.columns):
    # The domain is the first word before the first '-' in the column name.
    domain = col.split('-')[0].split(' ')[0]

    # Second shade of the domain's colours; neutral gray when the domain has none.
    shades = domain_colors.get(domain)
    color = shades[1] if shades else '#d3d3d3'

    sns.boxplot(y = pr_data[col], color = color, ax = ax)
    ax.set_title(col)
    ax.set_ylabel('Percentile Rank')

# Hide any unused cells of the grid.
for ax in flat_axes[n_plots:]:
    ax.set_visible(False)

fig.tight_layout();

### Never mind. Let's try something else...

In [None]:
# Display the Communication sub-table.
comm

In [None]:
# Communication columns holding a percentile rank: names containing either
# 'PR' or 'Percentile Rank'.
pattern  = r'PR'
pattern2 = r'Percentile Rank'

pr_cols = [
    col for col in comm.columns
    if any(re.search(p, col) for p in (pattern, pattern2))
]

pr_comm = comm[pr_cols]
pr_comm

In [None]:
# Summary statistics for the Communication percentile-rank columns.
pr_comm.describe()

In [None]:
# Box plot of every Communication percentile-rank column, in shades of gray.
grays = ['#d3d3d3', '#e0e0e0', '#ededed']

fig, ax = plt.subplots(1, figsize=(20,12))

sns.boxplot(data=pr_comm, palette = grays, ax = ax)
ax.tick_params(axis = 'both', labelsize = 17)
ax.set_ylabel('Percentile Rank', size = 20);

fig.savefig('../images/communication_pr.png')