In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
aeps_clean = pd.read_csv('../data/aeps_cleansed_data.csv')

In [None]:
aeps_clean.head()

## getting things into correct dtype

In [None]:
# Subset of aeps_clean columns used in the rest of this notebook: identifying /
# demographic fields, the 17 item-level score columns (fm_*, cog_*, sc_*), and the
# per-domain raw / possible / percentage / goal / cutoff / result columns.
oc2_columns = [
    'Child ID', 'Program Name', 'AEPSi ID', 'DOB', 'Gender', 'Dev Status', 'AEPS Level',
    'Test Date', 'Examiner', 'Service Coordinator', 'TEIS Point of Entry Office (POE)',
    'ESL', 'County of Residence', 'Number of Items',
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
    'FM Raw Score', 'FM Possible Score', 'FM Percentage',
    'GM Raw Score', 'GM Possible Score', 'GM Percentage',
    'Adapt Raw Score', 'Adapt Possible Score', 'Adapt Percentage',
    'Cog Raw Score', 'Cog Possible Score', 'Cog Percentage',
    'SC Raw Score', 'SC Possible Score', 'SC Percentage',
    'Soc Raw Score', 'Soc Possible Score', 'Soc Percentage',
    'Overall Raw Score', 'Overall Possible Score', 'Overall Percentage',
    'FM Goal Score', 'FM Cutoff', 'FM Result',
    'GM Goal Score', 'GM Cutoff', 'GM Result',
    'Adapt Goal Score', 'Adapt Cutoff', 'Adapt Result',
    'Cog Goal Score', 'Cog Cutoff', 'Cog Result',
    'SC Goal Score', 'SC Cutoff', 'SC Result',
    'Soc Goal Score', 'Soc Cutoff', 'Soc Result',
]

# .copy() gives clean_oc2 its own data: later cells assign columns on it
# ('Test Date', 'Child ID'), and doing that on a bare slice of aeps_clean
# triggers pandas' SettingWithCopyWarning.
clean_oc2 = aeps_clean[oc2_columns].copy()

In [None]:
clean_oc2.head()

In [None]:
clean_oc2.shape

In [None]:
pd.set_option('display.max_rows', None)
clean_oc2.dtypes

In [None]:
clean_oc2['Test Date'] = pd.to_datetime(clean_oc2['Test Date'])

In [None]:
clean_oc2 = clean_oc2.dropna(subset=['Test Date'])

In [None]:
clean_oc2['Child ID'] = clean_oc2['Child ID'].astype(str).str.strip()

In [None]:
clean_oc2 = clean_oc2.rename(columns = {'Test Date': 'test_date'})

## remove child ids where child was in the program 183 days or less (about six months)

In [None]:
# Earliest ('min') and latest ('max') test_date recorded for each Child ID.
test_dates = (
    clean_oc2
    .groupby('Child ID')['test_date']
    .agg(['min', 'max'])
    .reset_index()
)
test_dates

In [None]:
# Attach each child's 'min' / 'max' test dates to every one of that child's rows.
clean_oc2 = test_dates.merge(clean_oc2, how = 'outer', on = 'Child ID')

In [None]:
clean_oc2.head()

In [None]:
clean_oc2.dtypes

## make column that is difference between first and last test date in days

In [None]:
clean_oc2['test_date_diff'] = clean_oc2['max'] - clean_oc2['min']

In [None]:
clean_oc2.head()

In [None]:
clean_oc2.test_date_diff.dtype

In [None]:
## Convert test_date_diff from a timedelta to a whole number of days (.dt.days),
## stored as int16 so it can be compared and binned as a plain integer below.
## NOTE(review): int16 tops out at 32767 - assumes no day count exceeds that; confirm.
## Original note: maybe duplicate notebook w/o this line to compare, because there is a big diff
clean_oc2['test_date_diff'] = clean_oc2['test_date_diff'].dt.days.astype('int16')

In [None]:
clean_oc2.test_date_diff.dtype

In [None]:
clean_oc2.head()

In [None]:
clean_oc2.test_date_diff.describe()

In [None]:
clean_oc2.test_date_diff.unique()

## only include rows with date diff over 183

In [None]:
# Keep only rows whose first-to-last test span is more than 183 days.
# Comment this cell out to look at all child ids no matter the length in program.
spans_over_183_days = clean_oc2['test_date_diff'] > 183
clean_oc2 = clean_oc2[spans_over_183_days]

In [None]:
clean_oc2.head()

In [None]:
clean_oc2.shape

In [None]:
clean_oc2.test_date_diff.min()

In [None]:
clean_oc2.test_date_diff.max()

In [None]:
clean_oc2.test_date_diff.unique()

## make column with time in program groups

In [None]:
# Bucket each row's days-in-program (test_date_diff) into labelled half-year bands.
#
# include_lowest=True makes the first band contain its left edge (184). pd.cut bins
# are right-closed by default, so without it the first band is (184, 366] and a child
# with exactly 184 days - which passes the `test_date_diff > 183` filter - would get
# NaN here and silently drop out of every groupby on 'years_in_program'.
#
# NOTE(review): the top edge (983) is hard-coded; any test_date_diff above it would
# also become NaN - confirm it still matches clean_oc2.test_date_diff.max().
years_in_program = pd.cut(
    clean_oc2.test_date_diff,
    bins = [184, 366, 549, 732, 983],
    labels = ['0.5-1', '1-1.5', '1.5-2', '2-2.7'],
    include_lowest = True,
)

# Place the new column at position 3 so it sits next to the min/max date columns.
# DataFrame.insert raises if the column already exists, so this cell runs once per kernel.
clean_oc2.insert(3, 'years_in_program', years_in_program)

In [None]:
clean_oc2.columns.unique()

In [None]:
clean_oc2.head()

## first and last dates list

In [None]:
first_test_list = clean_oc2.groupby('Child ID')['test_date'].min().reset_index()

In [None]:
first_test_list.shape

In [None]:
first_test_list.head()

In [None]:
# Keep only the rows of clean_oc2 that fall on each child's earliest test_date.
first_test_data = first_test_list.merge(
    clean_oc2,
    on = ['Child ID', 'test_date'],
    how = 'inner',
)

In [None]:
first_test_data.shape

In [None]:
first_test_data.head()

In [None]:
first_test_data.shape

In [None]:
last_test_list = clean_oc2.groupby('Child ID')['test_date'].max().reset_index()

In [None]:
last_test_list.head()

In [None]:
# Keep only the rows of clean_oc2 that fall on each child's latest test_date.
last_test_data = last_test_list.merge(
    clean_oc2,
    on = ['Child ID', 'test_date'],
    how = 'inner',
)

In [None]:
last_test_data.shape

## graph and table of first test scores

In [None]:
# For every item column, count how many first-test rows hold each score value.
# Result: one row per distinct score value, one column per item; reset_index()
# moves the score values out of the index into a column named 'index'.
first_item_columns = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]
first_item_counts = first_test_data[first_item_columns].apply(pd.Series.value_counts)
first_test_scores = first_item_counts.reset_index()

In [None]:
first_test_scores.head()

In [None]:
first_test_scores = first_test_scores.rename(columns = {'index': 'score'})

In [None]:
first_tests_melt = first_test_scores.melt('score')

In [None]:
first_tests_melt.shape

In [None]:
first_tests_melt.head()

## add domain column to melt df for legend of graph

In [None]:
# Lookup of item column -> domain name, used to colour / label the graphs.
# The domain is determined by the item's prefix (the text before the underscore).
_domain_by_prefix = {
    'fm': 'fine motor',
    'cog': 'cognitive',
    'sc': 'social communication',
}
_domain_items = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]
domain = {
    'item': _domain_items,
    'domain': [_domain_by_prefix[item_name.split('_')[0]] for item_name in _domain_items],
}

In [None]:
domain = pd.DataFrame(domain)

In [None]:
domain.head()

In [None]:
first_tests_melt = first_tests_melt.rename(columns = {'variable': 'item', 'value': 'count_first'})

In [None]:
first_tests_melt.head()

In [None]:
first_tests_melt = pd.merge(first_tests_melt, domain, on = 'item', how = 'outer')

In [None]:
first_tests_melt.head()

In [None]:
sns.barplot(data=first_tests_melt, x='score', y='count_first', hue = 'item', 
             palette = ['indianred', 'red', 'aquamarine', 'lightseagreen', 'mediumturquoise', 'lightcyan', 'paleturquoise', 'darkslategrey', 'teal', 'darkcyan', 'cyan', 'dodgerblue', 'forestgreen', 'darkgreen', 'limegreen', 'palegreen', 'olivedrab'])

plt.title('First Test Scores by Item')
plt.legend(bbox_to_anchor = (1,1))
plt.ylabel('Score Count');

In [None]:
title_font = {'family': 'serif', 'weight': 'bold', 'size': 20, 'color' : '#FF0000'}
axes_font = {'family': 'serif', 'weight': 'bold', 'size': 14, 'color' : '#FF0000'}

In [None]:
# i want the below legend with this graph
plt.figure(figsize = (10,6))
sns.barplot(data=first_tests_melt, x='score', y='count_first', hue = 'item', 
             palette =  ['#c9cdd6', '#c9cdd6', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#394768', '#394768', '#394768', '#394768', '#394768'],
                    edgecolor = 'black').set_title('First Test Scores by Item', fontdict= title_font)

plt.xlabel('Score', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.ylabel('Score Count', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.legend([],[], frameon=False)
plt.ylim(0,3500);

#plt.savefig('../data/first_scores_by_item.png')

In [None]:
plt.figure(figsize = (10,6))
sns.barplot(data=first_tests_melt, x='score', y='count_first', hue = 'domain', 
             palette =  ['#c9cdd6', '#818af9', '#394768'],
                    edgecolor = 'black').set_title('First Test Scores by Item', fontdict= title_font)

plt.xlabel('Score', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.ylabel('Score Count', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.legend(bbox_to_anchor = (1,1), title = 'Domain');

#plt.savefig('../data/use_legend_only.png')

In [None]:
sns.barplot(data=first_tests_melt, x='score', y='count_first', hue = 'item', 
             palette = ['indianred', 'red', 'aquamarine', 'lightseagreen', 'mediumturquoise', 'lightcyan', 'paleturquoise', 'darkslategrey', 'teal', 'darkcyan', 'cyan', 'dodgerblue', 'forestgreen', 'darkgreen', 'limegreen', 'palegreen', 'olivedrab'])

plt.title('First Test Scores by Item')
plt.legend(bbox_to_anchor = (1,1))
plt.ylabel('Score Count');

In [None]:
#first_test_scores with percentages
# For each item column add '<item>_percent': that score's count as a percentage of
# the item's total count (e.g. 'fm_B4.0' -> 'fm_b4_percent').
for item_col in ['fm_B4.0', 'fm_B5.0',
                 'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
                 'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
                 'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0']:
    # strip the trailing '.0' and lower-case to build the new column name
    percent_col = item_col[:-2].lower() + '_percent'
    first_test_scores[percent_col] = first_test_scores[item_col] / first_test_scores[item_col].sum() * 100

In [None]:
first_test_scores.head()

In [None]:
# For every item column, count how many last-test rows hold each score value.
# Result: one row per distinct score value, one column per item; reset_index()
# moves the score values out of the index into a column named 'index'.
last_item_columns = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]
last_item_counts = last_test_data[last_item_columns].apply(pd.Series.value_counts)
last_test_scores = last_item_counts.reset_index()

In [None]:
last_test_scores.head()

In [None]:
last_test_scores = last_test_scores.rename(columns = {'index': 'score'})

In [None]:
last_test_scores.head()

In [None]:
last_tests_melt = last_test_scores.melt('score')

In [None]:
last_tests_melt.head()

In [None]:
last_tests_melt = last_tests_melt.rename(columns = {'variable': 'item', 'value': 'count_last'})

In [None]:
sns.barplot(data=last_tests_melt, x='score', y='count_last', hue = 'item',
            palette = ['indianred', 'red', 'aquamarine', 'lightseagreen', 'mediumturquoise', 'lightcyan', 'paleturquoise', 'darkslategrey', 'teal', 'darkcyan', 'cyan', 'dodgerblue', 'forestgreen', 'darkgreen', 'limegreen', 'palegreen', 'olivedrab'])

plt.title('Last Test Scores by Item')
plt.legend(bbox_to_anchor = (1,1))
plt.ylabel('Score Count');

In [None]:
#use this format, make legend that says what each domain is(red, blue, grey)
plt.figure(figsize = (10,6))
sns.barplot(data=last_tests_melt, x='score', y='count_last', hue = 'item',
            palette = ['#c9cdd6', '#c9cdd6', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#818af9', '#394768', '#394768', '#394768', '#394768', '#394768'],
            edgecolor = 'black').set_title('Last Test Scores by Item', fontdict= title_font)

plt.legend([],[], frameon=False)
plt.xlabel('Score', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.ylabel('Score Count', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
plt.ylim(0,3500)
;

#plt.savefig('../data/last_scores_by_item.png')

In [None]:
#last_test_scores with percentages
# For each item column add '<item>_percent': that score's count as a percentage of
# the item's total count (e.g. 'fm_B4.0' -> 'fm_b4_percent').
for item_col in ['fm_B4.0', 'fm_B5.0',
                 'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
                 'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
                 'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0']:
    # strip the trailing '.0' and lower-case to build the new column name
    percent_col = item_col[:-2].lower() + '_percent'
    last_test_scores[percent_col] = last_test_scores[item_col] / last_test_scores[item_col].sum() * 100

In [None]:
last_test_scores

## look at change in scores from first test to last test

In [None]:
compare_first_last = pd.merge(first_tests_melt, last_tests_melt, on = ['score', 'item'], how = 'outer')

In [None]:
compare_first_last.head()

In [None]:
compare_first_last['last_minus_first'] = compare_first_last['count_last'] - compare_first_last['count_first']

In [None]:
compare_first_last.head()

## changes in scores by percent

In [None]:
#taking only the percentages of scores to make new melt
# Drop the raw per-item count columns; what remains is 'score' plus the *_percent columns.
first_count_columns = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]
first_percents = first_test_scores.drop(first_count_columns, axis = 'columns')

In [None]:
first_percents.head()

In [None]:
# Rename '<item>_percent' columns back to short item names, e.g. 'fm_b4_percent' -> 'fm_B4'.
first_percent_to_item = {
    f'{short_name.lower()}_percent': short_name
    for short_name in ['fm_B4', 'fm_B5',
                       'cog_D2', 'cog_E2', 'cog_E4', 'cog_F1', 'cog_G1',
                       'cog_G2', 'cog_G3', 'cog_G4', 'cog_G5', 'cog_G6',
                       'sc_B1', 'sc_B2', 'sc_D1', 'sc_D2', 'sc_D3']
}
first_percents = first_percents.rename(columns = first_percent_to_item)

In [None]:
first_percents.head(1)

In [None]:
# Drop the raw per-item count columns; what remains is 'score' plus the *_percent columns.
last_count_columns = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]
last_percents = last_test_scores.drop(last_count_columns, axis = 'columns')

In [None]:
last_percents.head()

In [None]:
# Rename '<item>_percent' columns back to short item names, e.g. 'fm_b4_percent' -> 'fm_B4'.
last_percent_to_item = {
    f'{short_name.lower()}_percent': short_name
    for short_name in ['fm_B4', 'fm_B5',
                       'cog_D2', 'cog_E2', 'cog_E4', 'cog_F1', 'cog_G1',
                       'cog_G2', 'cog_G3', 'cog_G4', 'cog_G5', 'cog_G6',
                       'sc_B1', 'sc_B2', 'sc_D1', 'sc_D2', 'sc_D3']
}
last_percents = last_percents.rename(columns = last_percent_to_item)

In [None]:
last_percents.head(1)

In [None]:
first_percents_melt = first_percents.melt('score')

In [None]:
first_percents_melt = first_percents_melt.rename(columns = {'variable': 'item', 'value': 'percent_first'})

In [None]:
last_percents_melt = last_percents.melt('score')

In [None]:
last_percents_melt.head()

In [None]:
last_percents_melt = last_percents_melt.rename(columns = {'variable': 'item', 'value': 'percent_last'})

In [None]:
compare_percents = pd.merge(first_percents_melt, last_percents_melt, on = ['score', 'item'], how = 'outer')

In [None]:
compare_percents.head()

In [None]:
compare_percents['last_minus_first'] = compare_percents['percent_last'] - compare_percents['percent_first']

In [None]:
compare_percents.head()

In [None]:
#figure out how to pull up title to not overlap subtitle
# One bar chart per score value showing, for each item, the change in the share of
# children with that score between first and last test (percent_last - percent_first).

# FacetGrid.map calls sns.barplot once per facet, and each call would otherwise infer
# its own x-axis categories from that facet's subset. Fixing `order` up front makes
# every facet draw the same items in the same positions (seaborn warns that omitting
# it "is likely to produce an incorrect plot").
percent_item_order = list(compare_percents['item'].dropna().unique())

percent_diff_grid = sns.FacetGrid(compare_percents, col = 'score',
                                  hue = 'score',
                                  palette = ['#818A9f','#5d6883', '#27365a'],
                                  height =6)
percent_diff_grid.map(sns.barplot, 'item', 'last_minus_first', order = percent_item_order)

percent_diff_grid.fig.suptitle('Difference in Scores From First to Last Test', y=1.05, color = '#FF0000', font='serif', fontsize='20', fontweight='bold')
percent_diff_grid.set_xlabels('Item', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
percent_diff_grid.set_ylabels('Percent Difference', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
percent_diff_grid.set_xticklabels(rotation=90);

#plt.savefig('../data/Difference_in_Scores_FirstvLast.png', bbox_inches='tight')

In [None]:
score0 = compare_percents.loc[compare_percents.score == 0]

In [None]:
plt.figure(figsize = (10,6))
sns.barplot(data = score0, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
plt.title('First Test to Last Test Score Difference by Percent for 0');

In [None]:
score1 = compare_percents.loc[compare_percents.score == 1]

In [None]:
plt.figure(figsize = (10,6))
sns.barplot(data = score1, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
plt.title('First Test to Last Test Score Difference by Percent for 1');

In [None]:
score2 = compare_percents.loc[compare_percents.score == 2]

In [None]:
plt.figure(figsize = (10,6))
sns.barplot(data = score2, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
plt.title('First Test to Last Test Score Difference by Percent for 2');

# no significant differences in results from cleaned and uncleaned data

# scores overview (TODO: these need to be updated now that child ids with 183 days or fewer in the program have been removed)
### First tests with highest percent of 0 scores - sc_d3, cog_g6, fm_b5, cog_g2, cog_g5
### First tests with highest percent of 2 scores - sc_b1, cog_e2, sc_b2, cog_e4, fm_b4

### Last tests with highest percent of 0 scores- fm_b5, cog_g2, sc_d3, cog_g6, cog_g5 (diff order but same as first tests)
### Last tests with highest percent of 2 scores- sc_b1, sc_b2, cog_e2, cog_e4, cog_d2 (cog_d2 here instead of fm_b4 in first tests)

## Largest decrease in 0 scores (most improvement from 0) when comparing difference between first and last-
### sc_d1, sc_b2, sc_b1, cog_e4, cog_d2
## Smallest decrease in 0 scores(least improvement) when comparing difference between first and last - 
### fm_b5, cog_g2, cog_g1, cog_g5, cog_g6 (most of these are also the lowest to begin with)
## Largest increase in 2 scores(most improvement to 2) when comparing difference between first and last - 
### sc_b2, sc_b1, cog_e2, cog_e4, cog_d2 (most improved also usually start at higher scores to begin with as well)

In [None]:
score0.nlargest(5, 'percent_first')

In [None]:
score2.nlargest(5, 'percent_first')

In [None]:
score0.nlargest(5, 'percent_last')

In [None]:
score2.nlargest(5, 'percent_last')

In [None]:
score0.nsmallest(5, 'last_minus_first')

In [None]:
score0.nlargest(5, 'last_minus_first')

In [None]:
score2.nlargest(5, 'last_minus_first')

In [None]:
score2.nsmallest(5, 'last_minus_first')

## all scores by time in program

In [None]:
pd.set_option('display.max_columns', None)
clean_oc2.head()

In [None]:
# Mean score of every item for each years_in_program band (all test rows).
# The columns are selected with a list ([[...]]): indexing a groupby with a bare
# tuple of names was deprecated and raises in pandas 2.0.
# observed=False keeps every years_in_program category in the result (the long-standing
# default for categorical keys) and silences the FutureWarning about that default.
time_in_program = clean_oc2.groupby('years_in_program', observed = False)[[
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]].mean()

In [None]:
time_in_program

### get avg scores by years in for first test, last test, find the difference

In [None]:
first_test_data.head()

In [None]:
# Mean first-test score of every item for each years_in_program band.
# The columns are selected with a list ([[...]]): indexing a groupby with a bare
# tuple of names was deprecated and raises in pandas 2.0.
# observed=False keeps every years_in_program category in the result (the long-standing
# default for categorical keys) and silences the FutureWarning about that default.
time_in_first_test = first_test_data.groupby('years_in_program', observed = False)[[
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]].mean().reset_index()

In [None]:
time_in_first_test.head()

In [None]:
# Strip the trailing '.0' from the item column names, e.g. 'fm_B4.0' -> 'fm_B4'.
first_item_short_names = {
    f'{short_name}.0': short_name
    for short_name in ['fm_B4', 'fm_B5',
                       'cog_D2', 'cog_E2', 'cog_E4', 'cog_F1', 'cog_G1',
                       'cog_G2', 'cog_G3', 'cog_G4', 'cog_G5', 'cog_G6',
                       'sc_B1', 'sc_B2', 'sc_D1', 'sc_D2', 'sc_D3']
}
time_in_first_test = time_in_first_test.rename(columns = first_item_short_names)

In [None]:
time_in_first_test.dtypes

In [None]:
last_test_data.head()

In [None]:
# Mean last-test score of every item for each years_in_program band.
# The columns are selected with a list ([[...]]): indexing a groupby with a bare
# tuple of names was deprecated and raises in pandas 2.0.
# observed=False keeps every years_in_program category in the result (the long-standing
# default for categorical keys) and silences the FutureWarning about that default.
time_in_last_test = last_test_data.groupby('years_in_program', observed = False)[[
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0', 'cog_G1.0',
    'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]].mean().reset_index()

In [None]:
time_in_last_test.head()

In [None]:
# Strip the trailing '.0' from the item column names, e.g. 'fm_B4.0' -> 'fm_B4'.
last_item_short_names = {
    f'{short_name}.0': short_name
    for short_name in ['fm_B4', 'fm_B5',
                       'cog_D2', 'cog_E2', 'cog_E4', 'cog_F1', 'cog_G1',
                       'cog_G2', 'cog_G3', 'cog_G4', 'cog_G5', 'cog_G6',
                       'sc_B1', 'sc_B2', 'sc_D1', 'sc_D2', 'sc_D3']
}
time_in_last_test = time_in_last_test.rename(columns = last_item_short_names)

In [None]:
time_in_first_melt = time_in_first_test.melt('years_in_program')

In [None]:
time_in_first_melt.head()

In [None]:
time_in_first_melt = time_in_first_melt.rename(columns = {'variable': 'item', 'value': 'first_avg_score'})

In [None]:
time_in_first_melt.head()

In [None]:
time_in_last_melt = time_in_last_test.melt('years_in_program')

In [None]:
time_in_last_melt.head()

In [None]:
time_in_last_melt = time_in_last_melt.rename(columns = {'variable': 'item', 'value': 'last_avg_score'})

In [None]:
time_in_last_melt.head()

In [None]:
compare_by_time_in = pd.merge(time_in_first_melt, time_in_last_melt, on = ['years_in_program', 'item'], how = 'outer')

In [None]:
compare_by_time_in.head()

In [None]:
compare_by_time_in['last_minus_first'] = compare_by_time_in['last_avg_score'] - compare_by_time_in['first_avg_score']

In [None]:
compare_by_time_in.head()

In [None]:
compare_by_time_in = compare_by_time_in.rename(columns = {'years_in_program': 'Years in Program'})

## as usual my FacetGrid is ugly, for now I will just break these out into individual graphs, but you can see that with more years in the program there is an increase in score improvement

In [None]:
# i dont know how to make this look right?
# 2-column grid: one bar chart per 'Years in Program' band showing, for each item,
# the change in average score from first to last test.

# FacetGrid.map calls sns.barplot once per facet, and each call would otherwise infer
# its own x-axis categories from that facet's subset. Fixing `order` up front makes
# every facet draw the same items in the same positions (seaborn warns that omitting
# it "is likely to produce an incorrect plot").
wrapped_item_order = list(compare_by_time_in['item'].dropna().unique())

diff_by_time_grid = sns.FacetGrid(compare_by_time_in, col = 'Years in Program',
                                  col_wrap = 2,
                                  hue = 'Years in Program',
                                  palette = ['#C9CDD6', '#818AF9','#394768', '#27365a'],
                                  height =6)
diff_by_time_grid.map(sns.barplot, 'item', 'last_minus_first', order = wrapped_item_order)

diff_by_time_grid.fig.suptitle('Average Score Improvements by Time Receiving Services', y=1.05, color = '#FF0000', font='serif', fontsize='20', fontweight='bold')
diff_by_time_grid.set_xlabels('Item', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
diff_by_time_grid.set_ylabels('Difference in Average Score', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
diff_by_time_grid.set_xticklabels(rotation=90);

#plt.savefig('../data/diff_btwn_years_in_program.png',  bbox_inches='tight')


In [None]:
# i dont know how to make this look right?
# Single-row grid: one bar chart per 'Years in Program' band showing, for each item,
# the change in average score from first to last test.

# FacetGrid.map calls sns.barplot once per facet, and each call would otherwise infer
# its own x-axis categories from that facet's subset. Fixing `order` up front makes
# every facet draw the same items in the same positions (seaborn warns that omitting
# it "is likely to produce an incorrect plot").
row_item_order = list(compare_by_time_in['item'].dropna().unique())

diff_by_time_grid = sns.FacetGrid(compare_by_time_in, col = 'Years in Program',
                                  hue = 'Years in Program',
                                  palette = ['#b7bcc8', '#818A9f','#5d6883', '#27365a'],
                                  height =6)
diff_by_time_grid.map(sns.barplot, 'item', 'last_minus_first', order = row_item_order)

diff_by_time_grid.fig.suptitle('Average Score Improvements by Time Receiving Services', y=1.05, color = '#FF0000', font='serif', fontsize='20', fontweight='bold')
diff_by_time_grid.set_xlabels('Item', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
diff_by_time_grid.set_ylabels('Difference in Average Score', color = '#FF0000', font='serif', fontsize='14', fontweight='bold')
diff_by_time_grid.set_xticklabels(rotation=90);

#plt.savefig('../data/diff_btwn_years_in_program.png',  bbox_inches='tight')

In [None]:
time_to_1 = compare_by_time_in.loc[compare_by_time_in['Years in Program'] == '0.5-1']

In [None]:
time_to_1.head()

In [None]:
time_to_1p5 = compare_by_time_in.loc[compare_by_time_in['Years in Program'] == '1-1.5']

In [None]:
time_to_1p5.head()

In [None]:
time_to_2 = compare_by_time_in.loc[compare_by_time_in['Years in Program'] == '1.5-2']

In [None]:
time_to_2.head()

In [None]:
time_to_2p7 = compare_by_time_in.loc[compare_by_time_in['Years in Program'] == '2-2.7']

In [None]:
time_to_2p7.head()

In [None]:
# Per-item change in average score (last test minus first test) for the 0.5-1 band.
plt.figure(figsize = (10,6))
sns.barplot(data = time_to_1, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
plt.title('Average Score Improvements 0.5-1 Year in Program')
plt.xlabel('Item')
# trailing ';' belongs on the last statement so the cell does not echo a Text repr
plt.ylabel('Average Difference In Score');

In [None]:
# Per-item change in average score (last test minus first test) for the 1-1.5 band.
plt.figure(figsize = (10,6))
sns.barplot(data = time_to_1p5, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
plt.title('Average Score Improvements 1-1.5 Years in Program')
plt.xlabel('Item')
# trailing ';' belongs on the last statement so the cell does not echo a Text repr
plt.ylabel('Average Difference In Score');

In [None]:
# Per-item change in average score (last test minus first test) for the 1.5-2 band.
plt.figure(figsize = (10,6))
sns.barplot(data = time_to_2, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
# title now reads "Years", matching the 1-1.5 Years chart
plt.title('Average Score Improvements 1.5-2 Years in Program')
plt.xlabel('Item')
# trailing ';' belongs on the last statement so the cell does not echo a Text repr
plt.ylabel('Average Difference In Score');

In [None]:
# Per-item change in average score (last test minus first test) for the 2-2.7 band.
plt.figure(figsize = (10,6))
sns.barplot(data = time_to_2p7, 
           x = 'item', 
           y = 'last_minus_first')

plt.xticks(rotation = 90)
# title now reads "Years", matching the 1-1.5 Years chart
plt.title('Average Score Improvements 2-2.7 Years in Program')
plt.xlabel('Item')
# trailing ';' belongs on the last statement so the cell does not echo a Text repr
plt.ylabel('Average Difference In Score');

## items with least improvement: fm_b5, cog_g2, cog_g1, cog_g4

In [None]:
time_to_1.nsmallest(5, 'last_minus_first')

In [None]:
time_to_1p5.nsmallest(5, 'last_minus_first')

In [None]:
time_to_2.nsmallest(5, 'last_minus_first')

In [None]:
time_to_2p7.nsmallest(5, 'last_minus_first')

## items with most improvement - sc_b2, sc_b1, sc_d1, cog_e2

In [None]:
time_to_1.nlargest(5, 'last_minus_first')

In [None]:
time_to_1p5.nlargest(5, 'last_minus_first')

In [None]:
time_to_2.nlargest(5, 'last_minus_first')

In [None]:
time_to_2p7.nlargest(5, 'last_minus_first')