In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

**Data has been partially cleaned in Excel:**
- Removed blank 'Code X' columns
- Combined testing date columns into one, as all testing occurs at the same time.
- Removed 3 rows with no scoring data (2 are likely duplicates anyway)

In [2]:
# Load the partially cleaned evaluation workbook. read_excel returns a
# DataFrame for the default (first) sheet, so no extra wrapping is needed.
evals_raw = pd.read_excel('../data/bdi3_all_evals_clean.xlsx')
evals_raw.head()

Unnamed: 0,Child ID,Gender,Date of Birth,Location - Sub Level 1,Program Label,Date of Testing,Adaptive Sum of Scaled Scores,Adaptive Developmental Quotient,Adaptive Percentile Rank,Adaptive 95% Confidence Interval,...,Cognitive-Perception and Concepts RS,Cognitive-Perception and Concepts SS,Cognitive-Perception and Concepts PR,Cognitive-Perception and Concepts AE,Cognitive-Perception and Concepts RDI,Cognitive-Perception and Concepts CSS,Cognitive-Perception and Concepts CSS 90%,Cognitive-Perception and Concepts Z-Score,Cognitive-Perception and Concepts T-Score,Cognitive-Perception and Concepts NCE
0,44879,F,2022-09-27,Southwest,BDI-3 Eligibility Evaluation,2023-06-23,7.0,85.0,16,77-97,...,9.0,13.0,84,12,98/90,411.0,392 - 431,1,60.0,71
1,47795,F,2020-12-23,Greater Nashville,BDI-3 Eligibility Evaluation,2023-03-21,16.0,88.0,21,82-96,...,11.0,7.0,16,18,29/90,437.0,421 - 453,– 1.00,40.0,29
2,54340,M,2021-06-22,First Tennessee,BDI-3 Eligibility Evaluation,2023-04-06,8.0,90.0,25,81-101,...,10.0,7.0,16,15,37/90,426.0,408 - 444,– 1.00,40.0,29
3,54344,F,2022-03-07,Southeast Tennessee,BDI-3 Eligibility Evaluation,2023-02-23,13.0,115.0,84,103-123,...,9.0,10.0,50,12,92/90,411.0,392 - 431,0,50.0,50
4,54515,M,2021-11-04,Memphis Delta,BDI-3 Eligibility Evaluation,2023-02-24,5.0,75.0,5,68-88,...,10.0,10.0,50,15,88/90,426.0,408 - 444,0,50.0,50


In [8]:
# Flag repeat rows: a row is marked True when an earlier row already has the
# same 'Child ID' and 'Date of Testing' (the first occurrence stays False).
dedup_keys = ['Child ID', 'Date of Testing']
evals_raw['is_duplicate'] = evals_raw.duplicated(subset=dedup_keys)

# Working frame: only the unflagged rows, without the helper flag column.
evals = evals_raw.loc[~evals_raw['is_duplicate']].drop(columns='is_duplicate')

# Structure summary (printed), then summary statistics as the cell's output.
evals.info()
evals.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 19435 entries, 0 to 23943
Columns: 197 entries, Child ID to Cognitive-Perception and Concepts NCE
dtypes: datetime64[ns](2), float64(69), object(126)
memory usage: 29.4+ MB


Unnamed: 0,Date of Birth,Date of Testing,Adaptive Sum of Scaled Scores,Adaptive Developmental Quotient,Adaptive T-Score,Social-Emotional Sum of Scaled Scores,Social-Emotional Developmental Quotient,Social-Emotional T-Score,Communication Sum of Scaled Scores,Communication Developmental Quotient,...,Cognitive-Attention and Memory CSS,Cognitive-Attention and Memory T-Score,Cognitive-Reasoning / Academic Skills RS,Cognitive-Reasoning / Academic Skills SS,Cognitive-Reasoning / Academic Skills CSS,Cognitive-Reasoning / Academic Skills T-Score,Cognitive-Perception and Concepts RS,Cognitive-Perception and Concepts SS,Cognitive-Perception and Concepts CSS,Cognitive-Perception and Concepts T-Score
count,19435,19435,19430.0,19430.0,19430.0,19420.0,19420.0,19420.0,19410.0,19410.0,...,19423.0,19423.0,9671.0,9671.0,9671.0,9671.0,19422.0,19422.0,19422.0,19422.0
mean,2021-08-25 17:37:18.538718976,2023-06-28 20:48:37.046565376,11.413639,87.425888,41.618682,22.381874,94.463131,46.309784,13.197888,79.593663,...,461.912269,44.704577,8.94406,8.098749,483.236067,43.631579,10.342344,7.785347,424.2301,42.620379
min,2019-10-15 00:00:00,2022-12-08 00:00:00,1.0,46.0,14.0,2.0,46.0,14.0,2.0,46.0,...,366.0,20.0,0.0,1.0,442.0,20.0,0.0,1.0,283.0,20.0
25%,2020-12-11 00:00:00,2023-04-21 00:00:00,7.0,76.0,34.0,17.0,84.0,39.0,8.0,64.0,...,452.0,40.0,6.0,6.0,475.0,37.0,8.0,6.0,395.0,37.0
50%,2021-07-21 00:00:00,2023-07-03 00:00:00,11.0,88.0,42.0,22.0,96.0,47.0,13.0,79.0,...,472.0,47.0,9.0,8.0,485.0,43.0,10.0,8.0,426.0,43.0
75%,2022-04-12 00:00:00,2023-09-08 12:00:00,15.0,100.0,50.0,28.0,106.0,54.0,18.0,94.0,...,482.0,50.0,12.0,11.0,493.0,53.0,12.0,10.0,451.0,50.0
max,2023-10-30 00:00:00,2023-11-15 00:00:00,36.0,148.0,82.0,53.0,154.0,86.0,38.0,154.0,...,544.0,80.0,30.0,19.0,543.0,80.0,48.0,19.0,554.0,80.0
std,,,5.821864,17.435036,11.610373,7.519883,16.702164,11.137009,6.723639,20.170917,...,32.561691,9.053539,4.207699,3.204358,14.153327,10.659079,4.227655,3.261733,43.760026,10.882605


There are 71 of 197 columns formatted in a way that allows for calculations.<br>Not all columns of scoring data are cleanly formatted; the remaining columns need to be converted from type 'object' to a numeric format.

In [9]:
# Spot-check one scoring column to see how its values and dtype were loaded.
evals.loc[:, 'Adaptive Percentile Rank']

0        16
1        21
2        25
3        84
4         5
         ..
23939    84
23940    84
23941    58
23942    21
23943    63
Name: Adaptive Percentile Rank, Length: 19435, dtype: object

In [4]:
# Number of de-duplicated evaluations per program label.
evals.value_counts(subset='Program Label')

Program Label
BDI-3 Eligibility Evaluation          12344
BDI-3 Annual Evaluation                6888
BDI-3 Milestone or Exit Evaluation      203
Name: count, dtype: int64

In [5]:
# Number of de-duplicated evaluations per location.
evals.value_counts(subset='Location - Sub Level 1')

Location - Sub Level 1
Greater Nashville      3405
East Tennessee         3337
Memphis Delta          2712
South Central          2555
Upper Cumberland       2018
First Tennessee        1689
Southeast Tennessee    1579
Southwest              1099
Northwest               954
Name: count, dtype: int64

In [6]:
# Number of de-duplicated evaluations per recorded gender code.
evals.value_counts(subset='Gender')

Gender
M    11921
F     7223
U      291
Name: count, dtype: int64