In [None]:
import pandas as pd
import numpy as np
import re
import datetime

In [None]:
# Read the raw evaluation export (relative path) and preview the first rows.
evals_csv = '../data/all_evals.csv'
data = pd.read_csv(evals_csv)
data.head()

Remove code columns

In [None]:
# Every column whose name contains "Code" is dropped from the frame.
pattern = re.compile('Code')
code_cols = [name for name in data.columns if pattern.search(name)]

data = data.drop(columns=code_cols)
data.head()

Get rid of duplicate rows

In [None]:
# Keep the first occurrence of each fully identical row.
data = data.drop_duplicates()

In [None]:
# Renumber the rows 0..n-1 and discard the old labels (drop=True keeps them
# from being added back as a column).
data = data.reset_index(drop = True)

Clean up RDI columns

In [None]:
RDI = re.compile('RDI')
# A raw RDI score is written "<n>/90", optionally prefixed with a "<" or ">"
# bound marker (e.g. "85/90", "<1/90", ">99/90").
pattern = re.compile(r'[<>]?(\d+)/90')


def _parse_rdi(value):
    """Return the numerator of one raw RDI cell as a number.

    Conversions:
      * missing value (NaN/None)        -> NaN
      * value that is already numeric   -> returned unchanged, so re-running
                                           the cell is harmless
      * "85/90", "<1/90", ">99/90"      -> 85, 1, 99
      * "Sep-90" style strings          -> 9 (the month number)

    Raises:
        ValueError: for a string in none of the formats above, so unexpected
        data is surfaced instead of being silently mangled.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value

    text = str(value).strip()
    score = pattern.fullmatch(text)
    if score:
        return int(score.group(1))

    # NOTE(review): presumably the spreadsheet export auto-formatted scores
    # such as "9/90" into dates like "Sep-90"; the month number recovers the
    # numerator. Confirm against the source workbook.
    return datetime.datetime.strptime(text, '%b-%y').month


# Convert each RDI column in one pass. Series.map builds the whole column
# before assigning it, so a bad value cannot leave a column half-converted and
# the result does not depend on the row labels being 0..n-1.
for col in data.columns:
    if RDI.search(col):
        data[col] = data[col].map(_parse_rdi)

In [None]:
data.head(20)

Clean up other columns

In [None]:
# Keep an independent copy of the frame as it stands at this point.
# NOTE(review): data3 is not referenced again in the visible cells; presumably
# a backup taken before the next clean-up step. Confirm it is still needed.
data3 = data.copy()

In [None]:
col_patterns = ['Percentile', 'PR', 'AE', 'NCE']


def _to_number(value):
    """Convert one raw score cell to a float where that is possible.

    Conversions:
      * missing value or empty/blank string -> NaN
      * "<20" / ">99"                       -> 20.0 / 99.0 (bound marker dropped)
      * numeric value or numeric string     -> float
      * any other string                    -> returned unchanged
      * any other non-convertible object    -> NaN
    """
    if pd.isna(value):
        return np.nan

    if isinstance(value, str):
        text = value.strip()
        if text == '':
            return np.nan
        if text[0] in '<>':
            text = text[1:]
        try:
            return float(text)
        except ValueError:
            # Not a number at all: leave the original text in place.
            return value

    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


# One alternation over all keywords, so every column whose name contains any
# of them is cleaned exactly once (no early exit after the first match).
pattern = re.compile('|'.join(col_patterns))

for col in data.columns:
    if pattern.search(col):
        data[col] = data[col].map(_to_number)

In [None]:
# Demonstration: a score with a bound marker cannot be cast directly, which is
# why the marker has to be stripped first. The error is caught and displayed so
# that "Restart & Run All" is not halted by this cell.
try:
    float('<20')
except ValueError as err:
    print('ValueError:', err)

In [None]:
data.head(30)

In [None]:
data.loc[0,'Adaptive Percentile Rank']

### Data is now clean!

Next, let's create sub-dataframes based on what domain the column is in.

In [None]:
list(data.columns)

In [None]:
def slice_data(data, domain):
    """Return the columns of ``data`` whose names match ``domain``.

    ``domain`` is compiled as a regular expression and searched for anywhere
    in each column name. Matching columns keep their original order.
    """
    matcher = re.compile(domain)
    wanted = [name for name in data.columns if matcher.search(name)]
    return data[wanted]

In [None]:
domains = ['Adaptive','Social','Communication','Motor','Cognitive','Total']

# One sub-frame per domain, keyed by the domain name.
df_dict = {domain: slice_data(data, domain) for domain in domains}

print(df_dict.keys())

In [None]:
# Short aliases for the per-domain frames.
adapt, soc, comm, motor, cogn, total = (
    df_dict[key]
    for key in ('Adaptive', 'Social', 'Communication', 'Motor', 'Cognitive', 'Total')
)

We have now sliced the data based on domain! Now let's tackle the questions.

## 1. In which domains (and sub-domains) are children performing highest and lowest?

In [None]:
adapt

In [None]:
adapt_0 = adapt.fillna(0)
adapt_0

In [None]:
list(adapt.columns)

In [None]:
adapt_0.describe()

In [None]:
adapt.describe()

Replacing nulls with 0s affects the last few columns the most and skews their averages. As such, I won't replace the null values in the rest of the dataframes.

Also, several columns are missing from the describe table. This is because those columns have the object dtype rather than a numeric one, so I have to fix them before moving on.

In [None]:
# Columns removed from the Adaptive frame before the numeric conversion below.
interval_cols = [
    'Adaptive 95% Confidence Interval',
    'Adaptive-Self Care CSS 90%',
    'Adaptive-Personal Responsibility CSS 90%',
]
adapt2 = adapt.drop(columns=interval_cols)
adapt2

In [None]:
def _strip_rdi_denominator(value):
    """Drop a trailing "/90" from a raw RDI string; leave anything else as is.

    Only strings that actually end in "/90" are shortened, so missing values
    stay NaN (instead of becoming '') and values that are already numeric are
    not truncated.
    """
    if isinstance(value, str) and value.endswith('/90'):
        return value[:-3]
    return value


# Rewrite the whole column at once rather than cell by cell through .loc, so
# the result does not depend on the row labels being 0..n-1.
adapt2['Adaptive RDI'] = adapt2['Adaptive RDI'].map(_strip_rdi_denominator)

In [None]:
adapt2

In [None]:
# Whole-cell replacements applied one after another: bound markers become
# their bound, and empty strings become 0.
cell_fixes = [('<0.1', '0.1'), ('<1', '1'), ('>99', '99'), ('', 0)]
for raw, fixed in cell_fixes:
    adapt2 = adapt2.replace(raw, fixed)
adapt2

In [None]:
adapt2.astype(float)

In [None]:
'123456789'[:-3]

It has become tedious to drop the columns I don't want by hand. I'm going to define a new function that returns only the columns I want.

In [None]:
def slice_numeric_data(data, domain):
    """Return the columns of ``data`` for ``domain``, minus the excluded kinds.

    A column is kept when its name matches ``domain`` (searched as a regular
    expression) and does not contain 'Date', '%' or 'Examiner'. Kept columns
    stay in their original order.
    """
    domain_re = re.compile(domain)
    excluded = [re.compile(token) for token in ('Date', '%', 'Examiner')]

    def keep(name):
        if not domain_re.search(name):
            return False
        return not any(rx.search(name) for rx in excluded)

    return data[[name for name in data.columns if keep(name)]]

In [None]:
adapt3 = slice_numeric_data(data, 'Adaptive')
adapt3.head(20)

In [None]:
adapt3.astype(float)

In [None]:
# Rebuild the per-domain frames, this time without date, percentage and
# examiner columns.
df_dict = {domain: slice_numeric_data(data, domain) for domain in domains}

print(df_dict.keys())

In [None]:
# Re-point the short aliases at the rebuilt per-domain frames.
adapt, soc, comm, motor, cogn, total = [
    df_dict[key]
    for key in ('Adaptive', 'Social', 'Communication', 'Motor', 'Cognitive', 'Total')
]

In [None]:
adapt

In [None]:
# Preview the Adaptive frame with the bound markers replaced by their bounds;
# `adapt` itself is left untouched.
bound_fixes = [('<1', '1'), ('>99', '99'), ('<24', '24')]
adapt_preview = adapt
for raw, fixed in bound_fixes:
    adapt_preview = adapt_preview.replace(raw, fixed)
adapt_preview