# AEPS Examiner Analysis (Preparation)

##### This notebook formats and filters the TEIS AEPS spreadsheet and produces two DataFrames:
    a. The 'oc2_df' DataFrame contains items and corresponding columns relevant to Outcome B (aka Outcome 2).
       This DataFrame is exported as 'aeps_oc2_data.csv'
    b. The 'df' DataFrame contains all items and corresponding columns.
       This DataFrame is exported as 'aeps_all_data.csv'

### Read in the TEIS Data

In [None]:
import pandas as pd
import string
import fuzzywuzzy
from fuzzywuzzy import fuzz
import re

In [None]:
# Load the 'AEPS 2017-2022' worksheet of the source workbook into a DataFrame
aeps_original_df = pd.read_excel(
    io='../data/AEPSi Data for NSS.xlsx',
    sheet_name='AEPS 2017-2022',
)

### Prepare the Data

#### Filter and Clean

In [None]:
# Create a working copy of the dataframe to use for manipulation.
# .copy() gives df its own data; a plain assignment would only bind a second
# name to the same object, so in-place edits to df would also change
# aeps_original_df.
df = aeps_original_df.copy()

In [None]:
# Drop two rows (index labels 1393 and 17927) with unusable data, then renumber the index
df = df.drop([1393, 17927]).reset_index(drop = True)

# Strip leading/trailing whitespace from every string cell; non-string cells pass through unchanged.
# The check has to be isinstance(x, str): isinstance(x, type(str)) tests whether x is a class,
# which is never true for a string value, so nothing would be stripped.
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Convert datatype of Child ID to be string for all rows.
df = df.astype({'Child ID':'string'})

# Remove tab characters and the letter 'i' from Child ID.
# regex=False makes both patterns plain literals regardless of the pandas default.
df['Child ID'] = df['Child ID'].str.replace('\t', '', regex=False)
df['Child ID'] = df['Child ID'].str.replace('i', '', regex=False)

#### Prepare the 'Examiner' column for analysis

In [None]:
# As per Chris:
# First, an easy adjustment: strip any leading or trailing spaces
df.Examiner = df.Examiner.str.strip()

# For cells that contain a semicolon, keep only what is to the left of the first semicolon.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
df.Examiner = df.Examiner.str.extract('(^[^;]+)', expand=False)

# Count how often each examiner name occurs and keep only the names that occur more than once.
# The filter is applied to the counts Series itself, so it does not depend on the column name
# that value_counts().to_frame() produces (that name differs between pandas versions).
name_counts = df.Examiner.value_counts()
ex_counts = name_counts[name_counts > 1].to_frame()
# Reference spellings, most frequent first (value_counts sorts by descending count)
Examiner_list = ex_counts.index.to_list()

# A name is treated as a variant of a reference spelling when the fuzzy ratio is above this value
SIMILARITY_THRESHOLD = 88

# Decide the replacement once per distinct Examiner value instead of once per row:
# every row holding the same value gets the same outcome, so this yields the same
# column while calling the fuzzy matcher far fewer times. Missing values are skipped.
canonical = {}
for value in df.Examiner.dropna().unique():
    for name in Examiner_list:
        # Ratio of how similar the entry in the dataframe is to the entry in the list
        ratio = fuzz.token_set_ratio(name, value)
        if ratio > SIMILARITY_THRESHOLD:
            # The first reference name above the threshold settles this value.
            # A ratio of 100 means the entry is kept as it is; anything lower
            # (but still above the threshold) is rewritten to the reference name.
            if ratio < 100:
                canonical[value] = name
            break

# Apply the replacements; values with no match above the threshold are left unchanged
df.Examiner = df.Examiner.map(lambda value: canonical.get(value, value))

In [None]:
# Final tidy-up of the Examiner names, applied as a single chain:
#   1. replace each double space with a single space and delete every '1' and '2'
#   2. strip digits from both ends of the name
#   3. strip punctuation marks from both ends of the name
#   4. title-case the result so capitalization is consistent
df['Examiner'] = (
    df['Examiner']
    .str.replace('  ', ' ')
    .str.replace('1', '')
    .str.replace('2', '')
    .str.strip(string.digits)
    .str.strip(string.punctuation)
    .str.title()
)

In [None]:
# Build the Outcome B subset from named groups of columns; the groups are
# concatenated in the order the columns should appear in oc2_df.

# Identifying / demographic fields
oc2_info_cols = [
    'Child ID', 'Program Name', 'AEPSi ID', 'DOB', 'Gender', 'Dev Status',
    'Test Date', 'Examiner', 'Service Coordinator',
    'TEIS Point of Entry Office (POE)', 'ESL', 'County of Residence',
    'Number of Items',
]

# Individual Outcome B items
oc2_item_cols = [
    'fm_B4.0', 'fm_B5.0',
    'cog_D2.0', 'cog_E2.0', 'cog_E4.0', 'cog_F1.0',
    'cog_G1.0', 'cog_G2.0', 'cog_G3.0', 'cog_G4.0', 'cog_G5.0', 'cog_G6.0',
    'sc_B1.0', 'sc_B2.0', 'sc_D1.0', 'sc_D2.0', 'sc_D3.0',
]

# Raw / possible / percentage scores per area, then overall
oc2_score_cols = [
    'FM Raw Score', 'FM Possible Score', 'FM Percentage',
    'GM Raw Score', 'GM Possible Score', 'GM Percentage',
    'Adapt Raw Score', 'Adapt Possible Score', 'Adapt Percentage',
    'Cog Raw Score', 'Cog Possible Score', 'Cog Percentage',
    'SC Raw Score', 'SC Possible Score', 'SC Percentage',
    'Soc Raw Score', 'Soc Possible Score', 'Soc Percentage',
    'Overall Raw Score', 'Overall Possible Score', 'Overall Percentage',
]

# Goal score / cutoff / result per area
oc2_result_cols = [
    'FM Goal Score', 'FM Cutoff', 'FM Result',
    'GM Goal Score', 'GM Cutoff', 'GM Result',
    'Adapt Goal Score', 'Adapt Cutoff', 'Adapt Result',
    'Cog Goal Score', 'Cog Cutoff', 'Cog Result',
    'SC Goal Score', 'SC Cutoff', 'SC Result',
    'Soc Goal Score', 'Soc Cutoff', 'Soc Result',
]

oc2_df = df[oc2_info_cols + oc2_item_cols + oc2_score_cols + oc2_result_cols]

In [None]:
# Columns in the main DataFrame that the analysis does not use
unused_cols = ['AssessID', 'State', 'AEPS Level', 'Chron Age', 'Child First', 'Child Last']
# Remove them; drop() returns a new frame, which is bound back to df
df = df.drop(unused_cols, axis='columns')

#### Save the processed dataframes for future load-in

In [None]:
# Export the full cleaned DataFrame; with to_csv's defaults the row index is written as the first column
df.to_csv(path_or_buf='../data/aeps_all_data.csv')

In [None]:
# Export the Outcome B subset; with to_csv's defaults the row index is written as the first column
oc2_df.to_csv(path_or_buf='../data/aeps_oc2_data.csv')