# AEPS Examiner Analysis

##### This notebook uses two formatted and filtered DataFrames:
    a. The 'oc2_df' DataFrame contains items and corresponding columns relevant to Outcome B (aka Outcome 2).
    b. The 'full_df' DataFrame contains all items and corresponding columns.
##### Using this data, an analysis is provided for the following query:
    "Are there any trends in the scoring of individual Outcome B test items with regard to the examiner column?"

### Read in the TEIS data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the data set that contains every item column.
# low_memory=False has pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
original_full_df = pd.read_csv(
    '../data/aeps_all_data.csv',
    low_memory=False,
)

In [None]:
# Load the Outcome B (Outcome 2) data set.
# 'Child ID' is read as text rather than letting pandas infer a numeric dtype.
child_id_as_text = {'Child ID': str}
original_oc2_df = pd.read_csv('../data/aeps_oc2_data.csv', dtype=child_id_as_text)

### Prepare the DataFrames for analysis

In [None]:
# Minimum number of exams an examiner must have given to be kept in the analysis
# Note: This cut-off retains about 95% of the total exams given
MIN_EXAMS = 10

# Count the exams given by each examiner.
# rename_axis/reset_index(name=...) yields the columns ['Examiner', 'Count']
# regardless of how the installed pandas version names the value_counts() result.
examiner_counts = (
    original_full_df['Examiner']
    .value_counts()
    .rename_axis('Examiner')
    .reset_index(name='Count')
)

# Examiners who have given MIN_EXAMS or more exams
top_examiners = (
    examiner_counts
    .loc[examiner_counts['Count'] >= MIN_EXAMS]
    .reset_index(drop=True)
)

# Save the 'dropped' examiners for potential further analysis.
# This must be taken from the unfiltered counts; filtering top_examiners
# (which only holds Count >= MIN_EXAMS) would always give an empty frame.
bottom_examiners = (
    examiner_counts
    .loc[examiner_counts['Count'] < MIN_EXAMS]
    .reset_index(drop=True)
)

# Build the working DataFrames from the untouched originals, keeping only the
# exams given by the top examiners. .copy() makes them independent of the
# originals so later cells can modify them safely, and re-running this cell
# always produces the same result.
kept_examiners = top_examiners['Examiner']
oc2_df = original_oc2_df.loc[original_oc2_df['Examiner'].isin(kept_examiners), :].copy()
full_df = original_full_df.loc[original_full_df['Examiner'].isin(kept_examiners), :].copy()

### Perform the Analysis

**For each examiner, compare the overall trend of their average scores for Outcome B items to the overall trend of their average scores for all items.**

In [None]:
# Create a subset of each DataFrame holding only the values needed for analysis:
# the examiner followed by the block of item-score columns.
# .copy() gives each subset its own data, so inserting and assigning columns
# below (and in later cells) never acts on a view of oc2_df / full_df and does
# not trigger SettingWithCopyWarning.
scores_oc2_df = oc2_df.loc[:, 'fm_B4.0':'sc_D3.0'].copy()
scores_oc2_df.insert(0, 'Examiner', oc2_df['Examiner'])
scores_oc2_df = scores_oc2_df.reset_index(drop=True)

scores_full_df = full_df.loc[:, 'fm_A1.0':'soc_C2.2'].copy()
scores_full_df.insert(0, 'Examiner', full_df['Examiner'])
scores_full_df = scores_full_df.reset_index(drop=True)

In [None]:
# Maximum attainable totals used to express each exam's score as a percentage.
# NOTE(review): presumably (number of items) x (maximum score of 2 per item) --
# confirm that 17 and 249 match the widths of the column ranges used below.
total_possible_oc2 = 17 * 2.0
total_possible_full = 249 * 2.0

# For Outcome B items and for all items, respectively, add to every exam:
#   sum_score        - the total of its item scores
#   perc_total_score - that total as a rounded percentage of the maximum possible
for scores, first_item, last_item, total_possible in (
    (scores_oc2_df, 'fm_B4.0', 'sc_D3.0', total_possible_oc2),
    (scores_full_df, 'fm_A1.0', 'soc_C2.2', total_possible_full),
):
    scores['sum_score'] = scores.loc[:, first_item:last_item].sum(axis=1)
    scores['perc_total_score'] = (scores['sum_score'] / total_possible * 100).round()

In [None]:
def summarize_by_examiner(scores_df):
    """Collapse per-exam scores into one summary row per examiner.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        One row per exam, with an 'Examiner' column and a 'perc_total_score'
        column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        'Examiner', 'avg_score_perc' (mean of perc_total_score, rounded to an
        int), 'total_exams' (number of exams by that examiner) and
        'perc_total_exams' (that examiner's share of the exams in scores_df,
        as a percentage rounded to 2 decimals). Rows are sorted by ascending
        avg_score_perc, then descending total_exams.
    """
    by_examiner = scores_df.groupby('Examiner')['perc_total_score']
    summary = pd.DataFrame({
        'avg_score_perc': by_examiner.mean().round().astype(int),
        'total_exams': by_examiner.size(),
    }).reset_index()

    # The share is taken relative to the exams in the frame being summarized,
    # so the percentages of one summary always add up to ~100.
    summary['perc_total_exams'] = (summary['total_exams'] / len(scores_df) * 100.0).round(2)

    return summary.sort_values(by=['avg_score_perc', 'total_exams'], ascending=[True, False])


# Summarize the Outcome B item scores per examiner
oc2_groups = summarize_by_examiner(scores_oc2_df)

# Repeat for the dataset containing all item scores
full_groups = summarize_by_examiner(scores_full_df)

# Total number of exams in the filtered full data set (kept available for later cells)
tot_exams = full_df.shape[0]

### Format and Display Correlation Graphs

In [None]:
# Declare the font styles: axis labels and the title share one look,
# with the title set two points larger.
font_axes = dict(family='serif', weight='bold', size=12, color='#FF0000')
font_title = dict(font_axes, size=14)

In [None]:
# Draw gridlines behind the data points.
# This rc setting is read when an Axes is created, so it has to be set BEFORE
# the plot is built; setting it afterwards only affects figures made later.
plt.rc('axes', axisbelow=True)

# Create a scatter plot with a correlation line.
# sns.lmplot returns a seaborn FacetGrid (not a matplotlib Figure).
lm_grid = sns.lmplot(
    data=oc2_groups,
    x='avg_score_perc',
    y='total_exams',
    aspect=1.5,
    ci=None,
    markers='d',
    line_kws={'color':'black', 'linestyle':'--', 'linewidth':0.7},
    scatter_kws={'color':'#2DCCD3', 's':50, 'marker':'d', 'edgecolors':'white', 'linewidths':0.6},
)

# Add vertical gridlines
plt.grid(axis='x', color='lightgray', alpha=0.2)

# Add a background color to the plotting area
lm_grid.set(facecolor='#E8E8E8')

# Add axis and title labels
plt.title(label = 'Total Exams and Average Cumulative Score per Examiner\n(Outcome B Items)', fontdict=font_title)
plt.xlabel('Average Cumulative Score\n(% of Total Possible Score for Outcome B Items)', fontdict=font_axes)
plt.ylabel('Total Exams', fontdict=font_axes)

# Add a top and right border line (lmplot removes them by default)
sns.despine(top=False, right=False)

# (Optional) Uncomment the line below to export the figure
#plt.savefig('../data/exams_ocb.png', bbox_inches="tight")

**Compare the overall distributions of Outcome B items to all items.**

In [None]:
# (x, y) point lists for each data set: average score % vs. total exams per examiner
all_data = (list(full_groups['avg_score_perc']), list(full_groups['total_exams']))
oc2_data = (list(oc2_groups['avg_score_perc']), list(oc2_groups['total_exams']))

# Parallel tuples: position i of each tuple describes the same series.
# 'All Items' is drawn first so the Outcome B markers sit on top of it.
data_tuple = (all_data, oc2_data)
colors = ('#737373', '#2DCCD3')
markers = ('o', 'd')
groups = ('All Items', 'Outcome B Items')
alphas = (1, 0.7)
edgecolors = ('black', 'white')

fig, ax = plt.subplots()

# Add a background color
ax.set_facecolor('#E8E8E8')

# Draw one scatter series per data set
for data, clr, mrk, group, alph, ec in zip(data_tuple, colors, markers, groups, alphas, edgecolors):
    x, y = data
    ax.scatter(x, y, color = clr, marker = mrk, alpha = alph, edgecolors = ec, linewidths = 0.6, s = 50, label = group)

# Add ticks, labels, legend and horizontal gridlines
ax.set_xticks([0, 10, 20, 30, 40, 50, 60, 70, 80])
ax.set_title(label = 'Total Exams and Average Scores per Examiner', fontdict=font_title)
ax.set_xlabel('Average Score\n(% of Total)', fontdict=font_axes)
ax.set_ylabel('Total Exams', fontdict=font_axes)
ax.legend(loc = 'upper right')
ax.grid(True, axis='y', alpha=0.3)

# (Optional) Uncomment the line below to export the figure
#plt.savefig('../data/exams_corr.png', bbox_inches="tight")