In [237]:
import pandas as pd
%matplotlib inline
import pylab as plt
import numpy as np
import scipy as sc
import scipy.stats

# High school test scores data

Since our correlation metrics -- as predicted -- did not show a sufficiently significant relationship between GPA and yield, we decided to add new features to our datasets.

Fortunately, we have found a dataset with California SAT, ACT and AP test scores by year and high school: https://www.cde.ca.gov/ds/sp/ai/. The dataset consists of multiple `.xls` files grouped by:
 - year
 - test
 
Moreover, the `.xls` files have different formats. Examples of differences include:
 - extra/missing columns
 - number of rows before the header
 - different column names
 
We prepared the following functions to resolve these differences:

In [238]:
def read_multi_xls(prefix, start_year, end_year, skip_row_count=0):
    """Load one ``.xls`` workbook per year and return the frames as a list.

    The file for a year is ``prefix`` + the zero-padded two-digit year +
    ``.xls`` (2004 -> ``<prefix>04.xls``).  ``start_year`` and ``end_year``
    are both inclusive.  The first ``skip_row_count`` rows of every workbook
    are skipped, and each frame receives a ``year`` column holding the full
    year it was read for.  Frames are returned in ascending year order and
    are not concatenated.
    """
    def load_year(year):
        # Only the last two digits of the year appear in the file name.
        suffix = str(year % 100).zfill(2)
        frame = pd.read_excel(prefix + suffix + '.xls',
                              skiprows=list(range(skip_row_count)))
        frame['year'] = year
        return frame

    return [load_year(year) for year in range(start_year, end_year + 1)]

def import_multi_xls(prefix, start_year, end_year, skip_row_count, columns_to_remove, must_have_columns=()):
    """Read one ``.xls`` file per year, align the headers and stack the rows.

    The file for a year is ``prefix`` + the zero-padded two-digit year +
    ``.xls``.  Header spellings differ between years, so columns are matched
    *by position*: the header of the first year read becomes the reference
    and every later frame is relabelled with it.

    Parameters
    ----------
    prefix : str
        Path prefix shared by all files.
    start_year, end_year : int
        Inclusive range of years to read.
    skip_row_count : mapping of int -> int
        Number of leading rows to skip, looked up by year.
    columns_to_remove : mapping of int -> list
        Column labels deleted from that year's frame right after reading.
    must_have_columns : iterable of labels, optional
        Columns that every frame must contain.  A frame lacking one gets it
        filled with NaN, placed at the column's position in the reference
        header.  If the first frame itself lacks one, it is appended at the
        end of that frame and so becomes part of the reference header.

    Returns
    -------
    pandas.DataFrame
        All years concatenated, with an extra ``year`` column.  Row labels of
        the individual frames are kept as they are.

    Raises
    ------
    ValueError
        If a frame has fewer columns than the reference header after the
        must-have columns were added, so positional relabelling is impossible.
    """
    data_frames = []
    column_names = None  # reference header, fixed by the first year read
    for year in range(start_year, end_year + 1):
        path = prefix + str(year % 100).zfill(2) + '.xls'
        df = pd.read_excel(path, skiprows=list(range(skip_row_count[year])))
        for col in columns_to_remove[year]:
            del df[col]

        if column_names is None:
            # First frame: there is no reference position yet, so a missing
            # must-have column goes at the end.
            for col in must_have_columns:
                if col not in df.columns:
                    df[col] = np.nan
            column_names = list(df)
        else:
            missing = [col for col in must_have_columns if col not in df.columns]
            # Insert left to right; inserting a later column first would
            # shift it once an earlier one is added in front of it.
            for col in sorted(missing, key=column_names.index):
                if col in df.columns:  # label listed twice by the caller
                    continue
                position = min(column_names.index(col), len(df.columns))
                df.insert(position, col, np.nan)

        current_names = list(df)
        if len(current_names) < len(column_names):
            raise ValueError(
                'File %s has %d columns but the reference header has %d; '
                'cannot align columns by position.'
                % (path, len(current_names), len(column_names))
            )
        # Positional relabelling; surplus trailing columns keep their names.
        df.columns = column_names + current_names[len(column_names):]

        df['year'] = year

        data_frames.append(df)
    return pd.concat(data_frames)

## AP Scores

The first set of scores we will import is the AP scores. It turns out that the data between 1999 and 2013 has a similar format, and we should be able to perform a single `import_multi_xls()` call for all of it.

In [239]:
# Import every AP workbook from 1999 through 2013 in one call.
# - Rows skipped before the header: 2 for files up to 2008, 3 from 2009 on.
# - Only the 2004 and 2010 files have columns that are removed.
# - The two listed columns are added (as NaN) to any year that lacks them.
ap_99_13 = import_multi_xls(
    'data/test_scores/ap/ap',
    1999,
    2013,
    skip_row_count={year: (2 if year <= 2008 else 3) for year in range(1999, 2014)},
    columns_to_remove={
        1999: [],
        2000: [],
        2001: [],
        2002: [],
        2003: [],
        2004: [
            'Rate of Exams\nWith a Score of\n3 or Greater\nFor 12th Grade\nStudents',
            'Rate of Exams\nWith a Score of\n3 or Greater\nFor 11th & 12th\nGrade Students',
        ],
        2005: [],
        2006: [],
        2007: [],
        2008: [],
        2009: [],
        2010: ['Year'],
        2011: [],
        2012: [],
        2013: [],
    },
    must_have_columns=[
        'Total\nNumber of\nExams Taken',
        'Number\nof Exams\nWith a Score of\n3 or Greater',
    ],
)
ap_99_13.head()

Unnamed: 0,County Number,District Number,School Number,County Name,District Name,School Name,Grade 11 Enrollment (October 1998 CBEDS),Grade 12 Enrollment (October 1998 CBEDS),Grade 11+12 Enrollment (October 1998 CBEDS),Number of AP Exam Takers,Total Number of Exams Taken,Number of Exams With a Score of 3 or Greater,Number of Exams With a Score of 1,Number of Exams With a Score of 2,Number of Exams With a Score of 3,Number of Exams With a Score of 4,Number of Exams With a Score of 5,year
0,1,10017,130401,Alameda,Alameda Co. Office Of Educatio,Juvenile Hall/Court,157.0,224.0,381.0,0.0,0,0,0,0,0,0,0,1999
1,1,10017,130419,Alameda,Alameda Co. Office Of Educatio,County Community,3.0,8.0,11.0,0.0,0,0,0,0,0,0,0,1999
2,1,10017,130427,Alameda,Alameda Co. Office Of Educatio,Alternative/Opportunity,35.0,67.0,102.0,0.0,0,0,0,0,0,0,0,1999
3,1,61119,130229,Alameda,Alameda City Unified,Alameda High,413.0,372.0,785.0,165.0,272,183,35,54,95,54,34,1999
4,1,61119,132878,Alameda,Alameda City Unified,Encinal High,350.0,240.0,590.0,138.0,267,77,86,104,53,22,2,1999


The data from 2014 to 2016 is formatted differently, and we will need to import it separately:

In [240]:
# Column names differ only in letter case between these years, which would
# produce separate columns when the frames are stacked, so every header is
# lower-cased first.
ap_14_16_dfs = [
    frame.rename(columns=str.lower)
    for frame in read_multi_xls('data/test_scores/ap/ap', 2014, 2016)
]
ap_14_16 = pd.concat(ap_14_16_dfs)
ap_14_16.head()

Unnamed: 0,cds,cname,dname,enroll1012,enroll12,numscr1,numscr2,numscr3,numscr4,numscr5,numtsttakr,rtype,sname,year
0,1612000133397,Alameda,Livermore Valley Joint Unified ...,1476,467,38.0,100.0,220.0,197.0,115.0,394,S,Granada High ...,2014
1,1612000134536,Alameda,Livermore Valley Joint Unified ...,1294,438,37.0,87.0,156.0,98.0,62.0,255,S,Livermore High ...,2014
2,1612340000000,Alameda,Newark Unified ...,1539,491,117.0,181.0,172.0,103.0,63.0,324,D,...,2014
3,1612340130054,Alameda,Newark Unified ...,1414,418,117.0,181.0,172.0,103.0,63.0,324,S,Newark Memorial High ...,2014
4,1612340130484,Alameda,Newark Unified ...,33,14,,,,,,0,S,Crossroads High (Alternative) ...,2014


We will also extract the school number from the CDS number:

In [241]:
# A CDS code is county (2 digits) + district (5 digits) + school (7 digits),
# so the school code is the last SEVEN digits.  Taking only six would cut
# off the leading digit of any school code that does not start with 0.
# Slicing from the right also copes with the county's leading zero having
# been lost when the code was read as a number.
ap_14_16['school_num'] = pd.to_numeric(ap_14_16['cds'].astype(str).str[-7:])
ap_14_16.head()

Unnamed: 0,cds,cname,dname,enroll1012,enroll12,numscr1,numscr2,numscr3,numscr4,numscr5,numtsttakr,rtype,sname,year,school_num
0,1612000133397,Alameda,Livermore Valley Joint Unified ...,1476,467,38.0,100.0,220.0,197.0,115.0,394,S,Granada High ...,2014,133397
1,1612000134536,Alameda,Livermore Valley Joint Unified ...,1294,438,37.0,87.0,156.0,98.0,62.0,255,S,Livermore High ...,2014,134536
2,1612340000000,Alameda,Newark Unified ...,1539,491,117.0,181.0,172.0,103.0,63.0,324,D,...,2014,0
3,1612340130054,Alameda,Newark Unified ...,1414,418,117.0,181.0,172.0,103.0,63.0,324,S,Newark Memorial High ...,2014,130054
4,1612340130484,Alameda,Newark Unified ...,33,14,,,,,,0,S,Crossroads High (Alternative) ...,2014,130484


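We also want to drop the aggregate rows (for example, the district-level row with `rtype` `D` above) and keep only the school-level rows (`rtype` `S`):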

In [242]:
# Keep only the rows whose record type is 'S'; in the preview above these are
# the rows that carry a school name.
ap_14_16 = ap_14_16.loc[ap_14_16['rtype'].eq('S')]
ap_14_16.head()

Unnamed: 0,cds,cname,dname,enroll1012,enroll12,numscr1,numscr2,numscr3,numscr4,numscr5,numtsttakr,rtype,sname,year,school_num
0,1612000133397,Alameda,Livermore Valley Joint Unified ...,1476,467,38.0,100.0,220.0,197.0,115.0,394,S,Granada High ...,2014,133397
1,1612000134536,Alameda,Livermore Valley Joint Unified ...,1294,438,37.0,87.0,156.0,98.0,62.0,255,S,Livermore High ...,2014,134536
3,1612340130054,Alameda,Newark Unified ...,1414,418,117.0,181.0,172.0,103.0,63.0,324,S,Newark Memorial High ...,2014,130054
4,1612340130484,Alameda,Newark Unified ...,33,14,,,,,,0,S,Crossroads High (Alternative) ...,2014,130484
6,1612420126763,Alameda,New Haven Unified ...,125,73,,,,,,0,S,Decoto School for Independent Study ...,2014,126763


Next, we will rename and drop some of the columns in our dataframes.

In [243]:
# Rebind instead of using inplace=True: `ap_14_16` was produced by boolean
# filtering, and an in-place drop on such a frame can emit
# SettingWithCopyWarning.

# 2014-2016: drop identifiers/names that are no longer needed (the school
# number was already extracted from `cds`) and the grade 10-12 enrollment.
ap_14_16 = ap_14_16.drop(columns=[
    'cds', 'cname', 'dname', 'rtype', 'sname', 'enroll1012',
])

# 1999-2013: drop county/district identifiers, names, the two aggregate exam
# counts and every enrollment column except grade 12.  The column labels come
# from the 1999 file, hence "October 1998" in the enrollment names.
ap_99_13 = ap_99_13.drop(columns=[
    'County\nNumber', 'District\nNumber', 'County Name', 'District Name', 'School Name',
    'Total\nNumber of\nExams Taken', 'Number\nof Exams\nWith a Score of\n3 or Greater',
    'Grade 11\nEnrollment\n(October 1998\nCBEDS)',
    'Grade 11+12\nEnrollment\n(October 1998\nCBEDS)',
])

# Map the remaining 1999-2013 headers onto the 2014-2016 column names so the
# two frames line up when stacked.  'year' maps to itself; it is listed so
# the dict names every surviving column.
ap_99_13_renamer = {
    'School\nNumber': 'school_num',
    'Number of\nAP Exam\nTakers': 'numtsttakr',
    'Number\nof Exams\nWith a Score of\n1': 'numscr1',
    'Number\nof Exams\nWith a Score of\n2': 'numscr2',
    'Number\nof Exams\nWith a Score of\n3': 'numscr3',
    'Number\nof Exams\nWith a Score of\n4': 'numscr4',
    'Number\nof Exams\nWith a Score of\n5': 'numscr5',
    'Grade 12\nEnrollment\n(October 1998\nCBEDS)': 'enroll12',
    'year': 'year',
}
ap_99_13 = ap_99_13.rename(columns=ap_99_13_renamer)

Finally, we concatenate the two datasets into a single dataframe:

In [244]:
# Stack the 1999-2013 rows on top of the 2014-2016 rows.  Row labels are not
# reset, so labels from the individual frames repeat in the result.
ap_scores = pd.concat(
    objs=(ap_99_13, ap_14_16),
)

All of our columns should hold numeric values at this point. We will have pandas convert every value to a numeric type, turning any non-numeric value into `NaN`:

In [245]:
# Convert column by column (one vectorised call per column) rather than row
# by row, which costs one Python-level call per row.  Values that cannot be
# parsed become NaN.  The final cast keeps every column as float, matching
# the all-float table produced by the row-wise version.
ap_scores = ap_scores.apply(pd.to_numeric, errors='coerce').astype(float)
# Show a preview rather than the whole frame.
ap_scores.head(10)

Unnamed: 0,enroll12,numscr1,numscr2,numscr3,numscr4,numscr5,numtsttakr,school_num,year
0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,130401.0,1999.0
1,8.0,0.0,0.0,0.0,0.0,0.0,0.0,130419.0,1999.0
2,67.0,0.0,0.0,0.0,0.0,0.0,0.0,130427.0,1999.0
3,372.0,35.0,54.0,95.0,54.0,34.0,165.0,130229.0,1999.0
4,240.0,86.0,104.0,53.0,22.0,2.0,138.0,132878.0,1999.0
5,119.0,0.0,0.0,0.0,0.0,0.0,0.0,134304.0,1999.0
6,23.0,0.0,0.0,0.0,0.0,0.0,0.0,130294.0,1999.0
7,184.0,12.0,22.0,39.0,25.0,30.0,89.0,130450.0,1999.0
8,687.0,25.0,46.0,90.0,130.0,223.0,325.0,131177.0,1999.0
9,44.0,0.0,0.0,0.0,0.0,0.0,0.0,134924.0,1999.0


We are saving the processed data to a CSV file. This way we will not have to redo all of our computations.

In [247]:
# Persist the cleaned AP table as comma-separated text, without the row
# labels, so later runs can load it instead of repeating the import.
ap_scores.to_csv(
    path_or_buf='data/test_scores/ap/processed.csv',
    index=False,
    sep=',',
)

## SAT scores

Similarly to the AP scores, the SAT scores are split by year, and the `.xls` files have different formats. We will need to merge them in the same way as in the *AP Scores* section.