In [1]:
import pathlib
import zipfile

import numpy as np
import pandas as pd

In [2]:
# Directory layout, rooted at the current working directory:
#   data/zip - input zip archives (must already exist)
#   data/csv - extraction target (created here if missing)
cwd = pathlib.Path.cwd()
data_path = cwd.joinpath('data')
zip_path = data_path.joinpath('zip')
csv_path = data_path.joinpath('csv')

# Fail fast: without the zip directory there is nothing to extract.
if not zip_path.exists():
    raise FileNotFoundError(f"'{zip_path}' does not exist")

# No `parents=True` needed: zip_path existing implies data_path exists.
csv_path.mkdir(exist_ok=True)

In [3]:
# Columns that together form the index of both the results and entities frames.
CDS_CODE = ['County Code', 'District Code', 'School Code']

def read_zip(zip_file, extract_dir=None):
    """Extract a two-member zip archive and load both CSVs as DataFrames.

    The archive must hold exactly two members, in this order: the results
    CSV, then the entities CSV. Both members are extracted to ``extract_dir``
    and read back indexed by ``CDS_CODE``.

    Parameters
    ----------
    zip_file : path-like or file-like
        Archive handed to ``zipfile.ZipFile``.
    extract_dir : path-like, optional
        Directory to extract into. Defaults to the notebook-level
        ``csv_path``.

    Returns
    -------
    (results_df, entities_df) : tuple of pandas.DataFrame
        ``results_df`` has 'Test Type' and 'Filler' dropped, '*' replaced by
        NaN, 'Mean Scale Score' as float, and every column whose name
        contains 'Percent' converted to a float fraction (value / 100).
        ``entities_df`` is the entities CSV decoded as latin-1.

    Raises
    ------
    ValueError
        If the archive does not contain exactly two members.
    """
    if extract_dir is None:
        extract_dir = csv_path

    with zipfile.ZipFile(zip_file) as zf:
        members = zf.infolist()
        if len(members) != 2:
            raise ValueError(
                f"expected exactly 2 files in '{zip_file}' (results, entities), "
                f"found {len(members)}")
        results_file, entities_file = members
        results_csv = zf.extract(results_file, extract_dir)
        entities_csv = zf.extract(entities_file, extract_dir)

    results_df = (pd.read_csv(results_csv)
                    .set_index(CDS_CODE)
                    .drop(columns=['Test Type', 'Filler'])      #: All 'Test Type' values were == 'B'
                    .replace('*', np.nan))

    results_df['Mean Scale Score'] = results_df['Mean Scale Score'].astype(float)

    pcts = [col for col in results_df.columns if 'Percent' in col]
    if pcts:
        # Replace the columns outright rather than assigning through
        # `.loc[:, pcts]`: in-place `.loc` assignment can leave the columns
        # as object dtype, and `applymap` is deprecated.
        results_df[pcts] = results_df[pcts].astype(float) / 100

    entities_df = pd.read_csv(entities_csv, encoding='latin-1').set_index(CDS_CODE)
    return results_df, entities_df

In [4]:
# One archive per year-labelled file name under zip_path.
file_2015 = zip_path / 'sb_ca2015_1_csv_v3.zip'
file_2016 = zip_path / 'sb_ca2016_1_csv_v3.zip'
file_2017 = zip_path / 'sb_ca2017_1_csv_v2.zip'
file_2018 = zip_path / 'sb_ca2018_1_csv_v3.zip'

results_2015, entities_2015 = read_zip(file_2015)
results_2016, entities_2016 = read_zip(file_2016)
results_2017, entities_2017 = read_zip(file_2017)
results_2018, entities_2018 = read_zip(file_2018)

# Stack the per-file frames row-wise; sort=True orders the columns so frames
# with differing column sets still line up.
results_all = pd.concat([results_2015, results_2016, results_2017, results_2018], sort=True)

# `+` on DataFrames is index-aligned element-wise arithmetic (strings get
# concatenated, numbers summed), not row stacking, so combine the entity
# tables with concat exactly like the results above.
# NOTE(review): entities repeated across files will appear once per file;
# de-duplicate downstream if a single row per CDS code is required.
entities_all = pd.concat([entities_2015, entities_2016, entities_2017, entities_2018], sort=True)