In [1]:
import gdal
from osgeo import ogr
from osgeo import osr
import numpy as np
import libpysal as ps
import spreg

In [2]:
def acaTest(file, subject, yr, dem, inds, dsn, dest):
    '''Run one OLS regression on a .dbf table and write its summary to a text file.

    file is the path to the .dbf file you will be using.

    subject is the academic subject being studied; only its first letter is used to build the name of the
    dependent column.

    yr is the year in which performance is being studied, as a numerical string ('2014' or '2015').

    dem is the ethnic group in which performance in this subject is being studied (only 'overall' is currently
    supported).

    inds is the proper names of the independent variables (code must be modified to allow for variables outside
    of the set initially collected, due to necessary automation). Names that are not recognized are reported and
    left out of the regression.

    dsn is the scope for which the dataset was prepared; it is passed to spreg as the dataset name.

    dest is prepended, exactly as given, to 'OLS_<dependent>_suma.txt' to form the output path. Include a trailing
    path separator if it is meant to be a folder rather than a file-name prefix.

    Returns None. If the year, the demographic, or every independent variable is invalid, a message is printed and
    nothing is read or written.'''

    #the purpose of this tool is to run an OLS regression to explain the variation in math and reading scores in 2014 and
    #2015 in Tacoma elementary and middle schools, using pollution and selected demographic characteristics as independent
    #variables.

    #all arguments are validated before the table is opened, so a bad argument never leaves a file handle open and
    #never reaches the column-name arithmetic with a missing piece.
    year_suffixes = {'2014': '14', '2015': '15'}
    end = year_suffixes.get(str(yr))
    if end is None:
        print('invalid year')
        return

    #please modify this part to suit your own data. ent is appropriate to the dependent variables of the original data set,
    #while demo is appropriate to the variable names of the independent variables in the original dataset.
    if str(dem) == 'overall':
        ent = '_over'
        demo = 'Over'
    else:
        print('Invalid demographic.')
        return

    #this is another one which will require modification if you're working with different data, or data processed
    #differently, from the american community survey data being utilized in the original project.
    #each proper name maps to the code found in the middle of the column name: demo + code + end.
    ind_codes = {
        #estimated pollution values across the school
        'pollution': 'Pol',
        #average household median income by blockgroup for each student in the school.
        'income': 'Inc',
        #average per-relevant-student population of a given block group
        'population': 'Pop',
        #average years of education per adult in the child's blockgroup of residence across the school's relevant students.
        'schooling': 'Sch',
        #from here on, all of these variables are originally decimal percentages expressing the average proportion of a
        #broad category of work in the child's block group, averaged across all relevant students.
        'business': 'Bus',
        'science': 'Sci',
        'educationAndLegal': 'Edl',
        'healthcare': 'Hea',
        'service': 'Ser',
        'salesAndOffice': 'Sal',
        'extraction': 'Ext',
        'logistics': 'Log',
        'disadvantageA': 'DisA',
        'disadvantageB': 'DisB',
        'disadvantageC': 'DisC',
    }

    #this level of automation, and the inflexibility it brought, was necessary to handle the over one hundred variables
    #per record in the dataset this tool was developed for. Which was not how it eventually ended up being used.

    #only recognized names are kept, so the labels handed to spreg always line up with the columns of x.
    used = [name for name in inds if name in ind_codes]
    skipped = [name for name in inds if name not in ind_codes]
    if skipped:
        print('Ignoring unrecognized independent variables: ' + ', '.join(str(name) for name in skipped))
    if not used:
        print('No valid independent variables.')
        return

    dependent = subject[0:1] + end + ent

    #based on https://spreg.readthedocs.io/en/latest/generated/spreg.OLS.html

    db = ps.io.open(file, 'r')
    try:
        dv = db.by_col(dependent)
        columns = [db.by_col(demo + ind_codes[name] + end) for name in used]
    finally:
        #release the table handle even if a column is missing
        db.close()

    #spreg expects y as an (n, 1) column and x as (n, k) with one column per independent variable.
    y = np.array(dv).reshape(len(dv), 1)
    x = np.array(columns).T

    ols = spreg.OLS(y, x, name_y=dependent, name_x=used, name_ds=dsn, white_test=True)

    sumadest = dest + 'OLS_' + dependent + '_suma.txt'

    #https://www.pythontutorial.net/python-basics/python-create-text-file/
    with open(sumadest, 'w') as f:
        f.write(ols.summary)

In [3]:
def testAll(file, subs, dems, yrs, inds, dsn, dest):
    '''Automatically runs acaTest for all subjects, demographics, and years provided, using the specified independent
    variables.
    
    subs is a list of all subjects to be tested.
    
    dems is a list of all ethnic groups to be tested.
    
    yrs is a list of all years, in numerical string form, to be tested.
    
    inds is a list of the names for all independent variables to be used.
    
    dsn is the geographic scope for which the dataset was prepared.
    
    dest is the desired destination.'''
    
    for i in subs:
        for u in yrs:
            for we in dems:
                acaTest(file, i, u, we, inds, dsn, dest)

In [18]:
#middle schools: reading and math, overall demographic, 2014 and 2015, explained by pollution and disadvantageA.
testAll(
    file='other_data/polygons_points/Schools_with_Data/mid_fin_dis.dbf',
    subs=['reading', 'math'],
    dems=['overall'],
    yrs=['2014', '2015'],
    inds=['pollution', 'disadvantageA'],
    dsn='tps_mid_stats',
    dest='olsresults/simpletest/middis_A',
)

In [19]:
#elementary schools: reading and math, overall demographic, 2014 and 2015, explained by pollution and disadvantageA.
testAll(
    file='other_data/polygons_points/Schools_with_Data/elem_fin_dis.dbf',
    subs=['reading', 'math'],
    dems=['overall'],
    yrs=['2014', '2015'],
    inds=['pollution', 'disadvantageA'],
    dsn='tps_elem_stats',
    dest='olsresults/simpletest/elemdis_A',
)