# Clean Data

## Story
Show the US averages of <b>income, debt, and net price</b> for schools over the years. Then allow users to see how their own schools compare.

## Goal
* Clean three variables
* Find the averages for each
* Store data in JSON
    * (school_name, income, debt, net_price)
    * average will have school_name == 'avg'

In [18]:
%matplotlib inline

import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import seaborn as sns

# Use seaborn's plain white theme for every plot in this notebook.
sns.set_style("white")

# Open the SQLite database and keep one shared cursor; the execute() helper
# below runs all of its queries through 'c'.
dbPath = '../data/output/database.sqlite'
conn = sqlite3.connect(dbPath)
c = conn.cursor()

def execute(sql):
    '''Run one SQL statement on the shared cursor 'c' and return all of its rows.

    sql -- the SQL text to execute.

    Returns whatever c.fetchall() yields for the statement (a list of row
    tuples for a SELECT).
    '''
    cursor = c
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows

def printByYear(data):
    '''Print each (year, value) pair as "year: value", one pair per line.

    data -- iterable of sequences whose first item is the year and whose
            second item is the value to show next to it; any further items
            are ignored.
    '''
    for datum in data:
        # A single-argument print(...) parses the same way under Python 2
        # (parenthesised expression) and Python 3 (function call), so the
        # helper no longer fails with a SyntaxError on Python 3.
        print("{0}: {1}".format(datum[0], datum[1]))
        
# Multiplier applied to a dollar amount from each year. 2012 and 2013 are both
# 1, so amounts are presumably expressed in 2012/2013 dollars - verify against
# the BLS calculator cited below.
years = {1996: 1.46, 1997: 1.43, 1998: 1.4, 1999: 1.38, 2000: 1.33, 2001: 1.3,
         2002: 1.28, 2003: 1.25, 2004: 1.22, 2005: 1.18, 2006: 1.14,
         2007: 1.11, 2008: 1.07, 2009: 1.07, 2010: 1.05, 2011: 1.02,
         2012: 1, 2013: 1}


def adjustForInflation(value, year):
    '''Adjust the dollar value based on year
    Source (http://www.bls.gov/data/inflation_calculator.htm)

    value -- dollar amount to adjust, or None when the value is missing.
    year  -- year the amount belongs to; must be a key of 'years'.

    Returns value multiplied by that year's factor, or None when value is
    None. Raises KeyError when the year is not in the table.
    '''
    # Identity test: '== None' would call the value's __eq__, which misfires
    # for objects that override equality (e.g. array-like values).
    if value is None:
        return None
    return value * years[year]

In [27]:
# Pull one row per Scorecard record with, in column order: institution name,
# year, in-/out-of-state tuition, median debt, and the earnings columns (mean,
# median, then the 10th/25th/75th/90th percentiles, going by the column names).
# The WHERE clause keeps main campuses that predominantly grant bachelor's
# degrees and drops any CCBASIC category containing 'Special Focus'.
# The doubled '' inside the PREDDEG literal is SQL's escape for an apostrophe.
query = """SELECT INSTNM, YEAR, 
                TUITIONFEE_IN, TUITIONFEE_OUT,
                DEBT_MDN, 
                mn_earn_wne_p10, md_earn_wne_p10, pct10_earn_wne_p10, pct25_earn_wne_p10, 
                pct75_earn_wne_p10, pct90_earn_wne_p10
            FROM Scorecard
            WHERE MAIN='Main campus'
              AND PREDDEG = 'Predominantly bachelor''s-degree granting'
              AND CCBASIC NOT LIKE '%Special Focus%'
        """
# rawData is a list of row tuples whose positions follow the SELECT list above.
rawData = execute(query)

Now that I have the raw data, I want to structure it in a JSON-like fashion: instnm -> year -> data.

In [11]:
# Nested lookup: institution name -> list of {year: metrics} entries.
treeStruct = {}

# Output keys, listed in the same order as the metric columns row[2]..row[10].
# NOTE(review): the last four keys are filled from the pct10/pct25/pct75/pct90
# earnings columns even though they are named 'md_earn...', and 'md_earn25'
# lacks the underscore the others have. Kept as-is because consumers of this
# structure may rely on the exact key names.
metricKeys = ('tuition_in', 'tuition_out', 'debt_mdn', 'mn_earn', 'md_earn',
              'md_earn_10', 'md_earn25', 'md_earn_75', 'md_earn_90')

for row in rawData:
    instnm, year = row[0], row[1]

    # Create the institution's list on first sight, then reuse it.
    entries = treeStruct.setdefault(instnm, [])

    metrics = {}
    for offset, key in enumerate(metricKeys):
        metrics[key] = row[offset + 2]
    entries.append({year: metrics})

In [12]:
# Echo the structure as the cell's result so it can be spot-checked.
treeStruct

{u'Wayne State College': [{2013: {'debt_mdn': 12000.0,
    'md_earn': None,
    'md_earn25': None,
    'md_earn_10': None,
    'md_earn_75': None,
    'md_earn_90': None,
    'mn_earn': None,
    'tuition_in': 5574,
    'tuition_out': 9774}}],
 u'Syracuse University': [{2013: {'debt_mdn': 27000.0,
    'md_earn': None,
    'md_earn25': None,
    'md_earn_10': None,
    'md_earn_75': None,
    'md_earn_90': None,
    'mn_earn': None,
    'tuition_in': 40458,
    'tuition_out': 40458}}],
 u'Baker University': [{2013: {'debt_mdn': 21353.0,
    'md_earn': None,
    'md_earn25': None,
    'md_earn_10': None,
    'md_earn_75': None,
    'md_earn_90': None,
    'mn_earn': None,
    'tuition_in': 25580,
    'tuition_out': 25580}}],
 u'Trinity Washington University': [{2013: {'debt_mdn': 16375.0,
    'md_earn': None,
    'md_earn25': None,
    'md_earn_10': None,
    'md_earn_75': None,
    'md_earn_90': None,
    'mn_earn': None,
    'tuition_in': 21630,
    'tuition_out': 21630}}],
 u'Californ

In [28]:
from sets import Set
# Sanity check: which years appear in the raw rows, and how many rows are 2013?
# The distinct years are stored in 'seenYears' rather than 'years' so that this
# cell no longer overwrites the inflation table that adjustForInflation()
# indexes by year. The builtin set replaces the deprecated sets.Set.
seenYears = set(row[1] for row in rawData)
y2013 = sum(1 for row in rawData if row[1] == 2013)

# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(str(seenYears))
print(len(rawData))
print(y2013)

Set([2013])
1477
