In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import math
# pandas display options: allow 500 characters of width and up to 100 columns
# before truncating, and render frames as HTML tables in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn figure defaults: "whitegrid" style with the "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# Load every raw data set that is merged further down in this notebook.
# NOTE(review): two files are read from data/ and the rest from the working
# directory; confirm where each file actually lives.
Math = pd.read_csv("data/Math.csv")
Charter = pd.read_csv("data/Math-Charter.csv")
# The original path was "daClassSize.csv", which looks like a truncated
# "data/ClassSize.csv" -- TODO confirm the real file name and location.
Size = pd.read_csv("data/ClassSize.csv")
Survey = pd.read_csv("SurveyData.csv")
Demo = pd.read_csv("DemographicData.csv")
funding = pd.read_csv("fundingdf_new.csv")
Safety = pd.read_csv("School_Safety_Report.csv")

IOError: File Math.csv does not exist

In [None]:
# CHARTER DATA CLEAN-UP
# Restrict the charter table to its 2011 rows, then discard the
# "Demographic" column.
Charter = Charter.loc[Charter["Year"] == 2011]
Charterdf = Charter.drop(["Demographic"], axis=1)
# Every row in the charter file is a charter school, so label them all.
Charterdf["School Type"] = "Charter"
Charterdf.head()

In [None]:
# CLEAN UP MATH DATA
# Drop the "Demographic" column and keep only the 2011 rows. The explicit
# .copy() makes Mathdf an independent frame, so adding a column below does not
# assign into a slice of the filtered frame (SettingWithCopyWarning).
Mathdf = Math.drop("Demographic", axis=1)
Mathdf = Mathdf[Mathdf["Year"] == 2011].copy()
# Math.csv contains only public schools, so label every row accordingly.
Mathdf["School Type"] = "Public"
Mathdf.head()

In [None]:
# Stack the public-school and charter-school rows into one frame with a fresh
# 0..n-1 index. pd.concat replaces the deprecated DataFrame.append.
merge1 = pd.concat([Mathdf, Charterdf], ignore_index=True)
merge1 = merge1.rename(columns={'Mean Scale Score': 'Mean_Scale_Score'})


def _parse_mean_scale_score(raw_score):
    """Return the score as an int, or the string "NaN" for the "s" marker.

    The placeholder stays a string on purpose: the mean-imputation step later
    in this notebook looks for exactly the string "NaN".
    """
    if raw_score == "s":
        return "NaN"
    return int(raw_score)


# Convert the whole column in one pass instead of writing cell by cell
# through the deprecated .ix indexer.
merge1["Mean_Scale_Score"] = merge1["Mean_Scale_Score"].map(_parse_mean_scale_score)
merge1.head()

In [None]:
# CLEAN UP SIZE DATA


def _build_dbn(csd, school_code):
    """Join the district number and the school code into a DBN string.

    The district is zero-padded to two characters ("1" -> "01", "10" -> "10").
    NOTE(review): assumes the DBNs in the other data sets use a two-character
    district prefix. The original prepended "0" unconditionally, which turns
    district 10 into "010"; single-digit districts are unaffected.
    """
    return str(csd).zfill(2) + school_code


def _normalize_grade(raw_grade):
    """Normalize one raw "GRADE " value.

    * a float (how a missing value shows up in this column) becomes "N/A";
    * a two-character string loses its first character ("03" -> "3");
    * anything else is returned unchanged. The original loop left `grade`
      unassigned in this case, so the row silently received the previous
      row's grade.
    """
    if isinstance(raw_grade, float):
        return "N/A"
    if len(raw_grade) == 2:
        return raw_grade[1:]
    return raw_grade


# Whole-column assignments replace the original row loop. That loop skipped
# `counter += 1` whenever the class size was NaN, so every later write landed
# on the wrong row.
Size["DBN"] = [
    _build_dbn(csd, school_code)
    for csd, school_code in zip(Size["CSD"], Size["SCHOOL CODE"])
]
Size["GRADE "] = Size["GRADE "].map(_normalize_grade)
Size["AVERAGE CLASS SIZE"] = Size["AVERAGE CLASS SIZE"].astype(float)

# Keep only the relevant columns, and only general-education rows so the
# classroom type is the same for every sample. Columns are selected by name in
# a fixed order and renamed through a mapping, so the labels no longer depend
# on the column order of the CSV file.
size_keep_columns = [
    "SCHOOL NAME", "BOROUGH", "GRADE ", "PROGRAM TYPE", "AVERAGE CLASS SIZE", "DBN",
]
Sizedf = Size.loc[Size["PROGRAM TYPE"] == "GEN ED", size_keep_columns].copy()

# Match the column naming used by the other tables before merging.
Sizedf = Sizedf.rename(columns={
    "SCHOOL NAME": "School Name",
    "BOROUGH": "Borough",
    "GRADE ": "Grade",
    "PROGRAM TYPE": "Program Type",
    "AVERAGE CLASS SIZE": "Average Class Size",
})
Sizedf.head()

In [None]:
# Attach the class-size columns to the test-score rows. This is a left join on
# the (DBN, Grade) pair, so every row of merge1 is kept.
merge2 = pd.merge(merge1, Sizedf, how="left", on=["DBN", "Grade"])

In [None]:
# Keep only the 2010-11 school year of the demographic data.
Demog = Demo[Demo["Year"] == "2010-11"]

# Columns to carry forward. They are picked in the order they appear in the
# source frame, exactly as dropping the other columns would leave them.
demog_keep_columns = [
    "DBN", "School Name", "% Female", "% Male", "% Asian",
    "% Black", "% Hispanic", "% Other", "% White",
]
Demogdf = Demog[[title for title in Demog.columns if title in demog_keep_columns]].copy()


def _percent_to_fraction(raw_percent):
    """Convert a value such as "50%" to the fraction 0.5.

    Missing values are returned as NaN instead of raising, so they can be
    imputed later.
    """
    if pd.isnull(raw_percent):
        return np.nan
    return float(str(raw_percent).rstrip("%")) / 100


# Assign whole columns at once. The original wrote through
# Demogdf.iloc[i][column]; that chained assignment may land on a temporary copy
# of the row and is not guaranteed to modify Demogdf. It also relied on the
# Python 2-only xrange.
for title in ["% Female", "% Male", "% Asian", "% Black", "% Hispanic", "% Other", "% White"]:
    Demogdf[title] = Demogdf[title].map(_percent_to_fraction)


In [None]:
# Add the demographic columns with a left join on DBN, so every row of merge2
# is kept.
merge3 = pd.merge(merge2, Demogdf, how="left", on="DBN")
merge3.head()

In [None]:
# SAFETY DATA CLEAN-UP
# Everything except the school key and the three incident averages is dropped.
safetyallcolumns = Safety.columns
safety_keep = ("DBN", "AvgOfMajor N", "AvgOfVio N", "AvgOfNoCrim N")
safetydropcolumns = [title for title in safetyallcolumns if title not in safety_keep]
Safetydf = Safety.drop(safetydropcolumns, axis=1)
Safetydf.head()

In [None]:
# Add the safety columns with a left join on DBN, so every row of merge3 is
# kept.
merge4 = pd.merge(merge3, Safetydf, how="left", on="DBN")
merge4.head()

In [None]:
# SURVEY DATA CLEAN-UP
# (Appending a second survey frame, Survey75, is disabled.)
allcolumns = Survey.columns
survey_keep = (
    "dbn", "schoolname", "eng_p_11", "eng_s_11", "eng_t_11",
    "aca_p_11", "aca_t_11", "aca_s_11",
)
# Every column that is not in survey_keep is dropped.
dropcolumns = [title for title in allcolumns if title not in survey_keep]
# Drop them and rename the key column to "DBN" so it can be merged on.
Surveydf = Survey.drop(dropcolumns, axis=1).rename(columns={'dbn': 'DBN'})
Surveydf.head()


In [None]:
# Add the survey columns with a left join on DBN, so every row of merge4 is
# kept.
merge5 = merge4.merge(Surveydf, on="DBN", how="left")
# Preview only the first rows, like the other merge cells, instead of dumping
# the whole frame into the notebook output.
merge5.head()

In [None]:
# Rename the two funding columns so the school key is called "DBN", like in
# the other tables. No "$" or "," stripping is performed here (an earlier
# attempt at that is disabled).
funding.columns = pd.Index(["DBN", "Funding"])
funding.head()

In [None]:
# Final join: add the per-school funding column with a left join on DBN, then
# show the resulting column labels.
mergedf = pd.merge(merge5, funding, how="left", on="DBN")
mergedf.columns

In [None]:
# Mean-impute the missing mean scale scores.
# Suppressed scores are stored as the string "NaN"; turn that placeholder into
# a real NaN and make the column numeric, so the mean is computed once over
# the known scores only (the original recomputed np.mean inside the row loop).
mean_scale_scores = mergedf["Mean_Scale_Score"].replace("NaN", np.nan).astype(float)
mergedf["Mean_Scale_Score"] = mean_scale_scores.fillna(mean_scale_scores.mean())

# Mean-impute the missing class sizes the same way. Series.mean() skips NaN,
# which matches averaging only the non-missing values.
class_sizes = mergedf["Average Class Size"].astype(float)
mergedf["Average Class Size"] = class_sizes.fillna(class_sizes.mean())

In [None]:
def _fill_with_column_mean(frame, column_name):
    """Replace the missing values of one column with that column's mean.

    The mean is computed once over the non-missing values (Series.mean() skips
    NaN) and written back with fillna. Returns the mean that was used.
    """
    column_mean = frame[column_name].mean()
    frame[column_name] = frame[column_name].fillna(column_mean)
    return column_mean


# One helper call per column replaces five copy-pasted row loops that
# recomputed the mean for every missing row. Each entry is
# (column, whether to print the mean); the same three means are printed as
# before.
for column_name, show_mean in [
    ("% Female", True),
    ("% Asian", False),
    ("% Black", False),
    ("% Hispanic", True),
    ("AvgOfNoCrim N", True),
]:
    column_mean = _fill_with_column_mean(mergedf, column_name)
    if show_mean:
        # Single-argument print(...) works under both Python 2 and Python 3.
        print(column_mean)

In [None]:
def _funding_as_int(raw_funding):
    """Return the funding amount as an int; leave missing values untouched."""
    if pd.isnull(raw_funding):
        return raw_funding
    return int(raw_funding)


# Convert the whole column at once. The original loop incremented `counter`
# instead of `counter2`, so every converted value was written to row 0 and the
# remaining rows were never converted.
mergedf["Funding"] = mergedf["Funding"].map(_funding_as_int)

In [None]:
# Final, underscore-style labels for the merged frame. They are assigned by
# position, so the list must contain one name per column, in column order.
final_column_names = [
    'DBN', 'Grade', 'Year', 'Number_Tested', 'Mean_Scale_Score',
    'Num_Level1', 'Pct_Level1', 'Num_Level2', 'Pct_Level2',
    'Num_Level3', 'Pct_Level3', 'Num_Level4', 'Pct_Level4',
    'Num_Level3_and4', 'Pct_Level3_and4',
    'School_Type', 'School_Name_x', 'Borough', 'Program_Type', 'Average_Class_Size',
    'School_Name_y', 'Female_Percentage', 'Male_Percentage', 'Asian_Percentage',
    'Black_Percentage', 'Hispanic_Percentage', 'Other_Percentage', 'White_Percentage',
    'Avg_Major_N', 'Avg_No_Crim_N', 'Avg_Vio_N',
    'School_Name', 'Eng_p_11', 'Aca_p_11', 'Eng_t_11', 'Aca_t_11', 'Eng_s_11', 'Aca_s_11',
    'Funding',
]
mergedf.columns = final_column_names
print(mergedf.shape)

In [None]:
# Write the merged table to disk without the row index.
mergedf.to_csv(path_or_buf="mergedfupdate_new.csv", index=False)

In [None]:
# Sanity check: how many values in Avg_No_Crim_N are still missing?
# The original compared each value with the string "NaN", which never matches
# a real missing value, so the check could not fire. isnull() detects them.
mergedf["Avg_No_Crim_N"].isnull().sum()