In [52]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import math
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [53]:
#import all data sets
Math=pd.read_csv("data/Math.csv")
Charter = pd.read_csv("data/Math-Charter.csv")
Size = pd.read_csv("data/ClassSize.csv")
Survey = pd.read_csv("data/SurveyData.csv")
Demo = pd.read_csv("data/DemographicData.csv")
funding = pd.read_csv("data/fundingdf.csv")
Safety = pd.read_csv("data/School_Safety_Report.csv")

In [54]:
#CLEAN UP CHARTER DATA
#drop irrelevant demographic info and years
Charter = Charter[Charter["Year"] == 2011]
Charterdf = Charter.drop("Demographic", axis = 1)
#set school type to charter because schools in charter.csv were only from charter schools
Charterdf["School Type"] = "Charter"
Charterdf.head()

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean Scale Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type
5,84K355,5,2011,79,696,1,1.3,12,15.2,39,49.4,27,34.2,66,83.5,Charter
11,84K355,6,2011,71,714,0,0.0,2,2.8,14,19.7,55,77.5,69,97.2,Charter
16,84K355,7,2011,68,706,0,0.0,0,0.0,14,20.6,54,79.4,68,100.0,Charter
20,84K355,8,2011,55,718,0,0.0,0,0.0,10,18.2,45,81.8,55,100.0,Charter
27,84K355,All Grades,2011,273,708,1,0.4,14,5.1,77,28.2,181,66.3,258,94.5,Charter


In [55]:
#CLEAN UP MATH DATA
#drop irrelevant demographic info and years
Mathdf = Math.drop("Demographic", axis = 1)
Mathdf =  Mathdf[Mathdf["Year"] == 2011]
#set school type to public because schools in the math.csv were only from public schools
Mathdf["School Type"] = "Public"
Mathdf.head()

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean Scale Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type
5,01M015,3,2011,28,671,10,35.7,13,46.4,5,17.9,0,0.0,5,17.9,Public
12,01M015,4,2011,28,668,3,10.7,14,50.0,9,32.1,2,7.1,11,39.3,Public
19,01M015,5,2011,25,667,5,20.0,8,32.0,12,48.0,0,0.0,12,48.0,Public
27,01M015,All Grades,2011,81,669,18,22.2,35,43.2,26,32.1,2,2.5,28,34.6,Public
34,01M019,3,2011,34,679,2,5.9,23,67.6,7,20.6,2,5.9,9,26.5,Public


In [56]:
#merge math data and charter data
merge1 = Mathdf.append(Charterdf, ignore_index=True)
merge1.rename(columns={'Mean Scale Score':'Mean_Scale_Score'}, inplace=True)
# convert meanscalescores into ints
counter = 0
for i in merge1.iterrows():
    if i[1]["Mean_Scale_Score"] == "s":
        merge1.ix[counter, "Mean_Scale_Score"] = "NaN"
    else:
        merge1.ix[counter, "Mean_Scale_Score"]= int(i[1]["Mean_Scale_Score"])
    counter += 1
type(merge1.iloc[0]["Mean_Scale_Score"])
merge1.head()

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean_Scale_Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type
0,01M015,3,2011,28,671,10,35.7,13,46.4,5,17.9,0,0.0,5,17.9,Public
1,01M015,4,2011,28,668,3,10.7,14,50.0,9,32.1,2,7.1,11,39.3,Public
2,01M015,5,2011,25,667,5,20.0,8,32.0,12,48.0,0,0.0,12,48.0,Public
3,01M015,All Grades,2011,81,669,18,22.2,35,43.2,26,32.1,2,2.5,28,34.6,Public
4,01M019,3,2011,34,679,2,5.9,23,67.6,7,20.6,2,5.9,9,26.5,Public


In [57]:
#CLEAN UP SIZE DATA
#Standardize DBN numbers for class size data
Size["DBN"] = "N/A"
counter = 0
for i in Size.iterrows():
    #concatenate school district and code to form dbn numbers
    newdbn = "0"+str(i[1]["CSD"])+i[1]["SCHOOL CODE"]
    Size.ix[counter, "DBN"]= newdbn
    #take only sencond number of grades with a 0 added to the beginning
    if type(i[1]["GRADE "]) == float:
        grade = "N/A"
    elif len(i[1]["GRADE "]) == 2:
        grade = i[1]["GRADE "][1:]
    Size.ix[counter, "GRADE "] = grade
    if math.isnan((i[1]["AVERAGE CLASS SIZE"])):
        continue
    else:
        Size.ix[counter, "AVERAGE CLASS SIZE"] = float(i[1]["AVERAGE CLASS SIZE"])
    counter += 1

#Only keep relevant class size columns
sizeallcolumns = Size.columns 
sizedropcolumns = []
# print len(sizeallcolumns)
for title in sizeallcolumns:
    if title in ["SCHOOL NAME", "BOROUGH", "GRADE ", 'PROGRAM TYPE', "AVERAGE CLASS SIZE", "DBN"]:
        continue
    else:
        sizedropcolumns.append(title)

#Keep only gen ed samples to standardize classroom type we gather data from
Sizedf = Size.drop(sizedropcolumns, axis = 1)
Sizedf =  Sizedf[Sizedf["PROGRAM TYPE"] == "GEN ED"]

#Modify format of column names so they match others while merging
Sizedf.columns = ['School Name', 'Borough', 'Grade', 'Program Type', 'Average Class Size', 'DBN']
Sizedf.head()

Unnamed: 0,School Name,Borough,Grade,Program Type,Average Class Size,DBN
0,M,P.S. 015 Roberto Clemente,K,GEN ED,19,01M015
2,M,P.S. 015 Roberto Clemente,1,GEN ED,17,01M015
4,M,P.S. 015 Roberto Clemente,2,GEN ED,15,01M015
6,M,P.S. 015 Roberto Clemente,3,GEN ED,12,01M015
8,M,P.S. 015 Roberto Clemente,4,GEN ED,13,01M015


In [58]:
#merge class size data into other merged data
merge2 = merge1.merge(Sizedf, on = ["DBN", "Grade"], how = "left")

In [59]:
#Clean up years for demographic data
Demog =  Demo[Demo["Year"] == "2010-11"]

#Only keep relevant demographic columns
demogallcolumns = Demog.columns 
demogdropcolumns = []
for title in demogallcolumns:
    if title in ["DBN", "School Name", "% Female", "% Male", "% Asian", "% Black", "% Hispanic", "% Other", "% White"]:
        continue
    else:
        demogdropcolumns.append(title)

#drop irrelvant columns
Demogdf = Demog.drop(demogdropcolumns, axis = 1)

#convert percentages into floats without percent sign
for i in xrange(0, len(Demogdf["DBN"])):
    Demogdf.iloc[i]["% Female"] = float(Demogdf.iloc[i]["% Female"][:-1])/100
    Demogdf.iloc[i]["% Male"] = float(Demogdf.iloc[i]["% Male"][:-1])/100
    Demogdf.iloc[i]["% Asian"] = float(Demogdf.iloc[i]["% Asian"][:-1])/100
    Demogdf.iloc[i]["% Black"] = float(Demogdf.iloc[i]["% Black"][:-1])/100
    Demogdf.iloc[i]["% Hispanic"] = float(Demogdf.iloc[i]["% Hispanic"][:-1])/100
    Demogdf.iloc[i]["% Other"] = float(Demogdf.iloc[i]["% Other"][:-1])/100
    Demogdf.iloc[i]["% White"] = float(Demogdf.iloc[i]["% White"][:-1])/100


In [60]:
#merge demographic data with other merged data
merge3 = merge2.merge(Demogdf, on = "DBN", how = "left")
merge3.head()

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean_Scale_Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type,School Name_x,Borough,Program Type,Average Class Size,School Name_y,% Female,% Male,% Asian,% Black,% Hispanic,% Other,% White
0,01M015,3,2011,28,671,10,35.7,13,46.4,5,17.9,0,0.0,5,17.9,Public,M,P.S. 015 Roberto Clemente,GEN ED,12.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02
1,01M015,4,2011,28,668,3,10.7,14,50.0,9,32.1,2,7.1,11,39.3,Public,M,P.S. 015 Roberto Clemente,GEN ED,13.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02
2,01M015,5,2011,25,667,5,20.0,8,32.0,12,48.0,0,0.0,12,48.0,Public,M,P.S. 015 Roberto Clemente,GEN ED,27.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02
3,01M015,All Grades,2011,81,669,18,22.2,35,43.2,26,32.1,2,2.5,28,34.6,Public,,,,,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02
4,01M019,3,2011,34,679,2,5.9,23,67.6,7,20.6,2,5.9,9,26.5,Public,M,P.S. 019 Asher Levy,GEN ED,17.5,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095


In [61]:
#clean safety data
#keep only relevant columns
safetyallcolumns = Safety.columns 
safetydropcolumns = []
for title in safetyallcolumns:
    if title in ["DBN", "AvgOfMajor N", "AvgOfVio N", "AvgOfNoCrim N"]:
        continue
    else:
        safetydropcolumns.append(title)
Safetydf = Safety.drop(safetydropcolumns, axis = 1)
Safetydf.head()

Unnamed: 0,DBN,AvgOfMajor N,AvgOfNoCrim N,AvgOfVio N
0,15K001,0.86,5.55,1.29
1,17K002,,,
2,75K141,,,
3,84K704,,,
4,,0.52,2.49,0.75


In [62]:
#merge safety data with other data
merge4 = merge3.merge(Safetydf, on = "DBN", how = "left")
merge4.head()

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean_Scale_Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type,School Name_x,Borough,Program Type,Average Class Size,School Name_y,% Female,% Male,% Asian,% Black,% Hispanic,% Other,% White,AvgOfMajor N,AvgOfNoCrim N,AvgOfVio N
0,01M015,3,2011,28,671,10,35.7,13,46.4,5,17.9,0,0.0,5,17.9,Public,M,P.S. 015 Roberto Clemente,GEN ED,12.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41
1,01M015,4,2011,28,668,3,10.7,14,50.0,9,32.1,2,7.1,11,39.3,Public,M,P.S. 015 Roberto Clemente,GEN ED,13.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41
2,01M015,5,2011,25,667,5,20.0,8,32.0,12,48.0,0,0.0,12,48.0,Public,M,P.S. 015 Roberto Clemente,GEN ED,27.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41
3,01M015,All Grades,2011,81,669,18,22.2,35,43.2,26,32.1,2,2.5,28,34.6,Public,,,,,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41
4,01M019,3,2011,34,679,2,5.9,23,67.6,7,20.6,2,5.9,9,26.5,Public,M,P.S. 019 Asher Levy,GEN ED,17.5,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,


In [63]:
#CLEAN UP SURVEY DATA
# SurveyWhole = Survey.append(Survey75, ignore_index=True)
allcolumns = Survey.columns 
dropcolumns = []
for title in allcolumns:
    #create array of columns to drop in dropcolumns array
    if title in ["dbn", "schoolname", "eng_p_11", "eng_s_11", "eng_t_11", "aca_p_11", "aca_t_11", "aca_s_11"]:
        continue   
    else:
        dropcolumns.append(title)
#drop irrelevant columns and edit column format so they can merge
Surveydf = Survey.drop(dropcolumns, axis = 1)
Surveydf.rename(columns={'dbn':'DBN'}, inplace=True)
Surveydf.head()


Unnamed: 0,DBN,schoolname,eng_p_11,aca_p_11,eng_t_11,aca_t_11,eng_s_11,aca_s_11
0,01M015,P.S. 015 Roberto Clemente,7.5,7.8,7.6,7.9,,
1,01M019,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
2,01M020,P.S. 020 Anna Silver,8.3,8.6,6.8,7.5,,
3,01M034,P.S. 034 Franklin D. Roosevelt,8.0,8.5,6.8,7.8,6.5,7.4
4,01M063,P.S. 063 William McKinley,8.1,7.9,7.8,8.1,,


In [64]:
merge5 = merge4.merge(Surveydf, on = "DBN", how = "left")
merge5

Unnamed: 0,DBN,Grade,Year,Number Tested,Mean_Scale_Score,Num Level 1,Pct Level 1,Num Level 2,Pct Level 2,Num Level 3,Pct Level 3,Num Level 4,Pct Level 4,Num Level 3 and 4,Pct Level 3 and 4,School Type,School Name_x,Borough,Program Type,Average Class Size,School Name_y,% Female,% Male,% Asian,% Black,% Hispanic,% Other,% White,AvgOfMajor N,AvgOfNoCrim N,AvgOfVio N,schoolname,eng_p_11,aca_p_11,eng_t_11,aca_t_11,eng_s_11,aca_s_11
0,01M015,3,2011,28,671,10,35.7,13,46.4,5,17.9,0,0,5,17.9,Public,M,P.S. 015 Roberto Clemente,GEN ED,12.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41,P.S. 015 Roberto Clemente,7.5,7.8,7.6,7.9,,
1,01M015,4,2011,28,668,3,10.7,14,50,9,32.1,2,7.1,11,39.3,Public,M,P.S. 015 Roberto Clemente,GEN ED,13.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41,P.S. 015 Roberto Clemente,7.5,7.8,7.6,7.9,,
2,01M015,5,2011,25,667,5,20,8,32,12,48,0,0,12,48,Public,M,P.S. 015 Roberto Clemente,GEN ED,27.0,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41,P.S. 015 Roberto Clemente,7.5,7.8,7.6,7.9,,
3,01M015,All Grades,2011,81,669,18,22.2,35,43.2,26,32.1,2,2.5,28,34.6,Public,,,,,P.S. 015 Roberto Clemente,0.443,0.557,0.064,0.369,0.542,0.005,0.02,0.43,1.23,0.41,P.S. 015 Roberto Clemente,7.5,7.8,7.6,7.9,,
4,01M019,3,2011,34,679,2,5.9,23,67.6,7,20.6,2,5.9,9,26.5,Public,M,P.S. 019 Asher Levy,GEN ED,17.5,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
5,01M019,4,2011,39,681,2,5.1,11,28.2,22,56.4,4,10.3,26,66.7,Public,M,P.S. 019 Asher Levy,GEN ED,22.0,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
6,01M019,4,2011,39,681,2,5.1,11,28.2,22,56.4,4,10.3,26,66.7,Public,M,P.S. 019 Asher Levy,GEN ED,24.0,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
7,01M019,5,2011,39,688,0,0,12,30.8,19,48.7,8,20.5,27,69.2,Public,M,P.S. 019 Asher Levy,GEN ED,21.0,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
8,01M019,All Grades,2011,112,683,4,3.6,46,41.1,48,42.9,14,12.5,62,55.4,Public,,,,,P.S. 019 Asher Levy,0.527,0.473,0.146,0.271,0.473,0.015,0.095,,,,P.S. 019 Asher Levy,7.6,7.8,8.9,9.1,,
9,01M020,3,2011,81,682,7,8.6,29,35.8,39,48.1,6,7.4,45,55.6,Public,M,P.S. 020 Anna Silver,GEN ED,19.0,P.S. 020 Anna Silver,0.467,0.533,0.282,0.082,0.598,0.011,0.027,0.33,1.76,0.59,P.S. 020 Anna Silver,8.3,8.6,6.8,7.5,,


In [65]:
#Clean up funding columns
funding.columns = ["DBN", "Funding"]
funding.head()
# for i in funding.iterrows():
#     print i[1]
#     funding = funding.replace("$", "")
#     funding = funding.replace(",", "")
# type(mergedf.funding)

Unnamed: 0,DBN,Funding
0,10X225,3001810
1,14K318,7720906
2,10X226,2722452
3,01M110,2559804
4,05M197,2270648


In [66]:
mergedf = merge5.merge(funding, on = "DBN", how = "left")
mergedf.columns

Index([u'DBN', u'Grade', u'Year', u'Number Tested', u'Mean_Scale_Score', u'Num Level 1', u'Pct Level 1', u'Num Level 2', u'Pct Level 2', u'Num Level 3', u'Pct Level 3', u'Num Level 4', u'Pct Level 4', u'Num Level 3 and 4', u'Pct Level 3 and 4', u'School Type', u'School Name_x', u'Borough', u'Program Type', u'Average Class Size', u'School Name_y', u'% Female', u'% Male', u'% Asian', u'% Black', u'% Hispanic', u'% Other', u'% White', u'AvgOfMajor N', u'AvgOfNoCrim N', u'AvgOfVio N',
       u'schoolname', u'eng_p_11', u'aca_p_11', u'eng_t_11', u'aca_t_11', u'eng_s_11', u'aca_s_11', u'Funding'],
      dtype='object')

In [67]:
array = []
for i in mergedf.iterrows():
    if i[1]["Mean_Scale_Score"] != "NaN":
        array.append(i[1]["Mean_Scale_Score"])
counter = 0
for i in mergedf.iterrows():
    if i[1]["Mean_Scale_Score"] == "NaN":
#         print "hi"
        mergedf.ix[counter, "Mean_Scale_Score"] = np.mean(array)
    counter +=1
        
array_size = []
for i in mergedf.iterrows():
    if ~np.isnan(i[1]["Average Class Size"]):
#         print i
#         break
        array_size.append(i[1]["Average Class Size"])
# print array_size
counter1 = 0
for i in mergedf.iterrows():
#     if i[1]["Average Class Size"] == "NaN"or i[1]["Average Class Size"] == "nan":
    if np.isnan(i[1]["Average Class Size"]):
#         print "hi"
        mergedf.ix[counter1, "Average Class Size"] = np.mean(array_size)
    counter1 +=1

In [68]:
# mergedf =  mergedf[mergedf["Funding"] != 0]
# array_female = []
# counter2 = 0
# for i in mergedf.iterrows():
#     if ~np.isnan(i[1]["% Female"]):
#         array_female.append(i[1]["% Female"])
# print np.mean(array_female)
# for i in mergedf.iterrows():
#     if np.isnan(i[1]["% Female"]):
#         mergedf.ix[counter2, "% Female"] = np.mean(array_female)
#     counter2 +=1
# array_asian = []
# counter3 = 0
# for i in mergedf.iterrows():
#     if ~np.isnan(i[1]["% Asian"]):
# #         print "hi"
#         array_asian.append(i[1]["% Asian"])
# for i in mergedf.iterrows():
#     if np.isnan(i[1]["% Asian"]):
#         mergedf.ix[counter3, "% Asian"] = np.mean(array_asian)
#     counter3 +=1
# array_black = []
# counter4 = 0
# for i in mergedf.iterrows():
#     if ~np.isnan(i[1]["% Black"]):
# #         print "hi"
#         array_black.append(i[1]["% Black"])
# # print np.mean(array_black)
# for i in mergedf.iterrows():
#     if np.isnan(i[1]["% Black"]):
#         mergedf.ix[counter4, "% Black"] = np.mean(array_black)
#     counter4 +=1
# array_his = []
# counter5 = 0
# for i in mergedf.iterrows():
#     if ~np.isnan(i[1]["% Hispanic"]):
# #         print "hi"
#         array_his.append(i[1]["% Hispanic"])
# print np.mean(array_his)
# for i in mergedf.iterrows():
#     if np.isnan(i[1]["% Hispanic"]):
#         mergedf.ix[counter5, "% Hispanic"] = np.mean(array_his)
#     counter5 +=1
# array_crime = []
# counter6 = 0
# for i in mergedf.iterrows():
#     if ~np.isnan(i[1]["AvgOfNoCrim N"]):
# #         print "hi"
#         array_crime.append(i[1]["AvgOfNoCrim N"])
# print np.mean(array_crime)
# for i in mergedf.iterrows():
#     if np.isnan(i[1]["AvgOfNoCrim N"]):
#         mergedf.ix[counter6, "AvgOfNoCrim N"] = np.mean(array_crime)
#     counter6 +=1

0.494727841678
0.445827131258
2.48206225681


In [74]:
mergedf.columns = ['DBN', 'Grade', 'Year', 'Number_Tested', 'Mean_Scale_Score', 'Num_Level1', 'Pct_Level1', 'Num_Level2', 'Pct_Level2', 'Num_Level3', 'Pct_Level3', 'Num_Level4', 'Pct_Level4', 'Num_Level3_and4', 'Pct_Level3_and4', 'School_Type', 'School_Name_x', 'Borough', 'Program_Type', 'Average_Class_Size', 'School_Name_y', 'Female_Percentage', 'Male_Percentage', 'Asian_Percentage', 'Black_Percentage', 'Hispanic_Percentage', 'Other_Percentage', 'White_Percentage', 'Avg_Major_N', 'Avg_No_Crim_N', 'Avg_Vio_N', 'School_Name', 'Eng_p_11', 'Aca_p_11', 'Eng_t_11', 'Aca_t_11', 'Eng_s_11', 'Aca_s_11', 'Funding']
print mergedf.shape

(6041, 39)


In [75]:
mergedf =  mergedf[mergedf["Funding"] != 0]
counter = 0
array_crime = []
for i in x.iterrows():
    if ~np.isnan(i[1]["Avg_No_Crim_N"]) and (i[1]["Avg_No_Crim_N"] != "NaN"):
#         print "hi"
        array_crime.append(i[1]["Avg_No_Crim_N"])
print np.mean(array_crime)
for i in mergedf.iterrows():
    if np.isnan(i[1]["Avg_No_Crim_N"]):
        x.ix[counter, "Avg_No_Crim_N"] = np.mean(array_crime)
    counter +=1
counter1 = 0
array_f = []
for i in x.iterrows():
    if ~np.isnan(i[1]["Female_Percentage"]):
#         print "hi"
        array_f.append(i[1]["Female_Percentage"])
print np.mean(array_f)
for i in mergedf.iterrows():
    if np.isnan(i[1]["Female_Percentage"]):
        x.ix[counter1, "Female_Percentage"] = np.mean(array_f)
    counter1 +=1
counter2 = 0
array_a = []
for i in x.iterrows():
    if ~np.isnan(i[1]["Asian_Percentage"]):
#         print "hi"
        array_a.append(i[1]["Asian_Percentage"])
print np.mean(array_a)
for i in mergedf.iterrows():
    if np.isnan(i[1]["Asian_Percentage"]):
        x.ix[counter2, "Asian_Percentage"] = np.mean(array_a)
    counter2 +=1
counter3 = 0
array_b = []
for i in x.iterrows():
    if ~np.isnan(i[1]["Black_Percentage"]):
#         print "hi"
        array_b.append(i[1]["Black_Percentage"])
print np.mean(array_b)
for i in mergedf.iterrows():
    if np.isnan(i[1]["Black_Percentage"]):
        x.ix[counter3, "Black_Percentage"] = np.mean(array_b)
    counter3 +=1
counter4 = 0
array_h = []
for i in x.iterrows():
    if ~np.isnan(i[1]["Hispanic_Percentage"]):
#         print "hi"
        array_h.append(i[1]["Hispanic_Percentage"])
print np.mean(array_h)
for i in mergedf.iterrows():
    if np.isnan(i[1]["Hispanic_Percentage"]):
        x.ix[counter4, "Hispanic_Percentage"] = np.mean(array_h)
    counter4 +=1

2.49623074278
0.494947557753
0.107307917339
0.312163217492
0.447166457388


In [69]:
# print array_size
# for i in mergedf.iterrows():
#     if i[1]["Mean_Scale_Score"] == "NaN":
# #         print "hi"
# # print np.mean(array)
# counter2 = 0
# for i in mergedf.iterrows():
#     mergedf.ix[counter2, "Funding"] = int(i[1]["Funding"])
#     counter += 1  


In [76]:
print mergedf["Female_Percentage"]
mergedf.to_csv("mergedfupdate_new.csv", index=False)

0       0.443
1       0.443
2       0.443
3       0.443
4       0.527
5       0.527
6       0.527
7       0.527
8       0.527
9       0.467
10      0.467
11      0.467
12      0.467
13      0.467
14      0.494
15      0.494
16      0.494
17      0.494
18      0.494
19      0.494
20      0.494
21      0.494
22      0.494
23      0.494
24      0.494
25      0.494
26      0.425
27      0.425
28      0.425
29      0.425
        ...  
551       NaN
552       NaN
553       NaN
554       NaN
555       NaN
556       NaN
557       NaN
1210      NaN
1211      NaN
1212      NaN
1213      NaN
1218      NaN
1219      NaN
1220      NaN
1221      NaN
1222      NaN
1223      NaN
1224      NaN
1548      NaN
1549      NaN
1550      NaN
1551      NaN
1552      NaN
2312      NaN
2313      NaN
2380      NaN
2381      NaN
3991      NaN
3992      NaN
3993      NaN
Name: Female_Percentage, dtype: object


In [77]:
for i in mergedf["Avg_No_Crim_N"]:
    if i == "NaN":
        print "Hi"

In [78]:
for i in mergedf["Female_Percentage"]:
    if np.isnan(i):
        print "HI"

allcolumns = mergedf.columns 
dropcolumns = []
for title in allcolumns:
    #create array of columns to drop in dropcolumns array
    if title in ["Avg_No_Crim_N", "Female_Percentage", "Asian_Percentage", "Black_Percentage", "Hispanic_Percentage"]:
        continue   
    else:
        dropcolumns.append(title)
#drop irrelevant columns and edit column format so they can merge
x = mergedf.drop(dropcolumns, axis = 1)

x["Female_Percentage"]

HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI
HI


0       0.443
1       0.443
2       0.443
3       0.443
4       0.527
5       0.527
6       0.527
7       0.527
8       0.527
9       0.467
10      0.467
11      0.467
12      0.467
13      0.467
14      0.494
15      0.494
16      0.494
17      0.494
18      0.494
19      0.494
20      0.494
21      0.494
22      0.494
23      0.494
24      0.494
25      0.494
26      0.425
27      0.425
28      0.425
29      0.425
        ...  
551       NaN
552       NaN
553       NaN
554       NaN
555       NaN
556       NaN
557       NaN
1210      NaN
1211      NaN
1212      NaN
1213      NaN
1218      NaN
1219      NaN
1220      NaN
1221      NaN
1222      NaN
1223      NaN
1224      NaN
1548      NaN
1549      NaN
1550      NaN
1551      NaN
1552      NaN
2312      NaN
2313      NaN
2380      NaN
2381      NaN
3991      NaN
3992      NaN
3993      NaN
Name: Female_Percentage, dtype: object