In [1]:
import pandas as pd 
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
#import pylab as plt
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import statsmodels.api as sm
from sklearn.cross_validation import KFold
import statsmodels.tools
%matplotlib inline

###This notebook will focus on the analytical part of the 311 demographics analysis: that is, multilinear regressions between resident and working population socio-demographic attributes and  the number of 311 calls by type per capita, at the NTA level will be performed and analyzed

The first step of this process, is to perform linear regression with the demographical features as variables and the types of 311 services requests as dependant variables.

Outcomes and variables of this notebook:

1)  regresions_filtered is an internal dictionary: the types of requests are the keys, and the group of variables which after the cross validation got an R square value greater than 0.2 are elements of each key.

2) outcomes/table_1.csv is a csv fail, containing the type of service request, the group of variables and the R square value of the OLS regression after the cross validation

3)results_phase1 is an internal dictionary. The structure is: results_phase1[type_of_request][group_of_variables] and contains the R square value of each regression as well as a table with the values of the coefficients of the regression associated with each variable under the cathegory (coefficients under the 'mean' column and the standard deviation, obtained from the width of the confidence intervals, unter the std_dv column)

4) outcomes/user_profiles.csv and outcomes/user_profiles_coefficients.csv contain a table each with the type of requests as index and the group of complain as column. The value on each cell represents the variable associated with the highest coefficient on the OLS regression (in the file user_profiles.csv) and the value of this coefficient (in the file user_profiles_coefficients.csv )

5) OLS_regressions is an internal dicionary. It follows the structure OLS_regressions[type_of_request][group_of_variables]['total_var' or 'prediction']. 'total_var' contains the standard deviation of the group of variables (sum of the stand. dev from the  results_phase1 dictionary), while 'prediction' containg a table comparing the observed values and the predicted values for the number of requests for each type of request, following the regression values obtained for each group of variables.

6) At the end of the notebook, avg_predicionts_dict is a dicionary that averages the different predictions for each type of request. avg_predicionts_dict[type_of_request] gives access to a table with the observed and averaged predicted number of requests for each NTA.

7) In order to provide a better way to understand and to communicate the findings of step 6, outputs/predicted_values.csv and
outputs/observed_values.csv are two files that contain the predicted number of requests for each type as predicted by our model, and the observed values. 


In [2]:
#Upload the working population attributes, and the resident population attributes at the CT level.
#We keep both datasets separated in order to generate separate regressions per type of features
demographics_NTA_NYC_residents=pd.read_csv('datasets/demographics_nta_NYC_residents_compiled.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
demographics_NTA_NYC_workers=pd.read_csv('datasets/demographics_nta_NYC_workers_compiled.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
demographics_NTA_NYC=pd.merge(demographics_NTA_NYC_residents,demographics_NTA_NYC_workers,on='Neighborhood',how='inner')
print len(demographics_NTA_NYC_residents),len(demographics_NTA_NYC_workers),len(demographics_NTA_NYC)

195 194 194


In [3]:
#Upload the 311 calls by type, normalized by total resident population
calls_bytype_normalized=pd.read_csv('datasets/Call by type without normalization - NTA level.csv').drop('Unnamed: 0', axis=1)

In [4]:
#demographics_NTA_NYC_workers.columns

In [5]:
#Delete the total population from the dataset since we are not going to use it
#del(calls_bytype_normalized['Total Population'])

In [6]:
#callsbytype_attributes will be a dataframe combining all the information (demographics + calls by type)
callsbytype_attributes=pd.merge(calls_bytype_normalized,demographics_NTA_NYC, on='Neighborhood',how='inner')
print len(callsbytype_attributes), len(calls_bytype_normalized),len(demographics_NTA_NYC)

190 190 194


In [7]:
#callsbytype_attributes will be a dataframe combining all the information (demographics + calls by type)
#we created to separate frames, one for residents and one for workers
callsbytype_attributes_residents=pd.merge(calls_bytype_normalized,demographics_NTA_NYC_residents, on='Neighborhood',how='inner')
callsbytype_attributes_workers=pd.merge(calls_bytype_normalized,demographics_NTA_NYC_workers, on='Neighborhood',how='inner')
print len(callsbytype_attributes_residents), len(callsbytype_attributes_workers)

190 190


In [8]:
types_of_calls=calls_bytype_normalized.columns[:-1][1:]  #types of calls
#calls_bytype_normalized.columns[:-3]

In [9]:
callsbytype_attributes.columns

Index([u'Neighborhood', u'AGENCY', u'APPLIANCE', u'Adopt-A-Basket',
       u'Air Quality', u'Animal Abuse', u'Animal Facility - No Permit',
       u'Animal in a Park', u'Asbestos', u'BEST/Site Safety', 
       ...
       u'household income form 10 to 40_n',
       u'household income form 40 to 75_n', u'household income 75 and above_n',
       u'house value less than 100_n', u'house value for 100 to 500_n',
       u'house value 500 or more_n', u'rent less than 1000_n',
       u'rent bewteen 1000 and 2000_n', u'Transportation Other means_n',
       u'rent 2000 or more_n'],
      dtype='object', length=247)

###We are defining manually the variables, and the groups of variables

In [10]:
regressors=[u'Population under 18', u'population between 18 and 34',
       u'population between 35 to 64', u'population 65 and over',
       u'Population white', u'population black', u'Population asian',
       u'population hispanic', u'population other race',
       u'family households', u'nonfamily households',
       u'population education high school or less', u'population education bachelors',
       u'population education masters', u'population education phd',
        u'owner  occupied units',
       u'renter occupied units', u'transportation car', u'number of cars',
       u'transportation public', u'tranportation motorcycle',
        u'household income less than 40',
       u'household income from 40 to 75', u'household income 75 and above',
       u'house value less than 100', u'house value for 100 to 500',
       u'house value 500 or more', u'rent less than 1000',
       u'rent bewteen 1000 and 2000', u'rent 2000 or more',
       u'Transportation Other means', u'population between 18 and 34_n',
       u'population between 35 to 64_n', u'population 65 and over_n',
       u'Population white_n', u'population black_n', u'Population asian_n',
       u'population hispanic_n', u'population other _n',
       u'family households_n', u'nonfamily households_n',
       u'population education high school and less_n',
       u'population education bachelors_n', u'population education masters_n',
       u'population education phd_n', u'household income less than 10_n',
       u'owner  occupied units_n', u'renter occupied units_n',
       u'transportation car_n', u'transportation public_n',
       u'tranportation motorcycle_n', 
       u'household income less than 10_n',
       u'household income form 10 to 40_n',
       u'household income form 40 to 75_n', u'household income 75 and above_n',
       u'house value less than 100_n', u'house value for 100 to 500_n',
       u'house value 500 or more_n', u'rent less than 1000_n',
       u'rent bewteen 1000 and 2000_n',u'rent 2000 or more_n', u'Transportation Other means_n']

In [11]:
#Defining the selected features for the innitial resident lasso regression
#we will use this innitial regression to determine a selected list of types of complaints 
#that meet a certain threadshold to then perform a selective regression by feature types
resident_features = [u'Population under 18', u'population between 18 and 34',
       u'population between 35 to 64', u'population 65 and over', u'Population white', u'population black', u'Population asian',
       u'population hispanic', u'population other race', u'family households', u'nonfamily households', 
                     u'population education high school or less', 
       u'population education bachelors', u'population education masters', u'population education phd', 'owner  occupied units',  
       u'renter occupied units', u'transportation car', u'transportation public', 'tranportation motorcycle', 
       u'Transportation Other means', u'number of cars', u'household income less than 40', u'household income from 40 to 75', 
       u'household income 75 and above', u'house value less than 100', u'house value for 100 to 500',
       u'house value 500 or more', u'rent less than 1000', u'rent bewteen 1000 and 2000', u'rent 2000 or more']                     

In [12]:
#This is the list of selected feature types for the second step
age_residents=[u'Population under 18', u'population between 18 and 34',
       u'population between 35 to 64', u'population 65 and over']

race_residents=[u'Population white', u'population black', u'Population asian',
       u'population hispanic', u'population other race']

typeof_household_residents=[u'family households', u'nonfamily households']
education_residents= [u'population education high school or less', u'population education bachelors', u'population education masters', 
                      u'population education phd']
ownorrent_residents = ['owner  occupied units',  u'renter occupied units']
transportationtype_residents = [u'transportation car', u'transportation public', 'tranportation motorcycle',
                              'Transportation Other means']
#omit 'number of cars' from the regression 
income_residents = [u'household income less than 40', u'household income from 40 to 75', u'household income 75 and above']
housing_values_residents = [u'house value less than 100', u'house value for 100 to 500',
       u'house value 500 or more']
rent_residents=[u'rent less than 1000',u'rent bewteen 1000 and 2000', u'rent 2000 or more']
variable_groups_residents=['age_residents','race_residents','typeof_household_residents',
                          'education_residents','ownorrent_residents','transportationtype_residents',
                          'income_residents','housing_values_residents','rent_residents']

In [13]:
#This is the list of selected feature types for the second step
age_workers=[u'population between 18 and 34_n',
       u'population between 35 to 64_n', u'population 65 and over_n']

race_workers=[u'Population white_n', u'population black_n', u'Population asian_n',
       u'population hispanic_n', u'population other _n']

typeof_household_workers=[u'family households_n', u'nonfamily households_n']
education_workers= [u'population education high school and less_n', u'population education bachelors_n', u'population education masters_n', 
                      u'population education phd_n']
ownorrent_workers = ['owner  occupied units_n',  u'renter occupied units_n']
transportationtype_workers = [u'transportation car_n', u'transportation public_n', 'tranportation motorcycle_n',
                              'Transportation Other means_n']
income_workers = [u'household income less than 10_n',u'household income form 10 to 40_n', u'household income form 40 to 75_n', u'household income 75 and above_n']
housing_values_workers = [u'house value less than 100_n', u'house value for 100 to 500_n',
       u'house value 500 or more_n']       
rent_workers= [ u'rent less than 1000_n',
       u'rent bewteen 1000 and 2000_n', u'rent 2000 or more_n'] 
variable_groups_workers=['age_workers','race_workers','typeof_household_workers','education_workers','ownorrent_workers',
                        'transportationtype_workers','income_workers','housing_values_workers','rent_workers']

#STEP 1


In this step, we will get the R square values for every regression
then, the estimated average of requests by type is going to be obtained

In [14]:
variable_groups=variable_groups_residents+variable_groups_workers

In [15]:
#FIRST, I WILL RUN OLS OVER EVERY TYPE OF COMPLAIN
ols_coefficients_dict={}
for typeof in types_of_calls:
    ols_coefficients_dict[typeof]={}
    #filter population
    for data_group in variable_groups:
        ols_coefficients_dict[typeof][data_group]={}
        features=eval(data_group)           
        A2=np.append(np.append(features ,typeof),'Neighborhood')   #selection of columns
        myframe1=callsbytype_attributes[A2].fillna(value=0)  
        X=myframe1[features]
        Y=myframe1[typeof]
        ####CROSS-VALIDATION
        kf = KFold(n=len(myframe1), n_folds=10, shuffle=True, random_state=21)
        r2store=[]
        for train_index, test_index in kf:
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            Y_train, Y_test = Y.loc[train_index], Y.loc[test_index]
            ordinary_LS=sm.OLS(Y_train, statsmodels.tools.add_constant(X_train)).fit()
            pred_test=ordinary_LS.predict(statsmodels.tools.add_constant(X_test))
            if np.var(Y_test)==0:
                vv=0.000003
            else:
                vv=np.var(Y_test)            
            R2_OS = (1 - np.var(pred_test - Y_test) / vv)
            r2store.append(R2_OS)
        rsq=np.mean(r2store)
        #ols_coefficients_dict[typeof][data_group]['coefficients']=ordinary_LS.params
        #ols_coefficients_dict[typeof][data_group]['interval']=ordinary_LS.conf_int()
        ols_coefficients_dict[typeof][data_group]['Rsquared']=rsq

###Table 1

In [16]:
#In order to use it later, I will save two csv files: one with the name of the top features and another one with the values of 
#coefficients of those features.
table_r_sq = pd.DataFrame(index=ols_coefficients_dict.keys(), columns=variable_groups)
for typeof in ols_coefficients_dict.keys():
    for groupof in variable_groups:
        rsq=ols_coefficients_dict[typeof][groupof]['Rsquared']
        table_r_sq.loc[typeof,groupof]=rsq

In [17]:
#table_r_sq

In [18]:
#couples of types of requests and group of coefficients with r-sq>0.2
regresions_filtered={}
for typeof in types_of_calls:
    regresions_filtered[typeof]=[]
    for data_group in variable_groups:
        if table_r_sq.loc[typeof,data_group]>0.2:
            regresions_filtered[typeof].append(data_group)
for typeof in types_of_calls:
    if len(regresions_filtered[typeof])==0:
        regresions_filtered.pop(typeof, None)

In [19]:
regresions_filtered

{'APPLIANCE': ['age_residents',
  'race_residents',
  'ownorrent_residents',
  'transportationtype_residents',
  'income_residents',
  'rent_residents'],
 'Air Quality': ['age_residents',
  'race_residents',
  'typeof_household_residents',
  'education_residents',
  'ownorrent_residents',
  'transportationtype_residents',
  'income_residents',
  'housing_values_residents',
  'rent_residents',
  'age_workers',
  'race_workers',
  'typeof_household_workers',
  'education_workers',
  'ownorrent_workers',
  'transportationtype_workers',
  'income_workers',
  'housing_values_workers',
  'rent_workers'],
 'Animal Abuse': ['age_residents',
  'race_residents',
  'typeof_household_residents',
  'education_residents',
  'ownorrent_residents',
  'transportationtype_residents',
  'income_residents',
  'rent_residents'],
 'Asbestos': ['age_residents',
  'typeof_household_residents',
  'education_residents',
  'ownorrent_residents',
  'transportationtype_residents',
  'income_residents',
  'rent_res

In [20]:
table_1 = pd.DataFrame(index=range(0,len(table_r_sq)), columns=['Type of complain','Group of features','R square'])
i=0
for typeof in regresions_filtered.keys():
    for groupof in regresions_filtered[typeof]:
        table_1.loc[i,'Type of complain']=typeof
        table_1.loc[i,'Group of features']=groupof
        table_1.loc[i,'R square']=ols_coefficients_dict[typeof][groupof]['Rsquared']
        i=i+1

In [21]:
table_1.sort('Type of complain',ascending=True,inplace=True)
table_1.to_csv('outputs/table_1.csv')

In [22]:
table_1.head()

Unnamed: 0,Type of complain,Group of features,R square
147,APPLIANCE,age_residents,0.407965
148,APPLIANCE,race_residents,0.6053267
149,APPLIANCE,ownorrent_residents,0.3906785
150,APPLIANCE,transportationtype_residents,0.3367426
151,APPLIANCE,income_residents,0.5218374


Now, we will train OLS over the entire set, for all the types of requests and groups of features from table 1 (that is, those with satisfactory R2)

In [23]:
coefficients_complete_table=pd.DataFrame(index=np.unique(table_1['Type of complain']), columns=regressors)
coefficients_std_complete_table=pd.DataFrame(index=np.unique(table_1['Type of complain']), columns=regressors)
intercepts_complete = pd.DataFrame(index=regresions_filtered.keys(), columns=variable_groups)  #table storing intercepts of each reg
results_phase1={}
for typeof in regresions_filtered.keys():
    results_phase1[typeof]={}
    for data_group in regresions_filtered[typeof]:
        results_phase1[typeof][data_group]={}
        features=eval(data_group)           
        A2=np.append(np.append(features ,typeof),'Neighborhood')   #selection of columns
        myframe1=callsbytype_attributes[A2].fillna(value=0)   
        X=myframe1[features]
        Y=myframe1[typeof]
        ordinary_LS=sm.OLS(Y, statsmodels.tools.add_constant(X)).fit()
        rsq=ordinary_LS.rsquared
        intercepts_complete.loc[typeof,data_group]=ordinary_LS.params['const']
        table=ordinary_LS.conf_int()
        table['std_dv']=(table[1]-table[0])/4
        table['mean']=ordinary_LS.params
        table.drop([0,1],inplace=True,axis=1)
        results_phase1[typeof][data_group]['coefficients']=table
        results_phase1[typeof][data_group]['Rsquared']=rsq
        for feat in features:
            coefficients_complete_table.loc[typeof,feat]=results_phase1[typeof][data_group]['coefficients'].loc[feat,'mean']
            coefficients_std_complete_table.loc[typeof,feat]=results_phase1[typeof][data_group]['coefficients'].loc[feat,'std_dv']

In [24]:
table.loc['const','std_dv']

9.0110188630994372

In [25]:
intercepts_complete.head()

Unnamed: 0,age_residents,race_residents,typeof_household_residents,education_residents,ownorrent_residents,transportationtype_residents,income_residents,housing_values_residents,rent_residents,age_workers,race_workers,typeof_household_workers,education_workers,ownorrent_workers,transportationtype_workers,income_workers,housing_values_workers,rent_workers
Lead,-3.424899,-3.969402,-2.964473,-1.69162,-5.696113,-1.659196,-3.133643,3.534912,0.61056,,,,,,,,,
WATER LEAK,-19.8395,-28.6361,,,75.46134,89.52429,-19.90421,,-21.75723,,,,,,,,,
Bus Stop Shelter Placement,,-0.2899938,-0.263197,-0.2490972,-0.3741665,-0.1293626,-0.2847463,,0.1144777,,,,,,,,,
Found Property,0.2738597,,1.247724,0.699945,,0.1649616,-0.379787,,0.3296866,0.6197514,0.3514211,0.9698874,0.1529951,0.1975857,0.5729787,0.6148652,0.6393154,0.8215352
For Hire Vehicle Complaint,3.197925,2.395344,4.542494,,2.356012,5.633116,2.365539,,4.524034,9.64337,9.008913,10.15124,8.897229,9.084633,9.163976,9.45734,9.798377,9.915632


In [26]:
coefficients_std_complete_table.head()

Unnamed: 0,Population under 18,population between 18 and 34,population between 35 to 64,population 65 and over,Population white,population black,Population asian,population hispanic,population other race,family households,...,household income form 10 to 40_n,household income form 40 to 75_n,household income 75 and above_n,house value less than 100_n,house value for 100 to 500_n,house value 500 or more_n,rent less than 1000_n,rent bewteen 1000 and 2000_n,rent 2000 or more_n,Transportation Other means_n
APPLIANCE,0.0009906228,0.001102199,0.001489496,0.002424391,0.0001925297,0.0002109733,0.0003956132,0.0002295187,0.001592293,,...,,,,,,,,,,
Air Quality,0.000674528,0.0007505017,0.001014217,0.001650799,0.0001858109,0.0002036108,0.0003818072,0.000221509,0.001536726,0.0004830335,...,0.0049308,0.004920118,0.0005768722,0.007316152,0.001009146,0.0005038097,0.002245258,0.00126658,0.0006723099,0.002924976
Animal Abuse,0.0005361694,0.0005965594,0.0008061819,0.001312189,0.0001257625,0.00013781,0.0002584188,0.0001499241,0.001040103,0.0004760381,...,,,,,,,,,,
Asbestos,0.0002149184,0.0002391251,0.0003231503,0.0005259783,,,,,,0.0001523643,...,,,,,,,,,,0.001007014
Bike/Roller/Skate Chronic,6.640952e-05,7.388938e-05,9.985305e-05,0.0001625267,,,,,,4.541859e-05,...,,,,,,,,,,


In [27]:
#coefficients_complete_table.to_csv('outputs/step1_coefficients_complete_table.csv')
#coefficients_std_complete_table.to_csv('outputs/coefficients_std_complete_table.csv')

In [28]:
results_phase1['APPLIANCE']['age_residents']['coefficients']

Unnamed: 0,std_dv,mean
const,7.595736,-4.625217
Population under 18,0.000991,0.005378
population between 18 and 34,0.001102,0.00358
population between 35 to 64,0.001489,-0.000436
population 65 and over,0.002424,-0.004956


###For the visualization tool, the results on dictionary 'results_phase1' can be stored as the user profile in a csv file

In [29]:
user_profile_dict={}
for typeof in results_phase1.keys():
    user_profile_dict[typeof]={}
    for groupof in results_phase1[typeof].keys():
        table=results_phase1[typeof][groupof]['coefficients']
        max_feature=table[table['mean']==np.max(table['mean'])].index[0]
        max_coefficient=table[table['mean']==np.max(table['mean'])]['mean'][0]
        user_profile_dict[typeof][groupof]={max_feature:max_coefficient}

user_profiles = pd.DataFrame(index=results_phase1.keys(), columns=variable_groups)
user_profiles_coef = pd.DataFrame(index=results_phase1.keys(), columns=variable_groups)

for typeof in user_profile_dict.keys():
    for groupof in user_profile_dict[typeof].keys():
        user_feature=user_profile_dict[typeof][groupof].keys()[0]
        user_coeff=user_profile_dict[typeof][groupof].values()[0]
        user_profiles.loc[typeof,groupof]=user_feature
        user_profiles_coef.loc[typeof,groupof]=user_coeff

In [30]:
user_profiles

Unnamed: 0,age_residents,race_residents,typeof_household_residents,education_residents,ownorrent_residents,transportationtype_residents,income_residents,housing_values_residents,rent_residents,age_workers,race_workers,typeof_household_workers,education_workers,ownorrent_workers,transportationtype_workers,income_workers,housing_values_workers,rent_workers
Lead,population between 18 and 34,Population white,nonfamily households,population education masters,owner occupied units,tranportation motorcycle,household income 75 and above,const,const,,,,,,,,,
WATER LEAK,Population under 18,population black,,,const,const,household income less than 40,,rent less than 1000,,,,,,,,,
Bus Stop Shelter Placement,,Population white,nonfamily households,population education masters,owner occupied units,transportation public,household income from 40 to 75,,const,,,,,,,,,
Found Property,const,,const,const,,const,household income 75 and above,,const,const,const,const,const,const,const,const,const,const
For Hire Vehicle Complaint,const,const,const,,const,const,const,,const,const,const,const,const,const,const,const,const,const
New Tree Request,,,,,,const,,,,,,,,,,,,
Unsanitary Pigeon Condition,population 65 and over,,,,owner occupied units,,household income from 40 to 75,,,,,,,,,,,
Consumer Complaint,const,const,const,const,const,const,const,,const,const,const,const,const,const,const,const,const,const
Other Enforcement,const,const,const,population education bachelors,owner occupied units,tranportation motorcycle,const,const,const,,,,const,,const,,,
General Construction/Plumbing,const,,const,const,const,const,const,const,const,,,,,,const,,,


In [31]:
#user_profiles.to_csv('outputs/user_profiles.csv')
#user_profiles_coef.to_csv('outputs/user_profiles_coefficients.csv')

The coefficients obtained from the regression follow the formula:

$$S_q (A,t) = \sum_{k=1}^{s_q} P_q (A,k) x_{q,k,t} + \sum_{k=1}^{s_q} C_q (A,k) y_{q,k,t}$$

where $q$ represents each group of variables and $k$ is each variable, $x_{q,k,t}$ and $y_{q,k,t}$ are the coefficients obtained from the regressions.

The next step, consists on running the OLS model for the variables with sufficient R square values using the entire set as training and getting a prediction $\hat{S_q (A,t) }$ that is, a prediction for the total number of service requests of a certain type for each area based on the cathegory $q$


In [32]:
OLS_regressions={}
for typeof in regresions_filtered.keys():
    OLS_regressions[typeof]={}
    for data_group in regresions_filtered[typeof]:
        OLS_regressions[typeof][data_group]={}
        features=eval(data_group)           
        A2=np.append(np.append(features ,typeof),'Neighborhood')   #selection of columns
        myframe1=callsbytype_attributes[A2].fillna(value=0)  
        X=myframe1[features]
        Y=myframe1[typeof]
        ordinary_LS=sm.OLS(Y, statsmodels.tools.add_constant(X)).fit()
        predicted_vals=ordinary_LS.predict( statsmodels.tools.add_constant(X))
        predictions_frame= pd.DataFrame(index=range(0,len(X)), columns=['NTA','observed_req','predicted_req'])
        predictions_frame['NTA']=myframe1['Neighborhood']
        predictions_frame['observed_req']=Y
        predictions_frame['predicted_req']=list(predicted_vals)
        OLS_regressions[typeof][data_group]['prediction']=predictions_frame
        total_var=np.sum(results_phase1[typeof][data_group]['coefficients'].std_dv)
        OLS_regressions[typeof][data_group]['total_var']=total_var

In [33]:
OLS_regressions['APPLIANCE']['age_residents']['total_var']

7.601743137972074

In [34]:
OLS_regressions['APPLIANCE']['age_residents']['prediction'].head()

Unnamed: 0,NTA,observed_req,predicted_req
0,Allerton-Pelham Gardens,13,26.2158
1,Annadale-Huguenot-Prince's Bay-Eltingville,4,22.272673
2,Arden Heights,3,22.542182
3,Astoria,51,87.228749
4,Auburndale,6,11.439694


####NEXT STEP: average those estimates over all suitable q

that follows the formula: $$\hat{S} (A,t) = \frac{\sum_{q} S_q (A,t) \sigma _{q,t} ^{-2}}{\sum_{q} \sigma _{q,t} ^{-2}}$$

In [35]:
avg_predictions_frame= pd.DataFrame(index=range(0,len(X)), columns=['NTA','observed_req','predicted_req'])

In [36]:
avg_predicionts_dict={}
for typeof in regresions_filtered.keys():
    avg_predicionts_dict[typeof]={}
    vars_list=[]
    avg_predictions_frame= pd.DataFrame(index=range(0,len(X)), columns=['NTA','observed_req','predicted_req'])
    temporal_frame= pd.DataFrame(index=range(0,len(OLS_regressions[typeof][OLS_regressions[typeof].keys()[0]]['prediction'])))
    temporal_frame['NTA']=OLS_regressions[typeof][OLS_regressions[typeof].keys()[0]]['prediction'].NTA
    avg_predictions_frame['NTA']=temporal_frame['NTA']
    avg_predictions_frame['observed_req']=OLS_regressions[typeof][OLS_regressions[typeof].keys()[0]]['prediction'].observed_req
    for data_group in regresions_filtered[typeof]:
        my_var=OLS_regressions[typeof][data_group]['total_var']
        if my_var==0:
            std_e=0.00001
        else:
            std_e=(my_var**-2)
        vars_list.append(std_e)
        predicted_weighted=(std_e)*OLS_regressions[typeof][data_group]['prediction'].predicted_req
        temporal_frame[data_group]=predicted_weighted
    avg_predictions_frame['predicted_req']=(temporal_frame.sum(axis=1))/np.sum(vars_list)
    avg_predicionts_dict[typeof]=avg_predictions_frame
#predicted_av=predicted_weighted/np.sum(vars_list)
        

In [37]:
observed_table=pd.DataFrame(index=avg_predicionts_dict.keys(),columns=callsbytype_attributes['Neighborhood'])
predicted_table=pd.DataFrame(index=avg_predicionts_dict.keys(),columns=callsbytype_attributes['Neighborhood'])
for typeof in avg_predicionts_dict.keys():
    table=avg_predicionts_dict[typeof]
    for nta in callsbytype_attributes['Neighborhood']:
        observed_table.loc[typeof,nta]=table[table.NTA==nta].observed_req.sum()
        predicted_table.loc[typeof,nta]=table[table.NTA==nta].predicted_req.sum()
predicted_table.head()

Neighborhood,Allerton-Pelham Gardens,Annadale-Huguenot-Prince's Bay-Eltingville,Arden Heights,Astoria,Auburndale,Baisley Park,Bath Beach,Battery Park City-Lower Manhattan,Bay Ridge,Bayside-Bayside Hills,...,Williamsbridge-Olinville,Williamsburg,Windsor Terrace,Woodhaven,Woodlawn-Wakefield,Woodside,Yorkville,park-cemetery-etc-Bronx,park-cemetery-etc-Manhattan,park-cemetery-etc-Queens
Lead,5.475991,6.842618,4.953731,25.09426,4.912213,4.805106,7.120284,19.38802,26.10169,12.53872,...,9.760725,3.017439,7.200958,10.13378,7.956403,10.07822,39.36168,-1.384721,0.1956318,-1.612291
WATER LEAK,69.61976,14.78282,9.577409,355.7771,21.70737,94.4076,90.79288,34.97598,279.0632,41.98633,...,276.839,120.499,57.18773,168.5001,148.4301,155.6083,185.2659,11.89378,59.3862,3.65829
Bus Stop Shelter Placement,0.5977712,0.6180732,0.5372178,2.679893,0.3816934,0.6971881,0.7012735,1.779416,2.695809,1.041111,...,1.418393,0.5261218,0.6786361,1.198348,1.056407,1.080223,3.609504,-0.14962,0.0770828,-0.1835897
Found Property,0.7060287,0.1434934,0.3980001,1.838258,0.3967647,0.408955,0.5590149,11.84234,1.173234,0.3158689,...,0.7877428,1.696356,0.8919431,0.6508016,0.6472989,1.135871,3.96747,0.5766002,0.9781845,0.6356916
For Hire Vehicle Complaint,10.0214,8.533202,8.930713,13.65801,9.000638,9.374346,9.685883,38.22354,12.67439,9.424493,...,11.06937,12.3641,10.14701,10.77331,10.14482,11.61761,17.68093,9.086463,10.29408,9.386829


In [38]:
predicted_table.to_csv('outputs/predicted_values.csv')
observed_table.to_csv('outputs/observed_values.csv')

#Finally, I will get the coefficients table with intercepts

In [39]:
coefficients_complete_table['Intercept']=0
coefficients_complete_table.head()

Unnamed: 0,Population under 18,population between 18 and 34,population between 35 to 64,population 65 and over,Population white,population black,Population asian,population hispanic,population other race,family households,...,household income form 40 to 75_n,household income 75 and above_n,house value less than 100_n,house value for 100 to 500_n,house value 500 or more_n,rent less than 1000_n,rent bewteen 1000 and 2000_n,rent 2000 or more_n,Transportation Other means_n,Intercept
APPLIANCE,0.00537846,0.003580274,-0.0004355563,-0.00495601,0.000496655,0.002572719,-0.0006020764,0.002748059,-0.001625249,,...,,,,,,,,,,0
Air Quality,-0.004851218,0.006096357,-0.00101451,0.004700051,0.002004975,0.0002255966,-0.0003220464,0.0006063961,0.002683214,-0.00183576,...,0.005465458,-0.001398754,0.03283055,-0.004601778,0.000778048,0.008879626,-0.004033382,-1.187211e-05,-0.01600852,0
Animal Abuse,0.001266405,0.00167845,0.000924979,-0.001756798,0.00065196,0.001153309,4.540494e-05,0.001206872,0.002207494,0.003634009,...,,,,,,,,,,0
Asbestos,-0.001008988,0.001128073,0.0002755111,0.0007973101,,,,,,-5.539743e-05,...,,,,,,,,,-0.003415106,0
Bike/Roller/Skate Chronic,-0.0004258035,0.0005948403,-0.0002761972,0.0006453607,,,,,,-0.0003277945,...,,,,,,,,,,0


In [40]:

intercepts_complete.head()

Unnamed: 0,age_residents,race_residents,typeof_household_residents,education_residents,ownorrent_residents,transportationtype_residents,income_residents,housing_values_residents,rent_residents,age_workers,race_workers,typeof_household_workers,education_workers,ownorrent_workers,transportationtype_workers,income_workers,housing_values_workers,rent_workers
Lead,-3.424899,-3.969402,-2.964473,-1.69162,-5.696113,-1.659196,-3.133643,3.534912,0.61056,,,,,,,,,
WATER LEAK,-19.8395,-28.6361,,,75.46134,89.52429,-19.90421,,-21.75723,,,,,,,,,
Bus Stop Shelter Placement,,-0.2899938,-0.263197,-0.2490972,-0.3741665,-0.1293626,-0.2847463,,0.1144777,,,,,,,,,
Found Property,0.2738597,,1.247724,0.699945,,0.1649616,-0.379787,,0.3296866,0.6197514,0.3514211,0.9698874,0.1529951,0.1975857,0.5729787,0.6148652,0.6393154,0.8215352
For Hire Vehicle Complaint,3.197925,2.395344,4.542494,,2.356012,5.633116,2.365539,,4.524034,9.64337,9.008913,10.15124,8.897229,9.084633,9.163976,9.45734,9.798377,9.915632


In [41]:
for typeof in intercepts_complete.index:
    std_dv_array=[]
    sum_acc=0
    for groupof in intercepts_complete.columns:
        if (groupof in results_phase1[typeof].keys())==True:
            const_row=results_phase1[typeof][groupof]['coefficients'].loc['const']
            mn=const_row['mean']
            std_dv=const_row['std_dv']
            std_dv_array.append(std_dv**(-2))
            sum_acc=sum_acc + mn*(std_dv**(-2))
    myresult=sum_acc/np.sum(std_dv_array)
    coefficients_complete_table.loc[typeof,'Intercept']=myresult

In [42]:
coefficients_complete_table

Unnamed: 0,Population under 18,population between 18 and 34,population between 35 to 64,population 65 and over,Population white,population black,Population asian,population hispanic,population other race,family households,...,household income form 40 to 75_n,household income 75 and above_n,house value less than 100_n,house value for 100 to 500_n,house value 500 or more_n,rent less than 1000_n,rent bewteen 1000 and 2000_n,rent 2000 or more_n,Transportation Other means_n,Intercept
APPLIANCE,0.00537846,0.003580274,-0.0004355563,-0.00495601,0.000496655,0.002572719,-0.0006020764,0.002748059,-0.001625249,,...,,,,,,,,,,3.863394
Air Quality,-0.004851218,0.006096357,-0.00101451,0.004700051,0.002004975,0.0002255966,-0.0003220464,0.0006063961,0.002683214,-0.00183576,...,0.005465458,-0.001398754,0.03283055,-0.004601778,0.000778048,0.008879626,-0.004033382,-1.187211e-05,-0.01600852,18.921221
Animal Abuse,0.001266405,0.00167845,0.000924979,-0.001756798,0.00065196,0.001153309,4.540494e-05,0.001206872,0.002207494,0.003634009,...,,,,,,,,,,13.490711
Asbestos,-0.001008988,0.001128073,0.0002755111,0.0007973101,,,,,,-5.539743e-05,...,,,,,,,,,-0.003415106,3.362554
Bike/Roller/Skate Chronic,-0.0004258035,0.0005948403,-0.0002761972,0.0006453607,,,,,,-0.0003277945,...,,,,,,,,,,0.681230
Blocked Driveway,0.01556686,-0.02428145,0.05693827,-0.07400646,0.002175886,0.00937395,0.02236714,0.007626867,0.04099607,0.06697274,...,,,,,,,,,,26.449722
Boilers,0.0001480229,0.0008739305,-0.0001273841,9.752724e-05,0.0002416136,0.00025472,-0.0001121938,0.0004192862,0.000200872,0.0005212926,...,,,,,,,,,,-0.572571
Broken Muni Meter,,,,,,,,,,,...,-0.01630243,-0.002019203,0.2214947,-0.03158586,0.00570579,,,,,70.426755
Broken Parking Meter,,,,,,,,,,,...,,,,,,,,,,-0.323738
Building/Use,,,,,,,,,,,...,,,,,,,,,,-9.635956


In [43]:
coefficients_complete_table.to_csv('outputs/step1_coefficients_complete_table.csv')
coefficients_std_complete_table.to_csv('outputs/coefficients_std_complete_table.csv')