In [1]:
import pandas as pd 
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
#import pylab as plt
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import statsmodels.api as sm
%matplotlib inline

###This notebook focuses on the analytical part of the 311 demographics analysis: multilinear regressions between the socio-demographic attributes of the resident and working populations and the number of 311 calls by type per capita are performed and analyzed at the NTA level

In [35]:
#Load the resident and the working population attributes (the file names indicate NTA-level compilations).
#Both datasets stay in separate frames so that separate regressions can be generated per type of features.
#The two 'Unnamed' columns are dropped from each file right after reading it.
demographics_NTA_NYC_residents = pd.read_csv('demographics_nta_NYC_residents_compiled.csv')
demographics_NTA_NYC_residents = demographics_NTA_NYC_residents.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
demographics_NTA_NYC_workers = pd.read_csv('demographics_nta_NYC_workers_compiled.csv')
demographics_NTA_NYC_workers = demographics_NTA_NYC_workers.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
#Row counts of both frames, printed on one line separated by a space
print("%d %d" % (len(demographics_NTA_NYC_residents), len(demographics_NTA_NYC_workers)))

195 194


In [36]:
#Load the 311 calls by type, normalized by total resident population (NTA-level file),
#and discard the 'Unnamed: 0' column that comes with the CSV
calls_bytype_normalized = pd.read_csv('Call by type with normalization by resident - NTA level.csv')
calls_bytype_normalized = calls_bytype_normalized.drop('Unnamed: 0', axis=1)

In [37]:
#The total population column is not used in the analysis, so it is removed in place.
#NOTE: running this cell a second time raises a KeyError because the column is already gone.
del calls_bytype_normalized['Total Population']

In [38]:
#Combine the calls by type with the demographics into two separate frames,
#one for residents and one for workers. Both are inner joins on 'Neighborhood',
#so only the neighborhoods present on both sides are kept.
callsbytype_attributes_residents = calls_bytype_normalized.merge(
    demographics_NTA_NYC_residents, on='Neighborhood', how='inner')
callsbytype_attributes_workers = calls_bytype_normalized.merge(
    demographics_NTA_NYC_workers, on='Neighborhood', how='inner')
#Row counts of both merged frames, printed on one line separated by a space
print("%d %d" % (len(callsbytype_attributes_residents), len(callsbytype_attributes_workers)))

190 190


In [39]:
#Names of the call-type columns: every column of calls_bytype_normalized except the last one
#(presumably the last column is 'Neighborhood' - TODO confirm against the CSV layout)
types_of_calls=calls_bytype_normalized.columns[:-1]  #types of calls

In [8]:
#Resident features used in the initial Lasso regression.
#That initial regression is used to pick the types of complaints that meet a certain
#threshold; a selective regression by feature type is then performed on those.
#The column names are reproduced exactly as spelled in the data (including their typos).
resident_features = [
    #age
    u'Population under 18',
    u'population between 18 and 34',
    u'population between 35 to 64',
    u'population 65 and over',
    #race
    u'Population white',
    u'population black',
    u'Population asian',
    u'population hispanic',
    u'population other race',
    #type of household
    u'family households',
    u'nonfamily households',
    #education
    u'population education high school',
    u'population education bachelors',
    u'population education masters',
    u'population education phd',
    #owner vs renter
    'owner  occupied units',
    u'renter occupied units',
    #transportation
    u'transportation car',
    u'transportation public',
    'tranportation motorcycle',
    u'Transportation Other means',
    u'number of cars',
    #income
    u'household income form 10 to 40',
    u'household income form 40 to 75',
    u'household income 75 and above',
    #housing values and rents
    u'house value for 20 to 100',
    u'house value for 100 to 500',
    u'house value 500 or more',
    u'rent bewteen 300 and 1000',
    u'rent bewteen 1000 and 2000',
    u'rent 2000 or more',
]

In [9]:
#Groups of resident features used in the second step: one list of column names per feature type.
#The column names are reproduced exactly as spelled in the data (including their typos).
age_residents = [
    u'Population under 18',
    u'population between 18 and 34',
    u'population between 35 to 64',
    u'population 65 and over',
]

race_residents = [
    u'Population white',
    u'population black',
    u'Population asian',
    u'population hispanic',
    u'population other race',
]

typeof_household_residents = [
    u'family households',
    u'nonfamily households',
]

education_residents = [
    u'population education high school',
    u'population education bachelors',
    u'population education masters',
    u'population education phd',
]

ownorrent_residents = [
    'owner  occupied units',
    u'renter occupied units',
]

transportationtype_residents = [
    u'transportation car',
    u'transportation public',
    'tranportation motorcycle',
    'Transportation Other means',
    u'number of cars',
]

income_residents = [
    u'household income form 10 to 40',
    u'household income form 40 to 75',
    u'household income 75 and above',
]

housing_values_residents = [
    u'house value for 20 to 100',
    u'house value for 100 to 500',
    u'house value 500 or more',
    u'rent bewteen 300 and 1000',
    u'rent bewteen 1000 and 2000',
    u'rent 2000 or more',
]

Let us consider different groups of population $g=1,2,…,n$ (based on our demographic indicators) and let:


$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$ 

while $Pc(a,g)$ the number of commuters.
 
Let the unknown (subject to fit) complaining behavior be described by $rc(g,t)$, the average number of complaints of type $t$ per resident of group $g$ within his/her place of residency.

Let also $wc(g,t)$ be the number of complaints of type $t$ per commuter of group $g$.

Then the total observed number of complaints of type $t$ in the area $a$ is:

$$C(a,t)=\sum_{g} Pr(a,g) \ rc(g,t) + \sum_{g} Pc(a,g) \ wc(g,t) \qquad \text{(1)}$$

Then we know $Pr(a,g)$ and $Pc(a,g)$ (those are our regressors), we know the output variable $C(a,t)$ from 311 statistics. We need to fit the $rc(g,t)$, $wc(g,t)$ - slope coefficients of the multivariate linear regression.

This will give us complaining behavior per people of different groups and it will be distinguished by the complaining mode - while at home and while on the way.



###We will proceed as follows:

STEP 1) Lasso regression:

Regressors:  

$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$.
            
$Pc(a,g)$  number of commuters in the location a of group $g$

Coefficients to be fit: $rc(g,t)$ - the average number of complaints of type $t$ per resident of group $g$ within his/her place of residency

STEP 2) predict the number of complaints per capita $wc(g,t)$ from the results of step 1, using equation $(1)$

Using the predicted value of $rc(g,t)$ in each area, we are able to get a prediction of $wc(g,t)$ (from the formula of the observed total calls by type $C(a,t)$)

In [10]:
#First set of regressions: one Lasso per type of call, using all the resident demographic features.
#A call type is only regressed when its normalized calls, summed over the neighborhoods,
#exceed the mean of the total_calls column (initial volume threshold).
#For every retained type the outcome is stored in results[typeof] under the keys
#'best_alpha', 'best_R2', 'sample' and 'normalized calls'.
results = {}
#Loop-invariant threshold, computed once instead of once per call type
calls_threshold = calls_bytype_normalized.total_calls.mean()
#Candidate regularization strengths. The Lasso penalty weight must be non-negative, so the
#search starts at 0 instead of -40 (a negative alpha rewards large coefficients and is not a
#valid Lasso problem). alpha=0 (no penalty) is kept as a candidate, although scikit-learn
#advises against it for numerical reasons and emits a warning when it is fitted.
alpha_grid = range(0, 40)
for typeof in types_of_calls:
    A2 = list(resident_features) + [typeof, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    #Written as "not (sum > threshold)" so that a NaN threshold skips the type, as before
    if not myframe1[typeof].sum() > calls_threshold:
        continue
    results[typeof] = {}
    X = myframe1[resident_features]
    Y = myframe1[typeof]
    #LASSO
    #70/30 split into pre-train and test, then 75/25 split of pre-train into train and validation
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    #Validation score of every candidate alpha.
    #NOTE: np.var centers the residuals, so a constant bias of the predictions does not lower this score.
    R2_store = []
    for alpha in alpha_grid:
        Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
        Lasso.fit(X_train, Y_train)
        R2_OS = 1 - np.var(Lasso.predict(X_val) - Y_val) / np.var(Y_val)
        R2_store.append(R2_OS)
    #First alpha reaching the best validation score
    optim_alpha = alpha_grid[int(np.argmax(R2_store))]
    results[typeof]['best_alpha'] = optim_alpha
    #Refit with the selected alpha and report the score on the untouched test set
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=optim_alpha)
    Lasso.fit(X_train, Y_train)
    R2_OS = 1 - np.var(Lasso.predict(X_test) - Y_test) / np.var(Y_test)
    results[typeof]['best_R2'] = R2_OS
    results[typeof]['sample'] = len(myframe1)
    results[typeof]['normalized calls'] = myframe1[typeof].sum()

  positive)


In [11]:
#Empty summary frame that will receive the regression results:
#one row per regressed call type (numbered from 0) and one column per stored value
summary_columns = ['type','best_alpha','R2','sample_size', 'Normalized calls']
results_frame = pd.DataFrame(index=range(len(results)), columns=summary_columns)

In [12]:
#Copy the results (stored as a dictionary keyed by call type) into the frame.
#The 'type' column is filled first; each row then looks up its own call type in the dictionary.
results_frame['type'] = results.keys()
#(column position in results_frame, key in results[call type])
position_and_key = [(1, 'best_alpha'), (2, 'best_R2'), (3, 'sample'), (4, 'normalized calls')]
for row in results_frame.index:
    fit_summary = results[results_frame.loc[row, 'type']]
    for position, key in position_and_key:
        results_frame.iloc[row, position] = fit_summary[key]

In [13]:
#Sort the frame by the R2 results in descending order (best fit first).
#DataFrame.sort is deprecated (it triggers a warning and was later removed from pandas);
#sort_values is its replacement.
results_frame = results_frame.sort_values('R2', ascending=False)

  from ipykernel import kernelapp as app


In [14]:
#Display the frame of regression results (sorted by R2 in the previous cell);
#this list of types of calls is the one used for the second step
results_frame 

Unnamed: 0,type,best_alpha,R2,sample_size,Normalized calls
34,FLOORING/STAIRS,0,0.6173,187,0.442795
28,Noise - Residential,1,0.607514,190,4.23941
10,Noise,0,0.58789,190,1.12104
7,UNSANITARY CONDITION,1,0.556622,188,1.3218
19,PLUMBING,1,0.552888,188,1.33045
36,PAINT/PLASTER,1,0.550318,188,1.32788
6,HEAT/HOT WATER,1,0.521671,190,2.77999
23,Building/Use,1,0.491954,188,0.626404
29,DOOR/WINDOW,0,0.477424,186,0.612973
2,WATER LEAK,1,0.475575,188,0.612883


In [15]:
#Subset of the results frame that keeps only the types of complaints whose
#R2 is non-negative (R2 >= 0), with the rows renumbered from 0
has_nonnegative_R2 = results_frame['R2'] >= 0
top_R2 = results_frame[has_nonnegative_R2].reset_index(drop=True)

In [16]:
#Display the types of calls kept for the second step (rows of results_frame with R2 >= 0)
top_R2

Unnamed: 0,type,best_alpha,R2,sample_size,Normalized calls
0,FLOORING/STAIRS,0,0.6173,187,0.442795
1,Noise - Residential,1,0.607514,190,4.23941
2,Noise,0,0.58789,190,1.12104
3,UNSANITARY CONDITION,1,0.556622,188,1.3218
4,PLUMBING,1,0.552888,188,1.33045
5,PAINT/PLASTER,1,0.550318,188,1.32788
6,HEAT/HOT WATER,1,0.521671,190,2.77999
7,Building/Use,1,0.491954,188,0.626404
8,DOOR/WINDOW,0,0.477424,186,0.612973
9,WATER LEAK,1,0.475575,188,0.612883


In [17]:
#Second step: re-run the Lasso only for the types of requests selected in top_R2,
#using only one group of features (here: age) and the alpha selected for each type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(age_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[age_residents]
    Y = myframe1[request_type]
    #LASSO
    #Same split sizes and seed as in the first step; only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': age_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)

lasso_coefficients_age = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_age



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,Population under 18,0.0,-8.930901e-08,0.0,-5.278336e-08,3.238429e-07,0.0,0.0,0.0,0.0,...,6.982326e-07,7.075701e-07,0.0,-1.050657e-07,0.0,-1.389717e-09,-4.717656e-07,6.615595e-07,2.499854e-07,-1e-06
1,population 65 and over,-9.945644e-07,0.0,0.0,-1.14183e-07,-3.523755e-07,0.0,0.0,0.0,0.0,...,-7.75477e-07,-7.130643e-07,0.0,-3.413896e-07,0.0,0.0,0.0,-4.312124e-07,-2.029068e-07,-1.4e-05
2,population between 18 and 34,-4.593221e-07,0.0,-7.017873e-08,3.549444e-08,2.485794e-07,-9.156741e-08,-3.562998e-08,-5.689704e-08,-2.176532e-08,...,1.705973e-07,1.889629e-07,1.257509e-07,-1.876113e-07,-1.196145e-07,-7.055597e-08,2.1911e-07,2.459253e-07,0.0,2e-06
3,population between 35 to 64,7.60635e-07,0.0,7.110786e-08,5.020019e-08,-1.416407e-07,0.0,0.0,0.0,0.0,...,-2.042394e-07,-2.352516e-07,-1.014196e-07,2.477537e-07,0.0,0.0,-3.823926e-08,-3.342294e-07,-3.457859e-08,1e-06


In [18]:
#Same procedure as for the other feature groups, here with the race features:
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(race_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[race_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': race_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)

lasso_coefficients_race = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_race



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,Population asian,2.269507e-07,0.0,7.909195e-08,2.684366e-08,-1.015518e-07,0.0,0.0,0.0,0.0,...,-1.814271e-07,-1.726773e-07,-6.729487e-08,-7.034656e-09,0.0,0.0,-2.937698e-08,-1.615191e-07,-6.250053e-08,-2.851088e-06
1,Population white,-9.354442e-08,1.234897e-07,-5.503425e-08,-3.551135e-09,-2.955096e-08,0.0,6.45154e-09,0.0,-1.927393e-08,...,-9.923579e-08,-9.695571e-08,0.0,-1.557958e-08,0.0,0.0,0.0,-7.72676e-08,-4.186788e-08,-1.239572e-06
2,population black,4.499836e-09,-4.598506e-08,-1.944537e-08,-5.857022e-09,4.84536e-08,-5.985186e-09,-1.398974e-09,0.0,1.983291e-08,...,6.084073e-08,1.016157e-07,0.0,4.911372e-09,0.0,-1.54142e-08,-8.396319e-08,7.682067e-08,4.14223e-08,-5.675568e-07
3,population hispanic,6.197665e-08,-7.874524e-08,-2.78138e-08,3.835016e-09,1.160753e-07,-8.147403e-08,-4.127882e-08,-4.2634e-08,-8.694848e-09,...,2.023014e-07,1.750605e-07,2.861835e-08,-5.458773e-08,-9.621329e-08,-4.689902e-08,-1.306119e-07,1.708009e-07,7.709989e-08,1.420569e-08
4,population other race,2.906815e-07,0.0,7.798474e-07,7.072451e-08,1.171117e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.249123e-08,0.0,0.0,0.0,0.0,0.0,3.360614e-06


In [19]:
#Same procedure as for the other feature groups, here with the type of household features:
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(typeof_household_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[typeof_household_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': typeof_household_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)

lasso_coefficients_typeof_household = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_typeof_household



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,family households,5.196505e-07,0.0,8.260876e-08,-1.334774e-08,1.073116e-07,0.0,0.0,0.0,0.0,...,9.487774e-09,9.125947e-09,-3.835316e-08,3.284026e-09,0.0,-2.973226e-08,-5.790965e-07,1.507152e-08,0.0,-5e-06
1,nonfamily households,-4.990497e-07,2.958718e-07,-1.402846e-07,5.741159e-08,3.675429e-08,-3.764344e-08,0.0,-4.485632e-08,-5.573499e-08,...,0.0,0.0,2.896802e-08,-9.181384e-08,-6.085801e-08,-2.546977e-08,4.124144e-07,0.0,0.0,1e-06


In [20]:
#Same procedure as for the other feature groups, here with the education levels:
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(education_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[education_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': education_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)
lasso_coefficients_education = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_education



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,population education bachelors,0.0,2.947763e-07,-4.002136e-08,1.822192e-07,-1.399126e-07,0.0,0.0,0.0,-4.200915e-08,...,-3.085825e-07,-2.997973e-07,0.0,-4.09431e-08,0.0,0.0,1.753912e-07,-2.647742e-07,-1.015426e-07,4e-06
1,population education high school,6.178494e-07,-1.265499e-07,1.332577e-07,-3.184174e-08,2.367085e-07,0.0,0.0,0.0,1.686023e-08,...,2.45445e-08,3.830178e-08,-3.556215e-08,-9.749594e-10,0.0,-1.499675e-08,-5.031984e-07,3.923098e-08,0.0,-5e-06
2,population education masters,-3.805191e-07,0.0,0.0,-2.595743e-07,-3.714614e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.069222e-07,0.0,0.0,0.0,0.0,0.0,-1.1e-05
3,population education phd,0.0,0.0,0.0,4.403008e-08,2.112875e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.18157e-06,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#Same procedure as for the other feature groups, here with the type of resident (owner vs renter):
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(ownorrent_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[ownorrent_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': ownorrent_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)
lasso_coefficients_ownorrent = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_ownorrent



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,owner occupied units,1.544335e-07,0.0,1.183825e-07,-3.840142e-08,-4.606417e-07,6.522705e-08,7.291733e-08,0.0,0.0,...,-9.940485e-07,-9.136723e-07,-1.311642e-07,9.670246e-08,0.0,9.487905e-09,-3.264e-08,-8.486262e-07,-3.653909e-07,-7.981502e-06
1,renter occupied units,-8.116522e-08,2.971257e-08,-8.750511e-08,4.845951e-08,1.996565e-07,-1.367065e-07,-7.111565e-08,-1.05246e-07,-4.032408e-08,...,2.991655e-07,2.812409e-07,4.641035e-08,-1.02705e-07,-1.904112e-07,-8.035305e-08,0.0,2.688616e-07,1.176725e-07,6.761064e-07


In [22]:
#Same procedure as for the other feature groups, here with the transportation type for residents:
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(transportationtype_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[transportationtype_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': transportationtype_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)
lasso_coefficients_transportationtype = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_transportationtype



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,Transportation Other means,-8.843289e-07,3.150275e-07,0.0,1.585566e-07,-5.237917e-07,0.0,0.0,0.0,-6.225841e-09,...,-6.382712e-07,-5.195703e-07,0.0,-4.570725e-08,0.0,0.0,5.639161e-07,-4.93658e-07,-2.521185e-08,-4.706301e-06
1,number of cars,0.0,0.0,0.0,-5.480822e-07,1.603819e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.803727e-07,0.0,0.0,0.0,0.0,0.0,6.195972e-07
2,tranportation motorcycle,0.0,0.0,0.0,-1.131487e-05,-3.716544e-05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.73151e-06,0.0,0.0,0.0,0.0,0.0,0.0
3,transportation car,3.166986e-07,0.0,2.558025e-07,4.151185e-07,-6.231245e-07,9.610404e-08,8.644191e-08,6.335372e-08,8.927997e-09,...,-8.793122e-07,-8.011174e-07,-1.560995e-07,-5.272374e-07,1.027686e-07,1.691207e-08,-3.968668e-07,-7.541658e-07,-2.800789e-07,-9.221666e-06
4,transportation public,2.60334e-07,0.0,-1.640667e-08,1.084019e-08,2.948554e-07,-7.917151e-08,-4.256714e-08,-6.659555e-08,-1.564363e-08,...,2.550711e-07,2.195035e-07,1.382353e-08,-5.108344e-08,-1.333195e-07,-6.000087e-08,-1.033108e-07,2.165067e-07,4.774937e-08,7.472419e-07


In [23]:
#Same procedure as for the other feature groups, here with the income levels:
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(income_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[income_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': income_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)
lasso_coefficients_income = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_income



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,household income 75 and above,-5.40139e-07,3.495196e-07,-3.619368e-09,6.6957e-08,-1.83697e-07,0.0,0.0,0.0,-2.785469e-08,...,-5.629686e-07,-5.210452e-07,0.0,-1.262546e-07,0.0,0.0,1.952299e-07,-4.784784e-07,-2.026457e-07,-1e-06
1,household income form 10 to 40,-7.523954e-07,0.0,-5.701433e-10,2.112986e-07,8.908542e-07,0.0,0.0,0.0,0.0,...,9.330098e-07,8.745939e-07,0.0,-5.793388e-07,0.0,-1.120431e-07,-3.571552e-07,8.117085e-07,3.226801e-07,4e-06
2,household income form 40 to 75,2.151623e-06,0.0,0.0,-2.687325e-07,-4.902549e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.020838e-07,0.0,0.0,0.0,0.0,0.0,-1.1e-05


In [25]:
#Finally, the same procedure with the housing values (house values and rents):
#one Lasso per request type in top_R2, fitted with the alpha selected for that type.
#The coefficients are collected as one (regressor, coefficient, request type) row per feature
#and then pivoted into one row per regressor and one column per request type.
coefficient_columns = ['regressor', 'coefficient', 'request type']
coefficient_frames = []
for i in range(len(top_R2)):
    request_type = top_R2['type'][i]
    A2 = list(housing_values_residents) + [request_type, 'Neighborhood']   #selection of columns
    myframe1 = callsbytype_attributes_residents[A2].dropna()
    X = myframe1[housing_values_residents]
    Y = myframe1[request_type]
    #LASSO
    #Two seeded splits (70/30, then 75/25); only the training part is used here,
    #since no score is reported for these regressions
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=top_R2['best_alpha'][i])
    Lasso.fit(X_train, Y_train)
    #One small frame per request type; the request type is broadcast to every row
    coefficient_frames.append(pd.DataFrame({'regressor': housing_values_residents,
                                            'coefficient': Lasso.coef_,
                                            'request type': request_type},
                                           columns=coefficient_columns))
#Concatenate once at the end instead of growing the frame inside the loop
#(avoids repeated copies and mixing the float coefficients with an empty object-typed frame)
if coefficient_frames:
    lasso_coefficients = pd.concat(coefficient_frames).reset_index(drop=True)
else:
    lasso_coefficients = pd.DataFrame(columns=coefficient_columns)
lasso_coefficients_housing_values = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_housing_values



request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK,total_calls
0,house value 500 or more,0.0,0.0,0.0,-9.342912e-08,-3.369642e-07,0.0,0.0,0.0,0.0,...,-9.399455e-07,-8.915244e-07,-6.400767e-09,5.906147e-08,0.0,0.0,0.0,-8.135118e-07,-3.120906e-07,-6e-06
1,house value for 100 to 500,1.158093e-07,0.0,1.444345e-07,-1.798678e-08,-4.651799e-07,0.0,0.0,0.0,0.0,...,-7.635463e-07,-5.374152e-07,-8.894885e-08,1.794756e-07,0.0,0.0,-8.783264e-08,-5.349025e-07,-1.620469e-07,-1e-05
2,house value for 20 to 100,0.0,0.0,0.0,-1.589844e-08,4.524868e-08,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.252616e-06,0.0,0.0,0.0,0.0,0.0,-2.4e-05
3,rent 2000 or more,-5.99218e-07,6.510951e-07,-6.851876e-08,1.483605e-07,-1.416826e-07,0.0,0.0,0.0,-4.751063e-08,...,-4.180953e-08,0.0,0.0,-9.488525e-08,0.0,0.0,6.615929e-07,0.0,0.0,1e-06
4,rent bewteen 1000 and 2000,6.634657e-07,0.0,0.0,2.244769e-08,2.772824e-07,-3.873686e-09,-2.569397e-09,-1.343204e-08,0.0,...,4.185607e-07,2.872724e-07,0.0,-4.297007e-08,-8.940854e-08,-2.869311e-08,-3.124633e-07,3.094584e-07,1.25649e-07,1e-06
5,rent bewteen 300 and 1000,-7.77412e-07,0.0,-1.625348e-07,2.636541e-08,2.381383e-07,-1.030821e-07,-2.701257e-08,0.0,0.0,...,3.440705e-07,4.846341e-07,4.76583e-08,-1.39688e-07,0.0,-1.070483e-07,0.0,4.070312e-07,1.780477e-07,-1e-06


In [28]:
#Stack the coefficient tables obtained for each type of feature into a single frame,
#so that all of them can be exported together to one CSV
frames = [
    lasso_coefficients_age,
    lasso_coefficients_education,
    lasso_coefficients_housing_values,
    lasso_coefficients_income,
    lasso_coefficients_ownorrent,
    lasso_coefficients_race,
    lasso_coefficients_transportationtype,
    lasso_coefficients_typeof_household,
]
lasso_coefficients_residents = pd.concat(frames)

In [32]:
#Renumber the rows of the concatenated table and leave out the 'total_calls' column.
#errors='ignore' keeps the cell re-runnable: on a second run the column is already gone
#and a plain del would raise a KeyError.
lasso_coefficients_residents = (
    lasso_coefficients_residents
    .reset_index(drop=True)
    .drop('total_calls', axis=1, errors='ignore')
)
lasso_coefficients_residents

request type,regressor,Blocked Driveway,Broken Muni Meter,Building/Use,Consumer Complaint,DOOR/WINDOW,Damaged Tree,Dead Tree,Derelict Vehicle,Derelict Vehicles,...,Overgrown Tree/Branches,PAINT/PLASTER,PLUMBING,Rodent,Sanitation Condition,Sewer,Snow,Taxi Complaint,UNSANITARY CONDITION,WATER LEAK
0,Population under 18,0.0,-8.930901e-08,0.0,-5.278336e-08,3.238429e-07,0.0,0.0,0.0,0.0,...,0.0,6.982326e-07,7.075701e-07,0.0,-1.050657e-07,0.0,-1.389717e-09,-4.717656e-07,6.615595e-07,2.499854e-07
1,population 65 and over,-9.945644e-07,0.0,0.0,-1.14183e-07,-3.523755e-07,0.0,0.0,0.0,0.0,...,0.0,-7.75477e-07,-7.130643e-07,0.0,-3.413896e-07,0.0,0.0,0.0,-4.312124e-07,-2.029068e-07
2,population between 18 and 34,-4.593221e-07,0.0,-7.017873e-08,3.549444e-08,2.485794e-07,-9.156741e-08,-3.562998e-08,-5.689704e-08,-2.176532e-08,...,-7.342568e-08,1.705973e-07,1.889629e-07,1.257509e-07,-1.876113e-07,-1.196145e-07,-7.055597e-08,2.1911e-07,2.459253e-07,0.0
3,population between 35 to 64,7.60635e-07,0.0,7.110786e-08,5.020019e-08,-1.416407e-07,0.0,0.0,0.0,0.0,...,0.0,-2.042394e-07,-2.352516e-07,-1.014196e-07,2.477537e-07,0.0,0.0,-3.823926e-08,-3.342294e-07,-3.457859e-08
4,population education bachelors,0.0,2.947763e-07,-4.002136e-08,1.822192e-07,-1.399126e-07,0.0,0.0,0.0,-4.200915e-08,...,0.0,-3.085825e-07,-2.997973e-07,0.0,-4.09431e-08,0.0,0.0,1.753912e-07,-2.647742e-07,-1.015426e-07
5,population education high school,6.178494e-07,-1.265499e-07,1.332577e-07,-3.184174e-08,2.367085e-07,0.0,0.0,0.0,1.686023e-08,...,0.0,2.45445e-08,3.830178e-08,-3.556215e-08,-9.749594e-10,0.0,-1.499675e-08,-5.031984e-07,3.923098e-08,0.0
6,population education masters,-3.805191e-07,0.0,0.0,-2.595743e-07,-3.714614e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.069222e-07,0.0,0.0,0.0,0.0,0.0
7,population education phd,0.0,0.0,0.0,4.403008e-08,2.112875e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.18157e-06,0.0,0.0,0.0,0.0,0.0
8,house value 500 or more,0.0,0.0,0.0,-9.342912e-08,-3.369642e-07,0.0,0.0,0.0,0.0,...,0.0,-9.399455e-07,-8.915244e-07,-6.400767e-09,5.906147e-08,0.0,0.0,0.0,-8.135118e-07,-3.120906e-07
9,house value for 100 to 500,1.158093e-07,0.0,1.444345e-07,-1.798678e-08,-4.651799e-07,0.0,0.0,0.0,0.0,...,3.170568e-10,-7.635463e-07,-5.374152e-07,-8.894885e-08,1.794756e-07,0.0,0.0,-8.783264e-08,-5.349025e-07,-1.620469e-07


In [33]:
#Export the combined coefficient table to a CSV file. No index argument is passed,
#so pandas' default of also writing the row index as the first column applies.
lasso_coefficients_residents.to_csv('lasso_coefficients_residents_by_feature_type.csv')

In [40]:
#List the columns available in the demographics frame loaded for the working population
demographics_NTA_NYC_workers.columns

Index([u'Neighborhood', u'population between 18 and 34_n',
       u'population between 35 to 64_n', u'population 65 and over_n',
       u'Population white_n', u'population black_n', u'Population asian_n',
       u'population hispanic_n', u'population other _n',
       u'family households_n', u'nonfamily households_n',
       u'population education high school_n',
       u'population education bachelors_n', u'population education masters_n',
       u'population education phd_n', u'household income less than 10_n',
       u'owner  occupied units_n', u'renter occupied units_n',
       u'cars per capita', u'transportation car_n', u'transportation public_n',
       u'tranportation motorcycle_n', u'total workers',
       u'household income form 10 to 40_n',
       u'household income form 40 to 75_n', u'household income 75 and above_n',
       u'house value for 20 to 100_n', u'house value for 100 to 500_n',
       u'house value 500 or more_n', u'rent bewteen 300 and 1000_n',
       u'rent bew

In [41]:
#Features used in the initial lasso regression on the working population.
#That first regression is used to pick the types of complaints that meet a given threshold;
#the selected types are then regressed again, one type of feature at a time.
#NOTE(review): u'rent 2000 or more_n' is used later in housing_values_workers but is not
#part of this list - confirm whether the omission is intentional.
workers_features = [
    #age
    u'population between 18 and 34_n',
    u'population between 35 to 64_n',
    u'population 65 and over_n',
    #race
    u'Population white_n',
    u'population black_n',
    u'Population asian_n',
    u'population hispanic_n',
    u'population other _n',
    #type of household
    u'family households_n',
    u'nonfamily households_n',
    #education
    u'population education high school_n',
    u'population education bachelors_n',
    u'population education masters_n',
    u'population education phd_n',
    u'household income less than 10_n',
    #owners / renters
    u'owner  occupied units_n',
    u'renter occupied units_n',
    #transportation and number of workers
    u'cars per capita',
    u'transportation car_n',
    u'transportation public_n',
    u'tranportation motorcycle_n',
    u'total workers',
    #income
    u'household income form 10 to 40_n',
    u'household income form 40 to 75_n',
    u'household income 75 and above_n',
    #house values and rents
    u'house value for 20 to 100_n',
    u'house value for 100 to 500_n',
    u'house value 500 or more_n',
    u'rent bewteen 300 and 1000_n',
    u'rent bewteen 1000 and 2000_n',
    u'Transportation Other means_n',
]

In [42]:
#Groups of features, one list per type of feature, used in the second step
#(the regressions restricted to a single type of feature)
age_workers = [
    u'population between 18 and 34_n',
    u'population between 35 to 64_n',
    u'population 65 and over_n',
]

race_workers = [
    u'Population white_n',
    u'population black_n',
    u'Population asian_n',
    u'population hispanic_n',
    u'population other _n',
]

typeof_household_workers = [
    u'family households_n',
    u'nonfamily households_n',
]

education_workers = [
    u'population education high school_n',
    u'population education bachelors_n',
    u'population education masters_n',
    u'population education phd_n',
]

ownorrent_workers = [
    'owner  occupied units_n',
    u'renter occupied units_n',
]

transportationtype_workers = [
    u'transportation car_n',
    u'transportation public_n',
    'tranportation motorcycle_n',
    'Transportation Other means_n',
]

#NOTE(review): u'household income less than 10_n' is not part of this group - confirm intent
income_workers = [
    u'household income form 10 to 40_n',
    u'household income form 40 to 75_n',
    u'household income 75 and above_n',
]

housing_values_workers = [
    u'house value for 20 to 100_n',
    u'house value for 100 to 500_n',
    u'house value 500 or more_n',
    u'rent bewteen 300 and 1000_n',
    u'rent bewteen 1000 and 2000_n',
    u'rent 2000 or more_n',
]

In [43]:
#First set of regressions: lasso on all the working-population features, one model per type of call.
#Only the types of calls whose summed normalized calls exceed the mean of total_calls are modelled.
#For each of them alpha is selected on a validation split, and the out-of-sample R2 of the
#selected model is then measured on a separate test split.

#Lasso requires alpha >= 0, so negative penalties are not part of the search.
#alpha = 0 (no penalty) is kept as a candidate; scikit-learn warns about it but still fits.
candidate_alphas = list(range(0, 40))
#The threshold is the same for every type of call, so it is computed once
min_normalized_calls = calls_bytype_normalized.total_calls.mean()

results={}
for typeof in types_of_calls:
    #selection of columns; rows with a missing value in any of them are discarded
    A2 = list(workers_features) + [typeof, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    normalized_calls = myframe1[typeof].sum()
    if not normalized_calls > min_normalized_calls:
        continue
    X = myframe1[workers_features]
    Y = myframe1[typeof]
    #LASSO
    #30% of the rows are held out for testing, then 25% of the remainder for validation
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    R2_store = []
    for alpha in candidate_alphas:
        Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
        Lasso.fit(X_train, Y_train)
        R2_OS = 1 - np.var(Lasso.predict(X_val) - Y_val) / np.var(Y_val)
        R2_store.append(R2_OS)
    #First alpha reaching the best validation R2; NaN scores are ignored
    optim_alpha = candidate_alphas[int(np.nanargmax(R2_store))]
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=optim_alpha)
    Lasso.fit(X_train, Y_train)
    R2_OS = 1 - np.var(Lasso.predict(X_test) - Y_test) / np.var(Y_test)
    results[typeof] = {
        'best_alpha': optim_alpha,
        'best_R2': R2_OS,
        'sample': len(myframe1),
        'normalized calls': normalized_calls,
    }



In [44]:
#Empty frame, with one row per modelled type of call, that will receive the regression results
result_columns = ['type','best_alpha','R2','sample_size', 'Normalized calls']
results_frame = pd.DataFrame(index=range(len(results)), columns=result_columns)

In [45]:
#Pass the results (stored as a dictionary keyed by type of call) into the frame.
#Columns are addressed by name rather than by position, and each one is filled in a single
#assignment so that it gets a proper numeric dtype.
results_frame['type'] = list(results)
column_sources = [('best_alpha', 'best_alpha'),
                  ('R2', 'best_R2'),
                  ('sample_size', 'sample'),
                  ('Normalized calls', 'normalized calls')]
for column, key in column_sources:
    results_frame[column] = [results[typeof][key] for typeof in results_frame['type']]

In [46]:
#Sort the frame by R2 in descending order (types of calls with the highest R2 first).
#sort_values replaces the deprecated DataFrame.sort.
results_frame = results_frame.sort_values('R2', ascending=False)

  from ipykernel import kernelapp as app


In [47]:
#Display the regression results, now ordered by R2
results_frame

Unnamed: 0,type,best_alpha,R2,sample_size,Normalized calls
3,Taxi Complaint,0,0.721118,184,0.406754
12,General Construction/Plumbing,28,0.170042,190,0.600205
25,Noise - Street/Sidewalk,1,0.121152,189,0.953212
10,Noise,4,0.121037,190,1.12104
1,Broken Muni Meter,0,0.119624,177,0.611299
20,Missed Collection (All Materials),1,0.115739,187,0.579051
14,Illegal Parking,2,0.0667202,190,1.78506
36,PAINT/PLASTER,0,0.0504172,188,1.32788
11,Consumer Complaint,29,0.0388694,190,0.41855
9,Noise - Commercial,39,0.0304777,188,0.866048


In [48]:
#Subset of the results frame keeping only the types of complaints whose R2 is not negative,
#with the rows renumbered from 0
has_nonnegative_R2 = results_frame['R2'] >= 0
top_R2 = results_frame.loc[has_nonnegative_R2].reset_index(drop=True)

In [49]:
#Display the selected types of complaints
top_R2

Unnamed: 0,type,best_alpha,R2,sample_size,Normalized calls
0,Taxi Complaint,0,0.721118,184,0.406754
1,General Construction/Plumbing,28,0.170042,190,0.600205
2,Noise - Street/Sidewalk,1,0.121152,189,0.953212
3,Noise,4,0.121037,190,1.12104
4,Broken Muni Meter,0,0.119624,177,0.611299
5,Missed Collection (All Materials),1,0.115739,187,0.579051
6,Illegal Parking,2,0.0667202,190,1.78506
7,PAINT/PLASTER,0,0.0504172,188,1.32788
8,Consumer Complaint,29,0.0388694,190,0.41855
9,Noise - Commercial,39,0.0304777,188,0.866048


In [50]:
#We re-run the lasso only for the selected types of requests (top_R2), using only one
#group of features - in this case age - and keep the resulting coefficient of every feature.
#Each type of request reuses the alpha stored for it in top_R2.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(age_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[age_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(age_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])

#One row per regressor and one column per type of request
lasso_coefficients_age = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_age



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,population 65 and over_n,-4.661598e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.896348e-07,-3.002242e-06,0.0,1.323329e-07
1,population between 18 and 34_n,5.758914e-07,0.0,0.0,0.0,0.0,-1.583225e-08,0.0,0.0,3.734453e-08,3.625531e-07,1.844923e-06,0.0,2.41933e-07
2,population between 35 to 64_n,-2.724306e-07,0.0,-3.535516e-09,0.0,0.0,0.0,2.084211e-07,2.115826e-08,0.0,-1.276029e-07,-3.781701e-07,0.0,-1.085946e-07


In [51]:
#Same procedure with the race features: one lasso per selected type of request (top_R2),
#each with the alpha stored for it in top_R2, keeping the coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(race_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[race_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(race_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])

#One row per regressor and one column per type of request
lasso_coefficients_race = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_race



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,Population asian_n,8.966711e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.965578e-08,-2.107849e-07,-5.390062e-07,0.0,-3.492248e-07
1,Population white_n,-6.041303e-08,0.0,-3.87188e-09,0.0,2.065732e-08,9.52201e-08,2.282614e-07,1.671285e-08,-6.059895e-08,-2.245238e-08,-3.01035e-07,0.0,5.055724e-08
2,population black_n,-4.017931e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.693642e-08,1.007167e-06,0.0,-3.690699e-07
3,population hispanic_n,4.932162e-07,0.0,0.0,0.0,-3.292203e-08,-2.04078e-07,0.0,0.0,1.979619e-07,1.043524e-07,1.190505e-06,1.008695e-10,1.583424e-07
4,population other _n,-2.820217e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.017388e-07,-1.016932e-05,0.0,3.576762e-06


In [52]:
#Same procedure with the type of household features: one lasso per selected type of request
#(top_R2), each with the alpha stored for it in top_R2, keeping the coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(typeof_household_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[typeof_household_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(typeof_household_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])

#One row per regressor and one column per type of request
lasso_coefficients_typeof_household = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_typeof_household



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,family households_n,-1.039274e-07,0.0,-3.090586e-09,0.0,7.442549e-10,-2.839273e-09,1.535925e-07,2.128693e-08,-8.739639e-09,-8.809884e-08,-3.943385e-07,0.0,-2.003541e-07
1,nonfamily households_n,2.851197e-07,0.0,0.0,0.0,0.0,-9.47375e-09,3.059785e-08,0.0,4.525875e-08,1.302194e-07,5.230246e-07,0.0,4.063496e-07


In [53]:
#Same procedure with the education level features: one lasso per selected type of request
#(top_R2), each with the alpha stored for it in top_R2, keeping the coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(education_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[education_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(education_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])
#One row per regressor and one column per type of request
lasso_coefficients_education = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_education



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,population education bachelors_n,1.849841e-07,0.0,0.0,0.0,0.0,0.0,4.53551e-07,0.0,5.338024e-08,6.596528e-08,-2.076011e-06,0.0,3.659935e-07
1,population education high school_n,1.655028e-08,0.0,-8.727685e-09,0.0,0.0,-2.222405e-08,0.0,0.0,0.0,-9.815532e-08,5.47883e-07,0.0,-3.378217e-07
2,population education masters_n,-4.85261e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.583613e-08,-3.581202e-07,0.0,1.566995e-07
3,population education phd_n,4.059454e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.458798e-07,2.291379e-05,0.0,8.306868e-07


In [54]:
#Same procedure with the owner vs renter features: one lasso per selected type of request
#(top_R2), each with the alpha stored for it in top_R2, keeping the coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(ownorrent_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[ownorrent_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(ownorrent_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])
#One row per regressor and one column per type of request
lasso_coefficients_ownorrent = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_ownorrent



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,owner occupied units_n,-4.093215e-07,0.0,-5.302289e-09,0.0,3.28876e-09,1.236892e-07,0.0,0.0,-2.59783e-07,-1.239924e-07,-1.024736e-06,-3.014952e-08,-1.88329e-08
1,renter occupied units_n,3.67009e-07,0.0,0.0,0.0,0.0,-9.180187e-08,1.857491e-07,2.430527e-08,1.856917e-07,8.060506e-08,6.101141e-07,1.898521e-08,1.062766e-07


In [55]:
#Same procedure with the transportation type features for workers: one lasso per selected
#type of request (top_R2), each with the alpha stored for it in top_R2, keeping the
#coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(transportationtype_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[transportationtype_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(transportationtype_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])
#One row per regressor and one column per type of request
lasso_coefficients_transportationtype = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_transportationtype



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,Transportation Other means_n,-1.176489e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.673136e-08,5e-06,0.0,1.268545e-06
1,tranportation motorcycle_n,1.648268e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.663574e-07,-0.00016,0.0,-6.296798e-06
2,transportation car_n,-5.858282e-07,0.0,-6.527892e-09,0.0,0.0,0.0,3.677484e-07,0.0,0.0,-1.829578e-07,-3e-06,0.0,-1.121987e-07
3,transportation public_n,7.839735e-07,0.0,0.0,0.0,0.0,-2.305875e-08,2.038493e-08,0.0,5.649002e-08,2.569611e-07,4e-06,0.0,-6.086816e-08


In [56]:
#Same procedure with the income level features: one lasso per selected type of request
#(top_R2), each with the alpha stored for it in top_R2, keeping the coefficient of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(income_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[income_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(income_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])
#One row per regressor and one column per type of request
lasso_coefficients_income = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_income



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,household income 75 and above_n,5.546774e-08,0.0,-1.890361e-09,0.0,2.670778e-10,1.837817e-08,1.172299e-07,1.966805e-08,0.0,1.032222e-08,-3.90336e-08,0.0,1.335722e-07
1,household income form 10 to 40_n,1.08325e-07,0.0,0.0,0.0,0.0,-4.558843e-08,0.0,0.0,2.849688e-08,1.042261e-07,1.982486e-06,0.0,-1.767491e-07
2,household income form 40 to 75_n,-1.13416e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.401755e-07,-2.297357e-06,0.0,3.583495e-08


In [57]:
#Finally, the same procedure with the housing value features: one lasso per selected type
#of request (top_R2), each with the alpha stored for it in top_R2, keeping the coefficient
#of every feature.
#The coefficients are collected as plain rows and turned into a frame once, after the loop,
#instead of growing a frame with pd.concat on every iteration.
coefficient_rows = []
for request_type, alpha in zip(top_R2['type'], top_R2['best_alpha']):
    #'Neighborhood' is selected too, so rows without a neighborhood are dropped as well
    A2 = list(housing_values_workers) + [request_type, 'Neighborhood']
    myframe1 = callsbytype_attributes_workers[A2].dropna()
    X = myframe1[housing_values_workers]
    Y = myframe1[request_type]
    #LASSO
    #Two-stage split (30% held out, then 25% of the remainder); only the training part is used here
    X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
    X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
    Lasso = linear_model.Lasso(fit_intercept=True, alpha=alpha)
    Lasso.fit(X_train, Y_train)
    coefficient_rows.extend((regressor, coefficient, request_type)
                            for regressor, coefficient in zip(housing_values_workers, Lasso.coef_))

lasso_coefficients = pd.DataFrame(coefficient_rows, columns=['regressor','coefficient', 'request type'])
#One row per regressor and one column per type of request
lasso_coefficients_housing_values = pd.pivot_table(lasso_coefficients, values='coefficient', index=['regressor'], columns=['request type']).reset_index()
lasso_coefficients_housing_values



request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,house value 500 or more_n,5.050165e-07,0.0,0.0,0.0,1.918503e-09,0.0,2.133253e-07,0.0,0.0,-5.324501e-08,-5.706658e-07,0.0,-7.514307e-08
1,house value for 100 to 500_n,-3.571403e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.429524e-07,-8.591541e-07,0.0,-6.720351e-08
2,house value for 20 to 100_n,2.684053e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.262707e-07,7.195676e-06,0.0,1.552833e-07
3,rent 2000 or more_n,-3.38221e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.243054e-07,2.067773e-07,0.0,4.67505e-07
4,rent bewteen 1000 and 2000_n,-7.763327e-08,0.0,-3.549221e-09,0.0,0.0,0.0,0.0,2.137823e-08,0.0,-8.35943e-09,-2.914826e-07,0.0,1.679887e-08
5,rent bewteen 300 and 1000_n,-2.714035e-07,0.0,0.0,0.0,0.0,-2.072947e-08,0.0,0.0,4.903957e-08,6.646187e-08,1.155489e-06,0.0,-3.086435e-08


In [58]:
#Stack the coefficient tables obtained for each type of feature into a single frame,
#so that all of them can be exported together to one CSV
frames = [
    lasso_coefficients_age,
    lasso_coefficients_education,
    lasso_coefficients_housing_values,
    lasso_coefficients_income,
    lasso_coefficients_ownorrent,
    lasso_coefficients_race,
    lasso_coefficients_transportationtype,
    lasso_coefficients_typeof_household,
]
lasso_coefficients_workers = pd.concat(frames)

In [59]:
#Renumber the rows of the concatenated table (each stacked frame brought its own 0-based index)
#and display the result
lasso_coefficients_workers = lasso_coefficients_workers.reset_index(drop=True)
lasso_coefficients_workers

request type,regressor,Broken Muni Meter,Consumer Complaint,GENERAL CONSTRUCTION,General Construction/Plumbing,Illegal Parking,Missed Collection (All Materials),Noise,Noise - Commercial,Noise - Street/Sidewalk,Noise - Vehicle,PAINT/PLASTER,Rodent,Taxi Complaint
0,population 65 and over_n,-4.661598e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.896348e-07,-3.002242e-06,0.0,1.323329e-07
1,population between 18 and 34_n,5.758914e-07,0.0,0.0,0.0,0.0,-1.583225e-08,0.0,0.0,3.734453e-08,3.625531e-07,1.844923e-06,0.0,2.41933e-07
2,population between 35 to 64_n,-2.724306e-07,0.0,-3.535516e-09,0.0,0.0,0.0,2.084211e-07,2.115826e-08,0.0,-1.276029e-07,-3.781701e-07,0.0,-1.085946e-07
3,population education bachelors_n,1.849841e-07,0.0,0.0,0.0,0.0,0.0,4.53551e-07,0.0,5.338024e-08,6.596528e-08,-2.076011e-06,0.0,3.659935e-07
4,population education high school_n,1.655028e-08,0.0,-8.727685e-09,0.0,0.0,-2.222405e-08,0.0,0.0,0.0,-9.815532e-08,5.47883e-07,0.0,-3.378217e-07
5,population education masters_n,-4.85261e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.583613e-08,-3.581202e-07,0.0,1.566995e-07
6,population education phd_n,4.059454e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.458798e-07,2.291379e-05,0.0,8.306868e-07
7,house value 500 or more_n,5.050165e-07,0.0,0.0,0.0,1.918503e-09,0.0,2.133253e-07,0.0,0.0,-5.324501e-08,-5.706658e-07,0.0,-7.514307e-08
8,house value for 100 to 500_n,-3.571403e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.429524e-07,-8.591541e-07,0.0,-6.720351e-08
9,house value for 20 to 100_n,2.684053e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.262707e-07,7.195676e-06,0.0,1.552833e-07


In [60]:
#Export the combined coefficient table to a CSV file. No index argument is passed,
#so pandas' default of also writing the row index as the first column applies.
lasso_coefficients_workers.to_csv('lasso_coefficients_workers_by_feature_type.csv')