In [1]:
import pandas as pd 
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
#import pylab as plt
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import statsmodels.api as sm
%matplotlib inline

###This notebook will focus on the analytical part of the 311 demographics analysis: that is, multilinear regressions between resident and working population socio-demographic attributes and  the number of 311 calls by type per capita, at the census tract level will be performed and analyzed

In [2]:
#Upload the working population attributes, and the resident population attributes at the CT level. Then, merging both datasets
demographics_CT_NYC_residents=pd.read_csv('../datasets/version2/demographics_CT_NYC_residents.csv').drop('Geo_FIPS',axis=1)
demographics_CT_NYC_workers=pd.read_csv('../datasets/version2/demographics_CT_NYC_workers.csv').drop('Geo_FIPS',axis=1)
demographics_CT_NYC=pd.merge(demographics_CT_NYC_residents,demographics_CT_NYC_workers,on='BoroCT2010',how='inner')
print len(demographics_CT_NYC_residents),len(demographics_CT_NYC_workers),len(demographics_CT_NYC)

2167 2149 2149


In [3]:
residents_demographics=demographics_CT_NYC_residents.columns[1:]
workers_demographics=demographics_CT_NYC_workers.columns[1:]

In [4]:
#Upload the 311 calls by type per capita
calls_bytype_normalized=pd.read_csv('../datasets/version2/Call by type with normalization by resident - Census Tract level.csv').drop('Unnamed: 0',axis=1)

In [5]:
#callsbytype_attributes will be a dataframe combining all the information (demographics + calls by type)
callsbytype_attributes=pd.merge(calls_bytype_normalized,demographics_CT_NYC, on='BoroCT2010',how='inner')
print len(callsbytype_attributes), len(calls_bytype_normalized),len(demographics_CT_NYC)

1977 1978 2149


In [6]:
regressors=demographics_CT_NYC.columns[1:]  #demographic indicators
types_of_calls=calls_bytype_normalized.columns[1:]  #types of calls

Lets Consider different groups of population $g=1,2,…,n$ (based on our demographic indicators) and let:


$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$ 

while $Pc(a,g)$ the number of commuters.
 
Let the unknown (subject to fit) complaining behavior be defined by the average number of complains of type $t$ per resident of group $g$ within his/her place of residency be $rc(g,t)$

Let also, $wc(g,t)$ be the number of complains of type $t$ per commuter of type $g$.

Then the total observed number of complains of type $t$ in the area $a$ is:

$$C(a,t)=\sum_{g,t} Pr(a,g) \ rc(g,t) + \sum_{g,t} Pc(a,g) \ wc(g,t) \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \  \text{(1)}$$ 

Then we know $Pr(a,g)$ and $Pc(a,g)$ (those are our regressors), we know the output variable $C(a,t)$ from 311 statistics. We need to fit the $rc(g,t)$, $wc(g,t)$ - slope coefficients of the multivariate linear regression.

This will give us complaining behavior per people of different groups and it will be distinguished by the complaining mode - while at home and while on the way.



###we will procced as follows:

STEP 1) Lasso regression:

Regressors:  

$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$.
            
$Pc(a,g)$  number of commuters in the location a of group $g$

Target variable to be fit: $rc(g,t)$ -   average number of complains of type $t$ per resident of group $g$                                                      within his/her place of residency 

STEP 2) predict the number of complains per capita $wc(g,t)$ from the results of step 1, using equation $(1)$

Using the predicted value $rc(g,t)$ in each area, we are able to get a $wc(g,t)$ prediction (from the formula of the observed total calls by type $C(a,t)$ variable) 

In [7]:
results={}
for typeof in types_of_calls:
    A2=np.append(np.append(regressors,typeof),'BoroCT2010')   #selection of columns
    myframe1=callsbytype_attributes[A2].dropna() 
    if len(myframe1)>100:
        results[typeof]={}
        X=myframe1[regressors]
        Y=myframe1[typeof]
        #LASSO
        X_pre_train, X_test, label_pre_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
        X_train, X_val, Y_train, Y_val = train_test_split(X_pre_train, label_pre_train, test_size=0.25, random_state=1)
        R2_store=[]
        for i in range(-40,40):
            Lasso = linear_model.Lasso(fit_intercept=True,alpha=i)
            Lasso.fit(X_train,Y_train)
            R2_OS = 1 - np.var(Lasso.predict(X_val) - Y_val) / np.var(Y_val)
            R2_store.append(R2_OS)
        optim_alpha=range(-40,40)[np.where(R2_store==np.max(R2_store))[0][0]]
        results[typeof]['best_alpha']=optim_alpha
        Lasso = linear_model.Lasso(fit_intercept=True,alpha=optim_alpha)
        Lasso.fit(X_train,Y_train)
        R2_OS = 1 - np.var(Lasso.predict(X_test) - Y_test) / np.var(Y_test)
        results[typeof]['best_R2']=R2_OS
        results[typeof]['sample']=len(myframe1)

  positive)


In [8]:
results_frame = pd.DataFrame(index=range(0,len(results.keys())), columns=['type','best_alpha','R2','sample_size'])

In [9]:
results_frame['type']=results.keys()
results_frame.head()
for i in results_frame.index:
    typeof=results_frame.loc[i,'type']
    results_frame.iloc[i,1]=results[typeof]['best_alpha']
    results_frame.iloc[i,2]=results[typeof]['best_R2']  
    results_frame.iloc[i,3]=results[typeof]['sample'] 

In [15]:
results_frame.sort('R2',ascending=False,inplace=True)

In [131]:
#save the progress so far
results_frame.to_csv('../outputs/Lasso_results1.csv')

In [16]:
results_frame

Unnamed: 0,type,best_alpha,R2,sample_size
46,Noise,0,0.5399967,1911
26,DOF Parking - Tax Exemption,4,0.449764,199
19,PLUMBING,0,0.4219736,1806
83,DOOR/WINDOW,0,0.4012606,1631
56,HEATING,0,0.3696403,1873
28,Homeless Encampment,1,0.3433813,756
105,PAINT/PLASTER,0,0.3400187,1747
98,Overgrown Tree/Branches,0,0.333279,1793
121,total_calls,39,0.3332756,1977
84,Root/Sewer/Sidewalk Condition,0,0.3324231,1535


Now, lets select the top complains and analyze the regression coefficients

In [106]:
top_R2=results_frame[results_frame.R2>=0.1]

In [107]:
top_list=np.array(top_R2['type'])

In [125]:
typeof=top_list[0]
A2=np.append(np.append(regressors,typeof),'BoroCT2010')   #selection of columns
myframe1=callsbytype_attributes[A2].dropna() 
X=myframe1[regressors]
Y=myframe1[typeof]
#LASSO
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
Lasso = linear_model.Lasso(fit_intercept=True,alpha=0)
Lasso.fit(X_train,Y_train)
R2_OS = 1 - np.var(Lasso.predict(X_test) - Y_test) / np.var(Y_test)
print R2_OS

0.61670057575




In [126]:
lasso_coefficients = pd.DataFrame(index=range(0,len(regressors)), columns=['regressor','coefficient'])

In [127]:
lasso_coefficients['regressor']=regressors
lasso_coefficients['coefficient']=Lasso.coef_

In [128]:
n=lasso_coefficients['coefficient']
thr=np.percentile(n, 90)
lasso_coefficients[np.abs(lasso_coefficients['coefficient'])>thr].sort('coefficient')

Unnamed: 0,regressor,coefficient
104,house value from 20 to 50_n,-0.000173
5,Median Age,-6.4e-05
84,population education phd_n,-6.1e-05
112,rent less than 300_n,-4.3e-05
92,household income from 40 to 45_n,-3.2e-05
125,transportation walk_n,-2.7e-05
63,tranportation motorcycle,-2.5e-05
40,house value less than 20,-2.1e-05
36,household income 200 or more,-1.9e-05
106,house value from 100 to 150_n,-1.2e-05
