In [1]:
import pandas as pd 
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
#import pylab as plt
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import statsmodels.api as sm
%matplotlib inline

###This notebook focuses on the analytical part of the 311 demographics analysis: multilinear regressions between the socio-demographic attributes of the resident and working populations and the number of 311 calls per capita by type, at the NTA level, are performed and analyzed.

In [2]:
#Upload the working population attributes, and the resident population attributes at the CT level.
#We keep both datasets separated in order to generate separate regressions per type of features
demographics_NTA_NYC_residents=pd.read_csv('demographics_nta_NYC_residents_compiled.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
demographics_NTA_NYC_workers=pd.read_csv('demographics_nta_NYC_workers_compiled.csv').drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
demographics_NTA_NYC=pd.merge(demographics_NTA_NYC_residents,demographics_NTA_NYC_workers,on='Neighborhood',how='inner')
print len(demographics_NTA_NYC_residents),len(demographics_NTA_NYC_workers),len(demographics_NTA_NYC)

195 194 194


In [3]:
# 311 calls by type, normalized by total resident population (NTA level).
# The 'Unnamed: 0' column written by an earlier to_csv is discarded.
calls_bytype_normalized = pd.read_csv('Call by type with normalization by resident - NTA level.csv')
calls_bytype_normalized = calls_bytype_normalized.drop('Unnamed: 0', axis=1)

In [4]:
# Drop the total population column from the dataset since it is not used below.
# The membership check makes the cell safe to re-run: the original `del` raised
# KeyError the second time it was executed.
if 'Total Population' in calls_bytype_normalized.columns:
    calls_bytype_normalized = calls_bytype_normalized.drop('Total Population', axis=1)

In [5]:
# Raw (non-normalized) 311 calls by type, plus the worker demographics table
# that supplies the total worker population used for normalization below.
calls_bytype_nonnormalized = pd.read_csv('Call by type without normalization - NTA level.csv')
calls_bytype_nonnormalized = calls_bytype_nonnormalized.drop('Unnamed: 0', axis=1)
workers = pd.read_csv('demographics_NTA_NYC_workers.csv')

In [6]:
# Keep only the join key and the worker head-count needed for normalization.
worker_columns = ['Neighborhood', 'total workers']
workers = workers[worker_columns]

In [7]:
# Attach the worker head-count to the raw call counts (inner join on 'Neighborhood').
# Guarded so that re-running the cell does not merge a second time, which would
# produce 'total workers_x' / 'total workers_y' and break the normalization below.
if 'total workers' not in calls_bytype_nonnormalized.columns:
    calls_bytype_nonnormalized = pd.merge(calls_bytype_nonnormalized, workers, on='Neighborhood')

In [8]:
# Calls by type per worker: every column after the first is cast to float and
# divided row-wise by 'total workers'. The 'total workers' column is part of
# that selection as well (it is divided by itself and removed further down).
# NOTE(review): assumes the first column is the non-numeric 'Neighborhood' key -- confirm.
count_columns = calls_bytype_nonnormalized.columns[1:]
total_workers = calls_bytype_nonnormalized['total workers'].astype(float)
calls_bytype_normalized_workers = calls_bytype_nonnormalized[count_columns].astype(float).div(total_workers, axis=0)

In [9]:
# Put the join key back: it was left out of the numeric division above.
calls_bytype_normalized_workers = calls_bytype_normalized_workers.assign(
    Neighborhood=calls_bytype_nonnormalized['Neighborhood'])

In [10]:
# Drop the worker head-count column from the normalized table since it is not
# used below. The membership check makes the cell safe to re-run: the original
# `del` raised KeyError the second time it was executed.
if 'total workers' in calls_bytype_normalized_workers.columns:
    calls_bytype_normalized_workers = calls_bytype_normalized_workers.drop('total workers', axis=1)

In [33]:
#callsbytype_attributes will be a dataframe combining all the information (demographics + calls by type)
#we created to separate frames, one for residents and one for workers
callsbytype_attributes_residents=pd.merge(calls_bytype_normalized,demographics_NTA_NYC_residents, on='Neighborhood',how='inner')
callsbytype_attributes_workers=pd.merge(calls_bytype_normalized_workers,demographics_NTA_NYC_workers, on='Neighborhood',how='inner')
print len(callsbytype_attributes_residents), len(callsbytype_attributes_workers)

190 190


In [29]:
# Complaint-type columns: every column of the resident-normalized call table
# except the last two.
# NOTE(review): this positional slice assumes the two trailing columns are not
# complaint types (presumably 'Neighborhood' plus one other) -- confirm against
# the CSV. The commented-out [:-3] variant below is presumably from before
# 'Total Population' was dropped.
types_of_calls=calls_bytype_normalized.columns[:-2]  #types of calls
#calls_bytype_normalized.columns[:-3]

In [21]:
# Full list of regressors: resident attributes first, then the worker
# attributes (suffix '_n'). The spellings are column names and must match the
# data files exactly, typos and double spaces included.
regressors = [
    # --- residents: age
    u'Population under 18', u'population between 18 and 34',
    u'population between 35 to 64', u'population 65 and over',
    # --- residents: race
    u'Population white', u'population black', u'Population asian',
    u'population hispanic', u'population other race',
    # --- residents: household type
    u'family households', u'nonfamily households',
    # --- residents: education
    u'population education high school', u'population education bachelors',
    u'population education masters', u'population education phd',
    # --- residents: tenure, transportation
    u'owner  occupied units', u'renter occupied units',
    u'transportation car', u'number of cars',
    u'transportation public', u'tranportation motorcycle',
    # --- residents: income, house value, rent
    u'household income form 10 to 40', u'household income form 40 to 75',
    u'household income 75 and above',
    u'house value for 20 to 100', u'house value for 100 to 500',
    u'house value 500 or more',
    u'rent bewteen 300 and 1000', u'rent bewteen 1000 and 2000',
    u'rent 2000 or more',
    u'Transportation Other means',
    # --- workers: age
    u'population between 18 and 34_n', u'population between 35 to 64_n',
    u'population 65 and over_n',
    # --- workers: race
    u'Population white_n', u'population black_n', u'Population asian_n',
    u'population hispanic_n', u'population other _n',
    # --- workers: household type
    u'family households_n', u'nonfamily households_n',
    # --- workers: education
    u'population education high school_n', u'population education bachelors_n',
    u'population education masters_n', u'population education phd_n',
    # --- workers: income (lowest bracket), tenure, transportation
    u'household income less than 10_n',
    u'owner  occupied units_n', u'renter occupied units_n',
    u'transportation car_n', u'transportation public_n',
    u'tranportation motorcycle_n',
    # --- workers: income, house value, rent
    u'household income form 10 to 40_n', u'household income form 40 to 75_n',
    u'household income 75 and above_n',
    u'house value for 20 to 100_n', u'house value for 100 to 500_n',
    u'house value 500 or more_n',
    u'rent bewteen 300 and 1000_n', u'rent bewteen 1000 and 2000_n',
    u'Transportation Other means_n',
]

In [22]:
# Features for the initial resident OLS regression. That first pass decides
# which complaint types have enough complete data; the selective regressions
# by feature type are then run on those types only.
resident_features = [
    # age
    u'Population under 18', u'population between 18 and 34',
    u'population between 35 to 64', u'population 65 and over',
    # race
    u'Population white', u'population black', u'Population asian',
    u'population hispanic', u'population other race',
    # household type
    u'family households', u'nonfamily households',
    # education
    u'population education high school', u'population education bachelors',
    u'population education masters', u'population education phd',
    # tenure
    'owner  occupied units', u'renter occupied units',
    # transportation
    u'transportation car', u'transportation public', 'tranportation motorcycle',
    u'Transportation Other means', u'number of cars',
    # income
    u'household income form 10 to 40', u'household income form 40 to 75',
    u'household income 75 and above',
    # house value
    u'house value for 20 to 100', u'house value for 100 to 500',
    u'house value 500 or more',
    # rent
    u'rent bewteen 300 and 1000', u'rent bewteen 1000 and 2000',
    u'rent 2000 or more',
]

In [49]:
# Features for the initial workers OLS regression (worker attributes carry the
# '_n' suffix). That first pass decides which complaint types have enough
# complete data; the selective regressions by feature type follow.
workers_features = [
    # age
    u'population between 18 and 34_n', u'population between 35 to 64_n',
    u'population 65 and over_n',
    # race
    u'Population white_n', u'population black_n', u'Population asian_n',
    u'population hispanic_n', u'population other _n',
    # household type
    u'family households_n', u'nonfamily households_n',
    # education
    u'population education high school_n', u'population education bachelors_n',
    u'population education masters_n', u'population education phd_n',
    # tenure
    'owner  occupied units_n', u'renter occupied units_n',
    # transportation
    u'transportation car_n', u'transportation public_n',
    u'tranportation motorcycle_n', u'Transportation Other means_n',
    # income
    u'household income form 10 to 40_n', u'household income form 40 to 75_n',
    u'household income 75 and above_n',
    # house value
    u'house value for 20 to 100_n', u'house value for 100 to 500_n',
    u'house value 500 or more_n',
    # rent
    u'rent bewteen 300 and 1000_n', u'rent bewteen 1000 and 2000_n',
    u'rent 2000 or more_n',
]

In [23]:
# Resident features split by type for the second step: one regression is run
# per group. variable_groups_residents lists the *names* of the group
# variables; the regression cells look the lists up by name.
age_residents = [
    u'Population under 18',
    u'population between 18 and 34',
    u'population between 35 to 64',
    u'population 65 and over',
]
race_residents = [
    u'Population white',
    u'population black',
    u'Population asian',
    u'population hispanic',
    u'population other race',
]
typeof_household_residents = [
    u'family households',
    u'nonfamily households',
]
education_residents = [
    u'population education high school',
    u'population education bachelors',
    u'population education masters',
    u'population education phd',
]
ownorrent_residents = [
    'owner  occupied units',
    u'renter occupied units',
]
transportationtype_residents = [
    u'transportation car',
    u'transportation public',
    'tranportation motorcycle',
    'Transportation Other means',
    u'number of cars',
]
income_residents = [
    u'household income form 10 to 40',
    u'household income form 40 to 75',
    u'household income 75 and above',
]
housing_values_residents = [
    u'house value for 20 to 100',
    u'house value for 100 to 500',
    u'house value 500 or more',
]
rent_residents = [
    u'rent bewteen 300 and 1000',
    u'rent bewteen 1000 and 2000',
    u'rent 2000 or more',
]
variable_groups_residents = [
    'age_residents',
    'race_residents',
    'typeof_household_residents',
    'education_residents',
    'ownorrent_residents',
    'transportationtype_residents',
    'income_residents',
    'housing_values_residents',
    'rent_residents',
]

In [24]:
# Worker features (suffix '_n') split by type for the second step: one
# regression is run per group. variable_groups_workers lists the *names* of
# the group variables; the regression cells look the lists up by name.
age_workers = [
    u'population between 18 and 34_n',
    u'population between 35 to 64_n',
    u'population 65 and over_n',
]
race_workers = [
    u'Population white_n',
    u'population black_n',
    u'Population asian_n',
    u'population hispanic_n',
    u'population other _n',
]
typeof_household_workers = [
    u'family households_n',
    u'nonfamily households_n',
]
education_workers = [
    u'population education high school_n',
    u'population education bachelors_n',
    u'population education masters_n',
    u'population education phd_n',
]
ownorrent_workers = [
    'owner  occupied units_n',
    u'renter occupied units_n',
]
transportationtype_workers = [
    u'transportation car_n',
    u'transportation public_n',
    'tranportation motorcycle_n',
    'Transportation Other means_n',
]
income_workers = [
    u'household income form 10 to 40_n',
    u'household income form 40 to 75_n',
    u'household income 75 and above_n',
]
housing_values_workers = [
    u'house value for 20 to 100_n',
    u'house value for 100 to 500_n',
    u'house value 500 or more_n',
]
rent_workers = [
    u'rent bewteen 300 and 1000_n',
    u'rent bewteen 1000 and 2000_n',
    u'rent 2000 or more_n',
]
variable_groups_workers = [
    'age_workers',
    'race_workers',
    'typeof_household_workers',
    'education_workers',
    'ownorrent_workers',
    'transportationtype_workers',
    'income_workers',
    'housing_values_workers',
    'rent_workers',
]

Let us consider different groups of the population $g=1,2,…,n$ (based on our demographic indicators) and let:


$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$ 

while $Pc(a,g)$ the number of commuters.
 
Let the unknown (subject to fit) complaining behavior be defined by $rc(g,t)$, the average number of complaints of type $t$ per resident of group $g$ within his/her place of residency.

Let also $wc(g,t)$ be the average number of complaints of type $t$ per commuter of group $g$.

Then the total observed number of complaints of type $t$ in the area $a$ is:

$$C(a,t)=\sum_{g} Pr(a,g) \ rc(g,t) + \sum_{g} Pc(a,g) \ wc(g,t) \qquad \text{(1)}$$

We know $Pr(a,g)$ and $Pc(a,g)$ (those are our regressors), and we know the output variable $C(a,t)$ from the 311 statistics. We need to fit $rc(g,t)$ and $wc(g,t)$, the slope coefficients of the multivariate linear regression.

This will give us the complaining behavior of people in different groups, distinguished by complaining mode: while at home and while on the way (commuting).



###We will proceed as follows:

STEP 1) Lasso regression:

Regressors:  

$Pr(a,g)$ - the total number of residents in the location $a$ of group $g$.
            
$Pc(a,g)$  number of commuters in the location a of group $g$

Coefficients to be fit: $rc(g,t)$ - the average number of complaints of type $t$ per resident of group $g$ within his/her place of residency

STEP 2) Predict the number of complaints per capita $wc(g,t)$ from the results of step 1, using equation $(1)$

Using the predicted value of $rc(g,t)$ in each area, we are able to get a prediction of $wc(g,t)$ (from the formula for the observed total calls by type, $C(a,t)$)

In [34]:
#FIRST, I WILL RUN OLS OVER EVERY TYPE OF COMPLAIN
# Result layout:
#   ols_coefficients_dict[complaint type][feature group] ->
#       {'coefficients': ..., 'interval': ..., 'Rsquared': ...}
# Every complaint type gets a key; it stays an empty dict when the type fails
# the row-count check below.
ols_coefficients_dict={}
for typeof in types_of_calls:
    ols_coefficients_dict[typeof]={}
    # Only model complaint types for which more than 50 neighborhoods have the
    # complaint rate and every resident feature available.
    all_columns=list(resident_features)+[typeof,'Neighborhood']   #selection of columns
    complete_rows=callsbytype_attributes_residents[all_columns].dropna()
    if len(complete_rows)>50:
        for data_group in variable_groups_residents:
            # variable_groups_residents holds the names of the feature lists
            # defined earlier in the notebook, so eval() only ever sees those
            # fixed identifiers (no external input).
            features=eval(data_group)
            group_columns=list(features)+[typeof,'Neighborhood']   #selection of columns
            # Fix: this line used to read from `callsbytype_attributes`, a
            # name that is not created by the cells above (NameError on a
            # fresh kernel). The resident frame is the one built for this
            # regression.
            myframe1=callsbytype_attributes_residents[group_columns].dropna()
            X=myframe1[features]
            Y=myframe1[typeof]
            # No constant term is added to X, so statsmodels reports the
            # uncentered R-squared for these fits.
            ordinary_LS=sm.OLS(Y, X).fit()
            ols_coefficients_dict[typeof][data_group]={
                'coefficients': ordinary_LS.params,
                'interval': ordinary_LS.conf_int(),
                'Rsquared': ordinary_LS.rsquared,
            }

In [35]:
#EXAMPLE OF THE OUTCOME
# One complaint type and one feature group: the stored dict has the keys
# 'coefficients', 'interval' and 'Rsquared'.
ols_coefficients_dict['Noise - Commercial']['age_residents']

{'Rsquared': 0.31423003142503847,
 'coefficients': Population under 18            -4.937346e-07
 population between 18 and 34    9.214939e-07
 population between 35 to 64    -1.589607e-07
 population 65 and over         -6.199498e-08
 dtype: float64,
 'interval':                                          0             1
 Population under 18          -8.121159e-07 -1.753533e-07
 population between 18 and 34  5.615133e-07  1.281475e-06
 population between 35 to 64  -6.473967e-07  3.294754e-07
 population 65 and over       -8.520514e-07  7.280614e-07}

In [36]:
# Every complaint type gets a key in the loop above (possibly an empty dict),
# so this is the number of complaint types, not the number of fitted models.
len(ols_coefficients_dict)

178

In [38]:
#NOW, RUN OLS OVER EVERY TYPE OF COMPLAIN WITH AN R2 HIGHER THAN 0.2 TO SUBSET THE TYPES OF
#COMPLAINTS WE'RE USING
# Result layout:
#   ols_coefficients_dict[complaint type][feature group] ->
#       {'coefficients', 'interval', 'Rsquared'} when R-squared >= 0.2,
#       {} when the fit is below the threshold.
ols_coefficients_dict={}
for typeof in types_of_calls:
    ols_coefficients_dict[typeof]={}
    # Only model complaint types for which more than 50 neighborhoods have the
    # complaint rate and every resident feature available.
    all_columns=list(resident_features)+[typeof,'Neighborhood']   #selection of columns
    complete_rows=callsbytype_attributes_residents[all_columns].dropna()
    if len(complete_rows)>50:
        for data_group in variable_groups_residents:
            # The empty entry is kept for groups below the threshold; later
            # cells skip groups whose dict is empty.
            ols_coefficients_dict[typeof][data_group]={}
            # variable_groups_residents holds the names of the feature lists
            # defined earlier in the notebook, so eval() only ever sees those
            # fixed identifiers (no external input).
            features=eval(data_group)
            group_columns=list(features)+[typeof,'Neighborhood']   #selection of columns
            # Fix: this line used to read from `callsbytype_attributes`, a
            # name that is not created by the cells above (NameError on a
            # fresh kernel). The resident frame is the one built for this
            # regression.
            myframe1=callsbytype_attributes_residents[group_columns].dropna()
            X=myframe1[features]
            Y=myframe1[typeof]
            # No constant term is added to X, so statsmodels reports the
            # uncentered R-squared for these fits.
            ordinary_LS=sm.OLS(Y, X).fit()
            rsq=ordinary_LS.rsquared
            if rsq>=0.2:
                ols_coefficients_dict[typeof][data_group]={
                    'coefficients': ordinary_LS.params,
                    'interval': ordinary_LS.conf_int(),
                    'Rsquared': rsq,
                }
# Keep the complaint types that have at least one group entry, i.e. the types
# that passed the row-count check (their group dicts may still be empty).
ols_coefficients_filtered={}
for typeof, group_results in ols_coefficients_dict.items():
    if len(group_results)>0:
        ols_coefficients_filtered[typeof]=group_results

In [39]:
# Number of complaint types that passed the >50-row check. Types whose groups
# all fell below the R-squared threshold are still counted (with empty group dicts).
len(ols_coefficients_filtered)

133

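Now, from that dictionary, we want to keep just the coefficients that fall inside their confidence intervals. (Note: an OLS point estimate always lies inside its own confidence interval, so this check keeps every coefficient; to select statistically significant coefficients one would instead keep those whose interval excludes zero.)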

In [40]:
# Full dump of the filtered results (large output). Feature groups whose fit
# was below the R-squared threshold show up as empty dicts.
ols_coefficients_filtered

{'APPLIANCE': {'age_residents': {'Rsquared': 0.56932135113683935,
   'coefficients': Population under 18             1.151141e-07
   population between 18 and 34    6.144492e-08
   population between 35 to 64    -2.206819e-08
   population 65 and over         -8.533714e-08
   dtype: float64,
   'interval':                                          0             1
   Population under 18           7.157245e-08  1.586558e-07
   population between 18 and 34  1.231965e-08  1.105702e-07
   population between 35 to 64  -8.869380e-08  4.455742e-08
   population 65 and over       -1.932822e-07  2.260794e-08},
  'education_residents': {'Rsquared': 0.47522673186489783,
   'coefficients': population education high school    1.499077e-07
   population education bachelors     -2.658516e-08
   population education masters       -7.520183e-08
   population education phd            5.118561e-07
   dtype: float64,
   'interval':                                              0             1
   population e

In [41]:
# Worked example for one complaint type and one feature group.
example_result=ols_coefficients_filtered['Noise - Commercial']['age_residents']
#coefficients table
# Series.to_frame names the single column explicitly; the previous
# `columns={'coefficient'}` passed a set, which has no defined order and is
# rejected as column labels by newer pandas releases.
coefficient_table=example_result['coefficients'].to_frame('coefficient')
#interval table
# Work on a copy so the regression output stored in ols_coefficients_filtered
# is not relabelled in place.
conf_interval_table=example_result['interval'].copy()
conf_interval_table.columns=['min','max']
#filter for those variables with coefficients within the interval
# NOTE(review): a point estimate always lies inside its own confidence
# interval, so this mask is never True and nothing is dropped. If the intent
# is to keep only significant coefficients, the test should be whether the
# interval contains zero -- confirm before changing, it would alter the tables.
outside_interval=((coefficient_table['coefficient']<conf_interval_table['min'])
                  |(coefficient_table['coefficient']>conf_interval_table['max']))
coefficient_table=coefficient_table[~outside_interval]
coefficient_table

Unnamed: 0,coefficient
Population under 18,-4.937346e-07
population between 18 and 34,9.214939e-07
population between 35 to 64,-1.589607e-07
population 65 and over,-6.199498e-08


In [42]:
#NOW FOR ALL THE TYPES OF COMPLAINS WE OBTAIN THE TABLE:
# OLS_results_final[complaint type][feature group] ->
#     {'Rsquared': float, 'coefficients': one-column DataFrame}
# Groups with an empty result dict (below the R-squared threshold) are skipped.
OLS_results_final={}
for typeof, group_results in ols_coefficients_filtered.items():
    OLS_results_final[typeof]={}
    for subgroup, group_result in group_results.items():
        if len(group_result)==0:
            continue
        # Series.to_frame names the single column explicitly; the previous
        # `columns={'coefficient'}` passed a set, which has no defined order
        # and is rejected as column labels by newer pandas releases.
        coefficient_table=group_result['coefficients'].to_frame('coefficient')
        # Work on a copy so the stored regression output is not relabelled in place.
        conf_interval_table=group_result['interval'].copy()
        conf_interval_table.columns=['min','max']
        # NOTE(review): a point estimate always lies inside its own confidence
        # interval, so this mask is never True and nothing is dropped. If the
        # intent is to keep only significant coefficients, test whether the
        # interval contains zero instead -- confirm before changing.
        outside_interval=((coefficient_table['coefficient']<conf_interval_table['min'])
                          |(coefficient_table['coefficient']>conf_interval_table['max']))
        OLS_results_final[typeof][subgroup]={
            'Rsquared': group_result['Rsquared'],
            'coefficients': coefficient_table[~outside_interval],
        }

In [43]:
#NOW FOR ALL THE TYPES OF COMPLAINS THE SAME TABLE WITHOUT THE R2 VALUES:
# OLS_results_final[complaint type][feature group] ->
#     {'coefficients': one-column DataFrame}
# Groups with an empty result dict (below the R-squared threshold) are skipped.
# Only the coefficient table is kept per group -- presumably so that the dump
# into a frame further down sees a single entry per group (verify).
OLS_results_final={}
for typeof, group_results in ols_coefficients_filtered.items():
    OLS_results_final[typeof]={}
    for subgroup, group_result in group_results.items():
        if len(group_result)==0:
            continue
        # Series.to_frame names the single column explicitly; the previous
        # `columns={'coefficient'}` passed a set, which has no defined order
        # and is rejected as column labels by newer pandas releases.
        coefficient_table=group_result['coefficients'].to_frame('coefficient')
        # Work on a copy so the stored regression output is not relabelled in place.
        conf_interval_table=group_result['interval'].copy()
        conf_interval_table.columns=['min','max']
        # NOTE(review): a point estimate always lies inside its own confidence
        # interval, so this mask is never True and nothing is dropped. If the
        # intent is to keep only significant coefficients, test whether the
        # interval contains zero instead -- confirm before changing.
        outside_interval=((coefficient_table['coefficient']<conf_interval_table['min'])
                          |(coefficient_table['coefficient']>conf_interval_table['max']))
        OLS_results_final[typeof][subgroup]={
            'coefficients': coefficient_table[~outside_interval],
        }

In [44]:
#DUMPING THE DICTIONARY INTO A FRAME
# Each value of OLS_results_final is a dict {feature group: {'coefficients': table}}.
# pd.DataFrame is applied to each of them and the results are concatenated
# with the complaint types as outer keys; the stack/unstack chain then pivots
# the result. Later cells read `a` as one row per complaint type and one
# column per feature group (they use a[data_group][i] and a.index), with each
# cell presumably holding that group's coefficient table, or a missing value
# when the group produced none -- verify by inspecting `a`.
# NOTE(review): itervalues() and keys() are relied on to iterate the dict in
# the same order; itervalues() is Python 2 only.
a = pd.concat(map(pd.DataFrame, OLS_results_final.itervalues()), keys=OLS_results_final.keys()).stack().unstack(1).unstack(1)
# Drop the outer column level so the columns are just the feature-group names.
a.columns = a.columns.droplevel(0)

In [45]:
#CREATING SEPARATE FRAMES FOR EACH GROUP OF DEMOGRAPHIC FEATURES
# dicframe[feature group] -> frame with one row per row of `a` (in the same
# order) and one column per feature of the group, holding the coefficients.
dicframe = {}
for data_group in variable_groups_residents:
    # variable_groups_residents holds the names of the feature lists defined
    # earlier in the notebook; eval() resolves the name to the list.
    features = eval(data_group)
    # All-NaN placeholder for complaint types that have no table for this group.
    c = pd.DataFrame(np.nan, index = features, columns = [u'coefficient'])
    group_tables = []
    # len(a) replaces the previously hard-coded row count (84), which only
    # matched one particular run of the data.
    for i in range(len(a)):
        # .iloc makes the positional lookup explicit; a[data_group][i] relied
        # on integer fallback for a label-indexed Series.
        d = a[data_group].iloc[i]
        # Anything that is not a coefficient table (None or NaN, depending on
        # how the pivot filled the gap) gets the placeholder.
        group_tables.append(d if isinstance(d, pd.DataFrame) else c)
    # Concatenate once instead of growing the frame inside the loop; reindex
    # pins the feature order before transposing.
    dicframe[data_group] = pd.concat(group_tables, axis=1, ignore_index=True).reindex(features).T

In [46]:
#CREATING A COEFFICIENT FRAME
# Put the per-group frames side by side, in the order of
# variable_groups_residents. A single concat replaces the previous pattern of
# growing the frame one group at a time inside a loop.
coefficients = pd.concat([dicframe[data_group] for data_group in variable_groups_residents], axis=1)

In [47]:
# Label the rows with the complaint types (the index of `a`).
coefficients.index = a.index

In [48]:
#STORING IT IN A CSV FOR VISUALIZATION
# Resident coefficients: rows are complaint types, columns are the features.
coefficients.to_csv('OLS_residents_coefficients.csv')

In [52]:
#FIRST, I WILL RUN OLS OVER EVERY TYPE OF COMPLAIN (FOR WORKERS)
# Result layout:
#   ols_coefficients_dict[complaint type][feature group] ->
#       {'coefficients': ..., 'interval': ..., 'Rsquared': ...}
# Every complaint type gets a key; it stays an empty dict when the type fails
# the row-count check below.
ols_coefficients_dict={}
for typeof in types_of_calls:
    ols_coefficients_dict[typeof]={}
    # Only model complaint types for which more than 50 neighborhoods have the
    # complaint rate and every worker feature available.
    all_columns=list(workers_features)+[typeof,'Neighborhood']   #selection of columns
    complete_rows=callsbytype_attributes_workers[all_columns].dropna()
    if len(complete_rows)>50:
        for data_group in variable_groups_workers:
            # variable_groups_workers holds the names of the feature lists
            # defined earlier in the notebook, so eval() only ever sees those
            # fixed identifiers (no external input).
            features=eval(data_group)
            group_columns=list(features)+[typeof,'Neighborhood']   #selection of columns
            # Fix: this line used to read from `callsbytype_attributes`, a
            # name that is not created by the cells above (NameError on a
            # fresh kernel). The worker frame holds the calls normalized by
            # working population together with the worker attributes.
            myframe1=callsbytype_attributes_workers[group_columns].dropna()
            X=myframe1[features]
            Y=myframe1[typeof]
            # No constant term is added to X, so statsmodels reports the
            # uncentered R-squared for these fits.
            ordinary_LS=sm.OLS(Y, X).fit()
            ols_coefficients_dict[typeof][data_group]={
                'coefficients': ordinary_LS.params,
                'interval': ordinary_LS.conf_int(),
                'Rsquared': ordinary_LS.rsquared,
            }

In [53]:
# Every complaint type gets a key in the loop above (possibly an empty dict),
# so this is the number of complaint types, not the number of fitted models.
len(ols_coefficients_dict)

178

In [55]:
#NOW, RUN OLS OVER EVERY TYPE OF COMPLAIN WITH AN R2 HIGHER THAN 0.2 TO SUBSET THE TYPES OF
#COMPLAINTS WE'RE USING
# Result layout:
#   ols_coefficients_dict[complaint type][feature group] ->
#       {'coefficients', 'interval', 'Rsquared'} when R-squared >= 0.2,
#       {} when the fit is below the threshold.
ols_coefficients_dict={}
for typeof in types_of_calls:
    ols_coefficients_dict[typeof]={}
    # Only model complaint types for which more than 50 neighborhoods have the
    # complaint rate and every worker feature available.
    all_columns=list(workers_features)+[typeof,'Neighborhood']   #selection of columns
    complete_rows=callsbytype_attributes_workers[all_columns].dropna()
    if len(complete_rows)>50:
        for data_group in variable_groups_workers:
            # The empty entry is kept for groups below the threshold; later
            # cells skip groups whose dict is empty.
            ols_coefficients_dict[typeof][data_group]={}
            # variable_groups_workers holds the names of the feature lists
            # defined earlier in the notebook, so eval() only ever sees those
            # fixed identifiers (no external input).
            features=eval(data_group)
            group_columns=list(features)+[typeof,'Neighborhood']   #selection of columns
            # Fix: this line used to read from `callsbytype_attributes`, a
            # name that is not created by the cells above (NameError on a
            # fresh kernel). The worker frame holds the calls normalized by
            # working population together with the worker attributes.
            myframe1=callsbytype_attributes_workers[group_columns].dropna()
            X=myframe1[features]
            Y=myframe1[typeof]
            # No constant term is added to X, so statsmodels reports the
            # uncentered R-squared for these fits.
            ordinary_LS=sm.OLS(Y, X).fit()
            rsq=ordinary_LS.rsquared
            if rsq>=0.2:
                ols_coefficients_dict[typeof][data_group]={
                    'coefficients': ordinary_LS.params,
                    'interval': ordinary_LS.conf_int(),
                    'Rsquared': rsq,
                }
# Keep the complaint types that have at least one group entry, i.e. the types
# that passed the row-count check (their group dicts may still be empty).
ols_coefficients_filtered={}
for typeof, group_results in ols_coefficients_dict.items():
    if len(group_results)>0:
        ols_coefficients_filtered[typeof]=group_results

In [56]:
# Number of complaint types that passed the >50-row check. Types whose groups
# all fell below the R-squared threshold are still counted (with empty group dicts).
len(ols_coefficients_filtered)

133

In [57]:
#NOW FOR ALL THE TYPES OF COMPLAINS THE SAME TABLE WITHOUT THE R2 VALUES:
# OLS_results_final[complaint type][feature group] ->
#     {'coefficients': one-column DataFrame}
# Groups with an empty result dict (below the R-squared threshold) are skipped.
OLS_results_final={}
for typeof, group_results in ols_coefficients_filtered.items():
    OLS_results_final[typeof]={}
    for subgroup, group_result in group_results.items():
        if len(group_result)==0:
            continue
        # Series.to_frame names the single column explicitly; the previous
        # `columns={'coefficient'}` passed a set, which has no defined order
        # and is rejected as column labels by newer pandas releases.
        coefficient_table=group_result['coefficients'].to_frame('coefficient')
        # Work on a copy so the stored regression output is not relabelled in place.
        conf_interval_table=group_result['interval'].copy()
        conf_interval_table.columns=['min','max']
        # NOTE(review): a point estimate always lies inside its own confidence
        # interval, so this mask is never True and nothing is dropped. If the
        # intent is to keep only significant coefficients, test whether the
        # interval contains zero instead -- confirm before changing.
        outside_interval=((coefficient_table['coefficient']<conf_interval_table['min'])
                          |(coefficient_table['coefficient']>conf_interval_table['max']))
        OLS_results_final[typeof][subgroup]={
            'coefficients': coefficient_table[~outside_interval],
        }

In [60]:
# Full dump of the filtered worker results (large output). Feature groups
# whose fit was below the R-squared threshold show up as empty dicts.
ols_coefficients_filtered

{'APPLIANCE': {'age_workers': {},
  'education_workers': {},
  'housing_values_workers': {'Rsquared': 0.2310224255007447,
   'coefficients': house value for 20 to 100_n     1.518013e-06
   house value for 100 to 500_n   -8.266890e-08
   house value 500 or more_n      -1.080475e-07
   dtype: float64,
   'interval':                                        min           max
   house value for 20 to 100_n   9.266440e-07  2.109382e-06
   house value for 100 to 500_n -1.640612e-07 -1.276626e-09
   house value 500 or more_n    -1.459624e-07 -7.013271e-08},
  'income_workers': {'Rsquared': 0.21029869264168022,
   'coefficients': household income form 10 to 40_n    5.030557e-07
   household income form 40 to 75_n   -6.041455e-07
   household income 75 and above_n     5.395145e-09
   dtype: float64,
   'interval':                                            min           max
   household income form 10 to 40_n  2.529882e-07  7.531233e-07
   household income form 40 to 75_n -9.810085e-07 -2.272824e

In [58]:
#DUMPING THE DICTIONARY INTO A FRAME
# Each value of OLS_results_final is a dict {feature group: {'coefficients': table}}.
# pd.DataFrame is applied to each of them and the results are concatenated
# with the complaint types as outer keys; the stack/unstack chain then pivots
# the result. Later cells read `a` as one row per complaint type and one
# column per feature group (they use a[data_group][i] and a.index), with each
# cell presumably holding that group's coefficient table, or a missing value
# when the group produced none -- verify by inspecting `a`.
# NOTE(review): itervalues() and keys() are relied on to iterate the dict in
# the same order; itervalues() is Python 2 only.
a = pd.concat(map(pd.DataFrame, OLS_results_final.itervalues()), keys=OLS_results_final.keys()).stack().unstack(1).unstack(1)
# Drop the outer column level so the columns are just the feature-group names.
a.columns = a.columns.droplevel(0)

In [62]:
#CREATING SEPARATE FRAMES FOR EACH GROUP OF DEMOGRAPHIC FEATURES
# dicframe[feature group] -> frame with one row per row of `a` (in the same
# order) and one column per feature of the group, holding the coefficients.
dicframe = {}
for data_group in variable_groups_workers:
    # variable_groups_workers holds the names of the feature lists defined
    # earlier in the notebook; eval() resolves the name to the list.
    features = eval(data_group)
    # All-NaN placeholder for complaint types that have no table for this group.
    c = pd.DataFrame(np.nan, index = features, columns = [u'coefficient'])
    group_tables = []
    # len(a) replaces the previously hard-coded row count (71), which only
    # matched one particular run of the data.
    for i in range(len(a)):
        # .iloc makes the positional lookup explicit; a[data_group][i] relied
        # on integer fallback for a label-indexed Series.
        d = a[data_group].iloc[i]
        # Anything that is not a coefficient table (None or NaN, depending on
        # how the pivot filled the gap) gets the placeholder.
        group_tables.append(d if isinstance(d, pd.DataFrame) else c)
    # Concatenate once instead of growing the frame inside the loop; reindex
    # pins the feature order before transposing.
    dicframe[data_group] = pd.concat(group_tables, axis=1, ignore_index=True).reindex(features).T

In [64]:
#CREATING A COEFFICIENT FRAME
# Put the per-group frames side by side, in the order of
# variable_groups_workers. A single concat replaces the previous pattern of
# growing the frame one group at a time inside a loop.
coefficients1 = pd.concat([dicframe[data_group] for data_group in variable_groups_workers], axis=1)

In [65]:
# Label the rows with the complaint types (the index of `a`).
coefficients1.index = a.index

In [66]:
#STORING IT IN A CSV FOR VISUALIZATION
# Worker coefficients: rows are complaint types, columns are the features.
coefficients1.to_csv('OLS_workers_coefficients.csv')