# Data Analysis by Municipio

In [1]:
#Import dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as sts
from scipy.stats import linregress


### Hide warning messages in notebook

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Read csv file from Coneval - INEGI

In [3]:
municipio_pov_dataload = "Resources/municipalpoverty.csv"
municipio_df = pd.read_csv(municipio_pov_dataload)
povertyindicators_dataload = "Resources/Concentrado, indicadores de pobreza.csv"
povertyindicators_df = pd.read_csv(povertyindicators_dataload)

In [4]:
municipaldata = pd.DataFrame(municipio_df)
new = municipaldata[['clave_entidad', 'entidad_federativa', 'clave_municipio', 'municipio', 'poblacion', 'pobreza', 'pobreza_pob']].copy()
new.head()

Unnamed: 0,clave_entidad,entidad_federativa,clave_municipio,municipio,poblacion,pobreza,pobreza_pob
0,1,Aguascalientes,1001,Aguascalientes,861446,26.1,224949
1,1,Aguascalientes,1002,Asientos,46624,54.0,25169
2,1,Aguascalientes,1003,Calvillo,52773,56.8,29951
3,1,Aguascalientes,1004,Cosío,15321,43.2,6620
4,1,Aguascalientes,1005,Jesús María,125750,25.0,31479


### Filter data just from Mexico State and change head titles to English

In [5]:
mexico_state_df = new[new['entidad_federativa'].str.contains("México")]
poverty_by_municipality_df = mexico_state_df.rename(columns={"clave_entidad": "State ID",
                                    "entidad_federativa": "State", 
                                    "clave_municipio": "Municipality ID",
                                    "municipio": "Municipality",
                                    "poblacion": "Population",
                                    "pobreza": "% Poverty", 
                                    "pobreza_pob": "Total Population in Poverty"})

poverty_by_municipality_df

Unnamed: 0,State ID,State,Municipality ID,Municipality,Population,% Poverty,Total Population in Poverty
656,15,México,15001,Acambay,62863,73.5,46233
657,15,México,15002,Acolman,171774,56.2,96512
658,15,México,15003,Aculco,46275,61.9,28628
659,15,México,15004,Almoloya de Alquisiras,14115,73.4,10354
660,15,México,15005,Almoloya de Juárez,179746,59.9,107654
...,...,...,...,...,...,...,...
776,15,México,15121,Cuautitlán Izcalli,509985,27.7,141173
777,15,México,15122,Valle de Chalco Solidaridad,452170,59,266702
778,15,México,15123,Luvianos,24639,84.9,20914
779,15,México,15124,San José del Rincón,84040,76.3,64133


### Show DataFrame for the second cvs that was imported

In [6]:
povertyindicators_df

Unnamed: 0,Clave de entidad,Entidad federativa,Clave de municipio,Municipio,Población 2015*\n(leer nota al final del cuadro),Porcentaje de Pobreza\n2015,Personas Pobres\n2015,Carencias promedio de Pobreza\n2015,Porcentaje Pobreza Extrema\n2015,Personas Pobreza Extrema\n2015,...,Carencias promedio Población con al menos una carencia social\n2015,Porcentaje Población con tres o más carencias sociales\n2015,Personas Población con tres o más carencias sociales\n2015,Carencias promedio\n Población con tres o más carencias sociales 2015,Porcentaje Población con ingreso inferior a la línea de bienestar \n2015,Personas Población con ingreso inferior a la línea de bienestar\n2015,Carencias promedio Población con ingreso inferior a la línea de bienestar\n2015,Porcentaje Población con ingreso inferior a la línea de bienestar mínimo\n2015,Personas Población con ingreso inferior a la línea de bienestar mínimo\n2015,Carencias promedio Población con ingreso inferior a la línea de bienestar mínimo\n2015
0,15,México,15001,Acambay,62863,73.5,46233,2.4,19.8,12475,...,2.3,39.2,24627,3.4,75.0,47164,2.4,37.1,23336,2.6
1,15,México,15002,Acolman,171774,56.2,96512,2.0,8.6,14715,...,1.9,19.2,32987,3.3,67.9,116692,1.7,24.3,41814,2.0
2,15,México,15003,Aculco,46275,61.9,28628,2.2,10.7,4930,...,2.1,29.0,13408,3.4,63.6,29419,2.2,23.8,10993,2.4
3,15,México,15004,Almoloya de Alquisiras,14115,73.4,10354,2.3,18.7,2638,...,2.2,34.4,4858,3.4,75.2,10616,2.2,37.5,5290,2.5
4,15,México,15005,Almoloya de Juárez,179746,59.9,107654,2.3,12.6,22666,...,2.2,28.9,51862,3.4,66.2,118999,2.1,25.3,45397,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,15,México,15121,Cuautitlán Izcalli,509985,27.7,141173,1.9,2.8,14401,...,1.8,8.4,43048,3.3,43.0,219101,1.2,11.6,59110,1.6
121,15,México,15122,Valle de Chalco Solidaridad,452170,59.0,266702,2.1,9.5,42778,...,2.0,21.0,94843,3.3,71.2,321840,1.7,27.8,125851,2.0
122,15,México,15123,Luvianos,24639,84.9,20914,2.5,32.1,7911,...,2.4,44.7,11012,3.4,85.8,21145,2.5,54.4,13402,2.7
123,15,México,15124,San José del Rincón,84040,76.3,64133,2.6,22.8,19120,...,2.6,48.7,40958,3.4,76.4,64231,2.6,39.4,33153,2.7


## IMPORTANT STEP: We Filtered data by Municipality and decided to compare 3 variables for poverty measurement

In [7]:
povertyindicator = pd.DataFrame(povertyindicators_df)
new1 = povertyindicator[['Clave de municipio',
                         'Personas Rezago educativo\n2015', 
                         'Personas Carencia por acceso a los servicios de salud\n2015', 
                         'Personas Carencia por acceso a la seguridad social\n2015']].copy()

new2 = new1.rename(columns={"Clave de municipio": "Municipality ID",
                            "Personas Rezago educativo\n2015" : "People with education lag",
                            "Personas Carencia por acceso a los servicios de salud\n2015": "People without access to healthcare",
                            "Personas Carencia por acceso a la seguridad social\n2015": "People without social security"})
new2.head()


Unnamed: 0,Municipality ID,People with education lag,People without access to healthcare,People without social security
0,15001,15721,8177,55302
1,15002,19176,34621,107655
2,15003,11349,4088,39700
3,15004,3811,986,12235
4,15005,39433,22250,132743


In [8]:
new2.dtypes

Municipality ID                         int64
People with education lag              object
People without access to healthcare    object
People without social security         object
dtype: object

In [9]:
mergepoverty = pd.merge(poverty_by_municipality_df, new2, on="Municipality ID", how="right")

mergepoverty['Total Population in Poverty'] = mergepoverty['Total Population in Poverty'].str.replace(',', '')
mergepoverty['People with education lag'] = mergepoverty['People with education lag'].str.replace(',', '')
mergepoverty['People without access to healthcare'] = mergepoverty['People without access to healthcare'].str.replace(',', '')
mergepoverty['People without social security'] = mergepoverty['People without social security'].str.replace(',', '')

mergepoverty

Unnamed: 0,State ID,State,Municipality ID,Municipality,Population,% Poverty,Total Population in Poverty,People with education lag,People without access to healthcare,People without social security
0,15,México,15001,Acambay,62863,73.5,46233,15721,8177,55302
1,15,México,15002,Acolman,171774,56.2,96512,19176,34621,107655
2,15,México,15003,Aculco,46275,61.9,28628,11349,4088,39700
3,15,México,15004,Almoloya de Alquisiras,14115,73.4,10354,3811,986,12235
4,15,México,15005,Almoloya de Juárez,179746,59.9,107654,39433,22250,132743
...,...,...,...,...,...,...,...,...,...,...
120,15,México,15121,Cuautitlán Izcalli,509985,27.7,141173,34153,111653,189795
121,15,México,15122,Valle de Chalco Solidaridad,452170,59,266702,71570,146256,299228
122,15,México,15123,Luvianos,24639,84.9,20914,8604,1204,21968
123,15,México,15124,San José del Rincón,84040,76.3,64133,32693,6730,79770


## DEBUGING STEP: We run a dtypes be able to run operations with integers / floats. Every time we reordered columns into a new dataframe, we had verify and to change types.

In [10]:
mergepoverty['Total Population in Poverty'] = mergepoverty['Total Population in Poverty'].astype(float)
mergepoverty['People with education lag'] = mergepoverty['People with education lag'].astype(float)
mergepoverty['People without access to healthcare'] = mergepoverty['People without access to healthcare'].astype(float)
mergepoverty['People without social security'] = mergepoverty['People without social security'].astype(float)
mergepoverty['Municipality ID'] = mergepoverty['Municipality ID'].astype(str).str.slice(2).astype(int)
mergepoverty

Unnamed: 0,State ID,State,Municipality ID,Municipality,Population,% Poverty,Total Population in Poverty,People with education lag,People without access to healthcare,People without social security
0,15,México,1,Acambay,62863,73.5,46233.0,15721.0,8177.0,55302.0
1,15,México,2,Acolman,171774,56.2,96512.0,19176.0,34621.0,107655.0
2,15,México,3,Aculco,46275,61.9,28628.0,11349.0,4088.0,39700.0
3,15,México,4,Almoloya de Alquisiras,14115,73.4,10354.0,3811.0,986.0,12235.0
4,15,México,5,Almoloya de Juárez,179746,59.9,107654.0,39433.0,22250.0,132743.0
...,...,...,...,...,...,...,...,...,...,...
120,15,México,121,Cuautitlán Izcalli,509985,27.7,141173.0,34153.0,111653.0,189795.0
121,15,México,122,Valle de Chalco Solidaridad,452170,59,266702.0,71570.0,146256.0,299228.0
122,15,México,123,Luvianos,24639,84.9,20914.0,8604.0,1204.0,21968.0
123,15,México,124,San José del Rincón,84040,76.3,64133.0,32693.0,6730.0,79770.0


In [11]:
mergepoverty.dtypes

State ID                                 int64
State                                   object
Municipality ID                          int32
Municipality                            object
Population                              object
% Poverty                               object
Total Population in Poverty            float64
People with education lag              float64
People without access to healthcare    float64
People without social security         float64
dtype: object

In [12]:
mergepoverty_byid = mergepoverty.set_index('Municipality ID') 
mergepoverty_byid

Unnamed: 0_level_0,State ID,State,Municipality,Population,% Poverty,Total Population in Poverty,People with education lag,People without access to healthcare,People without social security
Municipality ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,15,México,Acambay,62863,73.5,46233.0,15721.0,8177.0,55302.0
2,15,México,Acolman,171774,56.2,96512.0,19176.0,34621.0,107655.0
3,15,México,Aculco,46275,61.9,28628.0,11349.0,4088.0,39700.0
4,15,México,Almoloya de Alquisiras,14115,73.4,10354.0,3811.0,986.0,12235.0
5,15,México,Almoloya de Juárez,179746,59.9,107654.0,39433.0,22250.0,132743.0
...,...,...,...,...,...,...,...,...,...
121,15,México,Cuautitlán Izcalli,509985,27.7,141173.0,34153.0,111653.0,189795.0
122,15,México,Valle de Chalco Solidaridad,452170,59,266702.0,71570.0,146256.0,299228.0
123,15,México,Luvianos,24639,84.9,20914.0,8604.0,1204.0,21968.0
124,15,México,San José del Rincón,84040,76.3,64133.0,32693.0,6730.0,79770.0


In [19]:
denue_df  = pd.read_csv('Resources/DENUE_sample.csv',encoding="ISO-8859-1")
denue_df.head()


Unnamed: 0.1,Unnamed: 0,ID,Nombre de la Unidad Económica,Razón social,Código de la clase de actividad SCIAN,Nombre de clase de la actividad,Descripcion estrato personal ocupado,Tipo de vialidad,Nombre de la vialidad,Tipo de entre vialidad 1,...,Localidad,Área geoestadí­stica básica,Manzana,NÃÂºmero de telÃÂ©fono,Correo electrÃÂ³nico,Sitio en Internet,Tipo de establecimiento,Latitud,Longitud,Fecha de incorporación al DENUE
0,174704,2511586,LONJA MERCANTIL HERNANDEZ,,461110,"Comercio al por menor en tiendas de abarrotes,...",0 a 5 personas,CALLE,NICOLÃÂS BRAVO,CALLE,...,SAN BARTOLOMÃÂ ATLATLAHUCA,216,4,7171710000.0,,,Fijo,19.069133,-99.611156,jul-10
1,540472,2363713,ESTÃÂTICA UNISEX DIANA,,812110,Salones y clÃÂ­nicas de belleza y peluquerÃÂ­as,0 a 5 personas,CALLE,FRANCISCO I. MADERO,AVENIDA,...,IXTAPALUCA,904,12,,,,Fijo,19.29068,-98.902444,dic-14
2,23013,2032449,TORTILLERÃÂA 599,,311830,ElaboraciÃÂ³n de tortillas de maÃÂ­z y molie...,0 a 5 personas,CALLE,AVENIDA SAN ÃÂNGEL,CALLE,...,CIUDAD NEZAHUALCÃÂYOTL,1289,5,,,,Fijo,19.38904,-99.015859,dic-14
3,86301,2002299,ABARROTES SIN NOMBRE,,461110,"Comercio al por menor en tiendas de abarrotes,...",0 a 5 personas,AVENIDA,PROGRESO MUNICIPAL,PRIVADA,...,TULTEPEC,335,8,,,,Fijo,19.680749,-99.136141,jul-10
4,399216,2249276,INTERNET MIP,,561432,Servicios de acceso a computadoras,0 a 5 personas,CALLE,ZUMPANGO-REYES KILÃÂMETRO 9,CALLE,...,SAN SEBASTIÃÂN,359,16,,,,Fijo,19.787891,-99.055222,dic-14


In [20]:
denue_df = denue_df [['Clave municipio', 
                      'Código Postal',
                      'Código Tipo de Industria',
                      'Descripcion estrato personal ocupado']]
denue_en = denue_df.rename(columns={"Clave municipio": "Municipality ID",
                            "Código Postal" : "Zip Code",
                            "Código Tipo de Industria": "Industry Code",
                            "Descripcion estrato personal ocupado": "Company Size"})
denue_en.head()

KeyError: "['Código Tipo de Industria'] not in index"

## IMPORTANT STEP: Merge cleaned dataframes to compare and run analysis

In [15]:
poverty_final = pd.merge(mergepoverty_byid, denue_en, on="Municipality ID", how="right")
poverty_final = poverty_final.set_index('Municipality ID') 
poverty_final

NameError: name 'denue_en' is not defined

## IMPORTANT STEP: Create plots and linear regressions testing the 3 indicators of poverty vs % Poverty to identify if they are correlated

In [None]:
population = mergepoverty.iloc[:,6]
education_lag = mergepoverty.iloc[:,7]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(population, education_lag)
mr_regress_values = population * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
education_plot = plt.scatter(mergepoverty['Total Population in Poverty'],mergepoverty['People with education lag'])
plt.plot (population, mr_regress_values, "r-")
plt.annotate(line_eq,(400000,50000),fontsize=15,color="red")
plt.xlabel("Total Population in Poverty")
plt.ylabel("People with education lag")
plt.title ("Total Poverty vs Total Education Lag")
plt.xticks(np.arange(0, 800000, step = 250000))
plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(education_plot)
plt.savefig('education_plot.png')

In [None]:
healthcare = mergepoverty.iloc[:,8]
(hl_slope, hl_intercept, hl_rvalue, hl_pvalue, hl_stderr) = linregress(population, healthcare)
hl_regress_values = population * hl_slope + hl_intercept
line_eq_hl = "y = " + str(round(hl_slope,2)) + "x + " + str(round(hl_intercept,2))
healthcare_plot = plt.scatter(mergepoverty['Total Population in Poverty'],mergepoverty['People without access to healthcare'])
plt.plot (population, hl_regress_values, "r-")
plt.annotate(line_eq,(400000,50000),fontsize=15,color="red")
plt.xlabel("Total Population in Poverty")
plt.ylabel("People without access to healthcare")
plt.title ("Total Poverty vs People without access to healthcare")
plt.xticks(np.arange(0, 800000, step = 250000))
plt.yticks(np.arange(0, 500000, step = 50000))
plt.show(healthcare_plot)

plt.savefig('healthcare_plot.png')

In [None]:
social_security = mergepoverty.iloc[:,9]
(ss_slope, ss_intercept, ss_rvalue, ss_pvalue, ss_stderr) = linregress(population, social_security)
ss_regress_values = population * ss_slope + ss_intercept
line_eq_ss = "y = " + str(round(ss_slope,2)) + "x + " + str(round(ss_intercept,2))
socialsecurity_plot = plt.scatter(mergepoverty['Total Population in Poverty'],mergepoverty['People without social security'])
plt.plot (population, ss_regress_values, "r-")
plt.annotate(line_eq,(400000,50000),fontsize=15,color="red")
plt.xlabel("Total Population in Poverty")
plt.ylabel("People without social security")
plt.title ("Total Poverty vs People without social security")
plt.xticks(np.arange(0, 800000, step = 250000))
plt.yticks(np.arange(0, 1250000, step = 250000))
plt.show(socialsecurity_plot)

plt.savefig('socialsecurity_plot.png')

In [None]:
poverty_final_v2 = poverty_final



### Tag the companies according to industry and size

In [None]:
#poverty_final_v2["industry_type"]= poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['11','21']),'industry_type']='Primary Sector'
#poverty_final_v2["industry_type"]= poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['22', '23', '31', '32', '33', '56', '72']),'industry_type']='Sedondary Sector'
#poverty_final_v2["industry_type"]= poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['43', '46', '48', '49', '51', '52', '53', '54', '55', '61', '62', '71', '93']),'industry_type']='Terciary Sector'

poverty_final_v2["industry_type"]=''
poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['11','21']),'industry_type']='Primary Sector'
poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['22', '23', '31', '32', '33', '56', '72']),'industry_type']='Secondary Sector'
poverty_final_v2.loc[poverty_final_v2["Industry Code"].isin(['43', '46', '48', '49', '51', '52', '53', '54', '55', '61', '62', '71',' 81', '93']),'industry_type']='Tertiary Sector'


poverty_final_v2["company_size_2"]=''
poverty_final_v2.loc[poverty_final_v2["Company Size"].isin(['0 a 5 personas']),'company_size_2']='Micro'
poverty_final_v2.loc[poverty_final_v2["Company Size"].isin(['6 a 10 personas', '11 a 30 personas']),'company_size_2']='Small'
poverty_final_v2.loc[poverty_final_v2["Company Size"].isin(['31 a 50 personas', '51 a 100 personas', '101 a 250 personas']),'company_size_2']='Medium'
poverty_final_v2.loc[poverty_final_v2["Company Size"].isin(['251 y mÃ¡s personas','251 y más personas']),'company_size_2']='Big'



poverty_final_v2

### Run validations that objects were calculated ok

In [None]:
ddd = poverty_final_v2.groupby("industry_type")
ddd1 = ddd["Industry Code"].value_counts()
display(ddd1.head(50))

ddd2 = poverty_final_v2.groupby("company_size_2")
ddd3 = ddd2["Company Size"].value_counts()
display(ddd3.head(50))


### convert objects into dataframe

In [None]:
grouped_by_municipality = poverty_final_v2.groupby("Municipality")
pct_of_industry = grouped_by_municipality["industry_type"].value_counts()

Municipality_vs_ind_df = pd.DataFrame({"Industry Counts" : pct_of_industry })

Municipality_vs_ind_df_2 = pd.DataFrame(Municipality_vs_ind_df.reset_index())

Municipality_vs_ind_df_2.head()


In [None]:
Municipality_vs_ind_df_3 = Municipality_vs_ind_df_2.set_index(["industry_type","Municipality"]).unstack(level=0)
Municipality_vs_ind_df_3.head()

### Set percentages to make comparable between municipalities

In [None]:
Municipality_vs_ind_df_3 = Municipality_vs_ind_df_3.fillna(0)
Municipality_vs_ind_df_3["Total Companies"] = Municipality_vs_ind_df_3["Industry Counts"]["Primary Sector"] + Municipality_vs_ind_df_3["Industry Counts"]["Secondary Sector"] +  Municipality_vs_ind_df_3["Industry Counts"]["Tertiary Sector"]
Municipality_vs_ind_df_3["pct_Primary"] = Municipality_vs_ind_df_3["Industry Counts"]["Primary Sector"] / Municipality_vs_ind_df_3["Total Companies"]
Municipality_vs_ind_df_3["pct_Secondary"] = Municipality_vs_ind_df_3["Industry Counts"]["Secondary Sector"] / Municipality_vs_ind_df_3["Total Companies"]
Municipality_vs_ind_df_3["pct_Tertiary"] = Municipality_vs_ind_df_3["Industry Counts"]["Tertiary Sector"] / Municipality_vs_ind_df_3["Total Companies"]
Municipality_vs_ind_df_3

### Repeat same procedure on company size

In [None]:
grouped_by_municipality = poverty_final_v2.groupby("Municipality")
pct_of_size = grouped_by_municipality["company_size_2"].value_counts()

Municipality_vs_compsize_df = pd.DataFrame({"Size Counts" : pct_of_size })

Municipality_vs_compsize_df_4 = pd.DataFrame(Municipality_vs_compsize_df.reset_index())
Municipality_vs_compsize_df_4.head()

In [None]:
Municipality_vs_compsize_df_5 = Municipality_vs_compsize_df_4.set_index(["company_size_2","Municipality"]).unstack(level=0)
Municipality_vs_compsize_df_5.head()

In [None]:
Municipality_vs_compsize_df_5 = Municipality_vs_compsize_df_5.fillna(0)
Municipality_vs_compsize_df_5["Total Companies"] = Municipality_vs_compsize_df_5["Size Counts"]["Big"] + Municipality_vs_compsize_df_5["Size Counts"]["Medium"] + Municipality_vs_compsize_df_5["Size Counts"]["Micro"] + Municipality_vs_compsize_df_5["Size Counts"]["Small"]
Municipality_vs_compsize_df_5["pct_Micro"] = Municipality_vs_compsize_df_5["Size Counts"]["Micro"] / Municipality_vs_compsize_df_5["Total Companies"]
Municipality_vs_compsize_df_5["pct_Small"] = Municipality_vs_compsize_df_5["Size Counts"]["Small"] / Municipality_vs_compsize_df_5["Total Companies"]
Municipality_vs_compsize_df_5["pct_Medium"] = Municipality_vs_compsize_df_5["Size Counts"]["Medium"] / Municipality_vs_compsize_df_5["Total Companies"]
Municipality_vs_compsize_df_5["pct_Big"] = Municipality_vs_compsize_df_5["Size Counts"]["Big"] / Municipality_vs_compsize_df_5["Total Companies"]

Municipality_vs_compsize_df_5

### Poverty % was string it needed to be converted to a float

In [None]:
poverty_final_v2['% Poverty'] = poverty_final_v2['% Poverty'].astype(float)
poverty_final_v2

povertybymunicip = grouped_by_municipality["% Poverty"].mean()
povertybymunicip
Poverty_df = pd.DataFrame({"% Poverty" : povertybymunicip})
Poverty_df

### Merge the information into one dataframe

In [None]:
poverty_final_v3 = pd.merge(Municipality_vs_ind_df_3, Municipality_vs_compsize_df_5, on="Municipality", how="left")
poverty_final_v3 = pd.merge(poverty_final_v3, Poverty_df, on="Municipality", how="left")

poverty_final_v3

### Run Scatter plots and square-r for each variable

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,4]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(poverty, primary)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
primary_plot = plt.scatter(x,y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.01,30),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.01,15),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Companies in Primary Sector")
plt.title ("% of Companies in Primary Sector vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('primary_act_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,5]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
secondary_plot = plt.scatter(x,y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.4,40),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.4,30),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Companies in Secondary Sector")
plt.title ("% of Companies in Secondary Sector vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('secondary_act_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,6]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
tertiary_plot = plt.scatter(x,y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.4,40),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.4,30),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Companies in Tertiary Sector")
plt.title ("% of Companies in Tertiary Sector vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('tertiary_act_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,12]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
Micro_plot = plt.scatter(x,y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.8,40),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.8,30),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Micro Companies")
plt.title ("% of Micro Companies vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('Micro_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,13]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
Small_plot = plt.scatter(x,y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.1,40),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.1,30),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Small Companies")
plt.title ("% of Small Companies vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('Small_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,14]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
Medium_plot = plt.scatter(x, y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.01,40),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.01,30),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Medium Companies")
plt.title ("% of Medium Companies vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('Medium_plot.png')

In [None]:
y = poverty_final_v3.iloc[:,16]
x = poverty_final_v3.iloc[:,15]
(mr_slope, mr_intercept, mr_rvalue, mr_pvalue, mr_stderr) = linregress(x, y)
mr_regress_values = x * mr_slope + mr_intercept
line_eq = "y = " + str(round(mr_slope,2)) + "x + " + str(round(mr_intercept,2))
Big_plot = plt.scatter(x, y, marker="o", facecolors="blue", edgecolors="black", alpha=.50)
plt.plot (x, mr_regress_values, "r-")
plt.annotate(line_eq,(.01,60),fontsize=10,color="red")
r = "r = " + str(round(mr_rvalue,4))
plt.annotate(r,(.01,50),fontsize=10,color="red")

plt.ylabel("Total Population % in Poverty")
plt.xlabel("% of Big Companies")
plt.title ("% of Big Companies vs Poverty")
#plt.xticks(np.arange(0, 800000, step = 250000))
#plt.yticks(np.arange(0, 200000, step = 25000))


plt.show(primary_plot)
plt.savefig('Big_plot.png')

### Anova Tests by industry type

In [None]:
poverty_final_v2.boxplot("% Poverty", by="industry_type", figsize=(20, 10))

In [None]:
poverty_final_v2 = poverty_final_v2.fillna(0)
poverty_final_v2['% Poverty'] = poverty_final_v2['% Poverty'].astype(float)


group1 = poverty_final_v2[poverty_final_v2["industry_type"] == "Primary Sector"]["% Poverty"]
group2 = poverty_final_v2[poverty_final_v2["industry_type"] == "Secondary Sector"]["% Poverty"]
group3 = poverty_final_v2[poverty_final_v2["industry_type"] == "Tertiary Sector"]["% Poverty"]

In [None]:
stats.f_oneway(group1, group2, group3)

### Anova Tests by company size

In [None]:
poverty_final_v2.boxplot("% Poverty", by="company_size_2", figsize=(20, 10))

In [None]:
group1 = poverty_final_v2[poverty_final_v2["company_size_2"] == "Micro"]["% Poverty"]
group2 = poverty_final_v2[poverty_final_v2["company_size_2"] == "Small"]["% Poverty"]
group3 = poverty_final_v2[poverty_final_v2["company_size_2"] == "Medium"]["% Poverty"]
group4 = poverty_final_v2[poverty_final_v2["company_size_2"] == "Big"]["% Poverty"]

In [None]:
stats.f_oneway(group1, group2, group3, group4)