# Ingestão de dados

Aqui, pegamos os dados das diversas fontes (bases) e criamos uma base unificada com as informações disponíveis **sem processar, preencher nulos ou fazer algum tipo de data prep**

Path das bases de dados: `https://raw.githubusercontent.com/LeiteJu/TCC/main/dados/bases/`
- aqui estão os arquivos originais obtidos do IBGE, IPEA, CNIC, ...

Path da base única: `https://raw.githubusercontent.com/LeiteJu/TCC/main/dados/csv/base_unica.csv`

Na base única, manteremos apenas a sigla de cada estado, para evitar problemas com diferenças na grafia do nome dos estados (acento, til, ...)

In [None]:
# importamos as libs
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd 

In [None]:
PATH="https://raw.githubusercontent.com/LeiteJu/TCC/main/dados/"

SIGLAS = ["AC","AL","AM","AP","BA","CE",
          "DF","ES","GO","MA","MG","MS",
          "MT","PA","PB","PE","PI","PR",
          "RJ","RN","RO","RR","RS","SC",
          "SE","SP","TO"]

ESTADOS = ['Acre', 'Alagoas', 'Amazonas', 'Amapá', 'Bahia', 'Ceará',
  'Distrito Federal', 'Espírito Santo', 'Goiás', 'Maranhão',
  'Minas Gerais', 'Mato Grosso do Sul', 'Mato Grosso', 'Pará', 'Paraíba',
  'Pernambuco', 'Piauí', 'Paraná', 'Rio de Janeiro',
  'Rio Grande do Norte', 'Rondônia', 'Roraima', 'Rio Grande do Sul',
  'Santa Catarina', 'Sergipe', 'São Paulo', 'Tocantins']

MAP={'Acre':'AC', 'Alagoas':'AL', 'Amazonas':"AM", 'Amapá':'AP', 
        'Bahia':"BA", 'Ceará':"CE", 'Distrito Federal' : "DF", 
        'Espírito Santo':'ES', 'Goiás':'GO', 'Maranhão':'MA',
        'Minas Gerais' : 'MG', 'Mato Grosso do Sul': 'MS', 
        'Mato Grosso' : 'MT', 'Pará':'PA', 'Paraíba':'PB', 'Pernambuco': 'PE', 
        'Piauí':'PI', 'Paraná' : 'PR', 'Rio de Janeiro' : 'RJ',
        'Rio Grande do Norte' : 'RN', 'Rondônia' : 'RO', 'Roraima' : 'RR', 
        'Rio Grande do Sul' : 'RS', 'Santa Catarina' : 'SC', 
        'Sergipe' : 'SE', 'São Paulo' : 'SP', 'Tocantins' : 'TO'}

In [None]:
df = pd.DataFrame(SIGLAS*35*12, columns=["estados"]).sort_values("estados").reset_index(drop=True)

In [None]:
init=pd.to_datetime('011985', format='%m%Y')
end=pd.to_datetime('122019', format='%m%Y')
df['data'] = pd.Series(pd.date_range(start=init, end=end, freq='MS')).to_list() * 27

In [None]:
df.shape

(11340, 2)

In [None]:
df.head()

Unnamed: 0,estados,data
0,AC,1985-01-01
1,AC,1985-02-01
2,AC,1985-03-01
3,AC,1985-04-01
4,AC,1985-05-01


In [None]:
def search_duplicates(df):
    print(f"Todas as datas repetem 27 vezes? {(df['data'].value_counts() == 27).all()}")
    print(f"Estados unicos: {df['estados'].nunique()}")
    print(f"Shape: {df.shape}")
    print(f"Duplicados? {df.duplicated().any()}")

In [None]:
(df["data"].value_counts() == 27).all()

True

In [None]:
df["estados"].nunique()

27

# PIB dos estados a preços constantes

- Fonte: IBGE
- Link: ipeadata
- Granularidade: anual por estado

In [141]:
pib_pc = pd.read_csv(f"{PATH}bases/pib_precos_constantes.csv")

In [142]:
pib_pc.head()

Unnamed: 0,Sigla,Código,Estado,1985,1986,1987,1988,1989,1990,1991,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 38
0,AC,12,Acre,3263960.0,3313102.0,3207432.0,3325104.0,3452815.0,3579888.0,3757043.0,...,7538215.0,7825168.0,8340313.0,9109604.0,8533398.0,7929802.0,7856954.0,8073084.0,7931592.0,
1,AL,27,Alagoas,13075160.0,13471570.0,14677040.0,13580560.0,14332030.0,15487580.0,15877610.0,...,26338930.0,26726190.0,26816940.0,27488980.0,28982020.0,28391640.0,29262730.0,28975910.0,30175890.0,
2,AM,13,Amazonas,12682260.0,13572270.0,14141110.0,15471050.0,16157790.0,16160390.0,16274730.0,...,54568760.0,50950260.0,54744340.0,53037640.0,49853510.0,48155140.0,48049790.0,49996690.0,51623550.0,
3,AP,16,Amapá,2396901.0,2556622.0,2586933.0,2623083.0,2747932.0,2785524.0,2935518.0,...,8047322.0,8766986.0,9352710.0,9126904.0,8840409.0,8544281.0,8858947.0,9283840.0,9288628.0,
4,BA,29,Bahia,54172850.0,58031610.0,59998040.0,70789960.0,65788950.0,70009720.0,72158160.0,...,134535600.0,136240600.0,141818600.0,144735000.0,148121600.0,144848500.0,144544500.0,148478400.0,145933100.0,


In [143]:
pib_pc = pib_pc.drop(["Código", "Estado"], axis=1)
pib_pc = pib_pc.melt(id_vars=["Sigla"], value_vars=[f"{i}" for i in range (1985,2020)], value_name="pib_pc", var_name="data")

In [144]:
pib_pc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sigla   945 non-null    object 
 1   data    945 non-null    object 
 2   pib_pc  941 non-null    float64
dtypes: float64(1), object(2)
memory usage: 22.3+ KB


In [145]:
pib_pc["data"]=pd.to_datetime(pib_pc["data"], format="%Y")

In [146]:
pib_pc.columns=["estados", "data", "pib_pc"]

In [147]:
pib_pc.head()

Unnamed: 0,estados,data,pib_pc
0,AC,1985-01-01,3263960.0
1,AL,1985-01-01,13075160.0
2,AM,1985-01-01,12682260.0
3,AP,1985-01-01,2396901.0
4,BA,1985-01-01,54172850.0


In [148]:
pib_pc["pib_pc"] = pib_pc["pib_pc"] / 12

In [149]:
pib_pc.head()

Unnamed: 0,estados,data,pib_pc
0,AC,1985-01-01,271996.7
1,AL,1985-01-01,1089597.0
2,AM,1985-01-01,1056855.0
3,AP,1985-01-01,199741.8
4,BA,1985-01-01,4514404.0


In [150]:
df = pd.merge(df, pib_pc, how="left").fillna(method="ffill")

In [151]:
(df["data"].value_counts() == 27).all()

True

In [152]:
df["estados"].nunique()

27

In [153]:
df.shape

(11340, 3)

In [154]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 3)
Duplicados? False


# PIB do estado de mercado corrente

- Fonte: IBGE
- Link: ipeadata
- Granularidade: anual por estado

In [155]:
pib_pmc = pd.read_csv(f"{PATH}bases/pib_precos_mercado_corrente.csv")

In [156]:
pib_pmc.head()

Unnamed: 0,Sigla,Código,Estado,1985,1986,1987,1988,1989,1990,1991,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 38
0,AC,12,Acre,0.00062,0.001771,0.005193,0.038876,0.547705,15.996393,80.808201,...,8165288.0,9149380.0,10483520.0,12348990.0,12443130.0,12500000.0,12835660.0,13622060.0,13939290.0,
1,AL,27,Alagoas,0.004075,0.010037,0.033596,0.200712,2.45833,81.602266,428.900043,...,28529960.0,31248930.0,33708090.0,37264090.0,42260660.0,44754660.0,47805620.0,48892280.0,53032290.0,
2,AM,13,Amazonas,0.007241,0.022455,0.072782,0.538409,7.521879,210.045517,1015.38191,...,59108110.0,59572310.0,68812000.0,71897860.0,72694790.0,75908490.0,78497470.0,84361540.0,90725250.0,
3,AP,16,Amapá,0.000574,0.001458,0.004728,0.040189,0.764695,18.591401,96.910902,...,8716746.0,10250580.0,11756080.0,12372440.0,12890800.0,13468620.0,14472590.0,15665020.0,16324200.0,
4,BA,29,Bahia,0.025442,0.065506,0.189984,1.404576,19.22115,518.982825,2652.432271,...,145727100.0,159295900.0,178261700.0,196202800.0,215986100.0,228329300.0,236137900.0,250533800.0,256468500.0,


In [157]:
pib_pmc.drop(["Código", "Estado"], inplace=True, axis=1)
pib_pmc = pib_pmc.melt(id_vars=["Sigla"], value_vars=[f"{i}" for i in range (1985,2020)], value_name="pib_pmc", var_name="anos")

In [158]:
pib_pmc.head()

Unnamed: 0,Sigla,anos,pib_pmc
0,AC,1985,0.00062
1,AL,1985,0.004075
2,AM,1985,0.007241
3,AP,1985,0.000574
4,BA,1985,0.025442


In [159]:
pib_pmc["data"] = pd.to_datetime(pib_pc["data"], format="%Y")
pib_pmc["pib_pmc"] = pib_pmc["pib_pmc"] / 12

In [160]:
pib_pmc = pib_pmc.rename({"Sigla":"estados"}, axis=1)

In [161]:
pib_pmc = pib_pmc.drop("anos", axis=1)

In [162]:
df = pd.merge(df, pib_pmc, how="left").fillna(method="ffill")

In [163]:
df.head()

Unnamed: 0,estados,data,pib_pc,pib_pmc
0,AC,1985-01-01,271996.683911,5.2e-05
1,AC,1985-02-01,271996.683911,5.2e-05
2,AC,1985-03-01,271996.683911,5.2e-05
3,AC,1985-04-01,271996.683911,5.2e-05
4,AC,1985-05-01,271996.683911,5.2e-05


In [164]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 4)
Duplicados? False


# PIB per capita

- Fonte: IBGE
- Link: ipeadata
- Granularidade: anual por estado

In [165]:
pib_pcpt =  pd.read_csv(f"{PATH}bases/pib_per_capita.csv")
pib_pcpt.head()

Unnamed: 0,Sigla,Código,Estado,1985,1986,1987,1988,1989,1990,1991,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 38
0,AC,12,Acre,9.215645,9.076744,8.537466,8.61098,8.402114,8.504651,8.723272,...,10.099619,10.312746,10.741417,11.52967,10.620112,9.70972,9.470557,9.287253,8.993397,
1,AL,27,Alagoas,5.879142,5.925594,6.320284,5.730295,5.771642,6.12453,6.170056,...,8.379165,8.443036,8.124044,8.275502,8.674832,8.452503,8.668324,8.720275,9.041852,
2,AM,13,Amazonas,7.306192,7.546578,7.601859,8.054953,7.828067,7.616238,7.471428,...,15.421931,14.188381,14.376439,13.691574,12.658521,12.03377,11.824399,12.252257,12.455625,
3,AP,16,Amapá,10.554527,10.754808,10.423195,10.149562,9.71835,9.521564,9.716494,...,11.759778,12.549329,12.724844,12.154425,11.530783,10.92207,11.105306,11.192173,10.982957,
4,BA,29,Bahia,5.134586,5.385667,5.456052,6.312963,5.609763,5.866089,5.945315,...,9.5432,9.611099,9.426834,9.568387,9.742704,9.481745,9.419988,10.023776,9.811905,


In [166]:
pib_pcpt = pib_pcpt.drop(["Código", "Estado", 'Unnamed: 38'], axis=1)
pib_pcpt = pib_pcpt.melt(id_vars=["Sigla"], value_vars=[f"{i}" for i in range (1985,2020)], value_name="pib_pcpt", var_name="anos")

In [167]:
pib_pcpt['data'] = pd.to_datetime(pib_pcpt['anos'], format="%Y")
pib_pcpt = pib_pcpt.drop(['anos'], axis=1)

In [168]:
pib_pcpt["pib_pcpt"] = pib_pcpt["pib_pcpt"] / 12

In [169]:
pib_pcpt = pib_pcpt.rename({"Sigla":"estados"}, axis=1)

In [170]:
pib_pcpt.head()

Unnamed: 0,estados,pib_pcpt,data
0,AC,0.76797,1985-01-01
1,AL,0.489929,1985-01-01
2,AM,0.608849,1985-01-01
3,AP,0.879544,1985-01-01
4,BA,0.427882,1985-01-01


In [171]:
df = pd.merge(df, pib_pcpt, how="left").fillna(method="ffill")

In [172]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 5)
Duplicados? False


# PIB da construção civil

- Fonte: IBGE
- Link: ipeadata
- Granularidade: anual por estado

In [173]:
pib_cc =  pd.read_csv(f"{PATH}bases/pib_construcao_civil.csv")
pib_cc.head()

Unnamed: 0,Sigla,Código,Estado,1985,1986,1987,1988,1989,1990,1991,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 38
0,AC,12,Acre,207486.4,307323.2,235058.0,274841.3,278643.3,203948.6,213061.8,...,581433.7,559164.0,517569.4,680114.1,463529.1,363269.0,303200.5,308058.1,309687.4,
1,AL,27,Alagoas,1094083.0,1648612.0,1675906.0,1661463.0,1320639.0,1140776.0,873791.2,...,1958261.0,1846452.0,1859065.0,1823977.0,1458481.0,1459544.0,1293523.0,1188205.0,1097614.0,
2,AM,13,Amazonas,1502130.0,2014757.0,2168534.0,2466623.0,2529082.0,1885088.0,2017014.0,...,3148854.0,2926015.0,3405730.0,2994511.0,2430038.0,2216941.0,1823895.0,1673511.0,1760398.0,
3,AP,16,Amapá,84791.47,98438.2,95423.97,109643.0,111254.4,79659.13,84414.34,...,478230.5,649747.7,573830.1,657590.4,702421.4,476897.5,371627.9,371693.8,316135.1,
4,BA,29,Bahia,6513353.0,9173761.0,5200354.0,4989453.0,6707817.0,6753978.0,6077555.0,...,11230770.0,11094620.0,11975950.0,12223090.0,10836190.0,9365402.0,8056606.0,7452787.0,7105524.0,


In [174]:
pib_cc.drop(["Código", "Estado"], inplace=True, axis=1)
pib_cc = pib_cc.melt(id_vars=["Sigla"], value_vars=[f"{i}" for i in range (1985,2020)], value_name="pib_cc", var_name="anos")
#df["pib_cc"] = pib_cc["pib_cc"]

In [175]:
pib_cc['data'] = pd.to_datetime(pib_cc['anos'], format="%Y")
pib_cc = pib_cc.drop(['anos'], axis=1)

In [176]:
pib_cc = pib_cc.rename({'Sigla':"estados"}, axis=1)

In [177]:
pib_cc.head()

Unnamed: 0,estados,pib_cc,data
0,AC,207486.4,1985-01-01
1,AL,1094083.0,1985-01-01
2,AM,1502130.0,1985-01-01
3,AP,84791.47,1985-01-01
4,BA,6513353.0,1985-01-01


In [178]:
df = pd.merge(df, pib_cc, how="left").fillna(method="ffill")

In [179]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 6)
Duplicados? False


# População

- Fonte: IBGE
- Link: https://basedosdados.org/dataset/br-ibge-populacao
- Granularidade: anual por estado

In [180]:
pop = pd.read_csv(f"{PATH}bases/populacao.csv")

In [181]:
pop.head()

Unnamed: 0,ano,sigla_uf,populacao
0,1991,AC,417102
1,1991,AL,2512658
2,1991,AM,2102766
3,1991,AP,289035
4,1991,BA,11867336


In [182]:
pop["data"] = pd.to_datetime(pop["ano"], format="%Y")
pop=pop.drop("ano", axis=1)
pop = pop.rename({"sigla_uf":"estados"}, axis=1)

In [183]:
pop.head()

Unnamed: 0,estados,populacao,data
0,AC,417102,1991-01-01
1,AL,2512658,1991-01-01
2,AM,2102766,1991-01-01
3,AP,289035,1991-01-01
4,BA,11867336,1991-01-01


In [184]:
df = pd.merge(df, pop, how="left").fillna(method="ffill")

In [185]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 7)
Duplicados? False


# INCC

- Fonte: FGV
- Granularidade: mensal para o Brasil

In [186]:
incc = pd.read_csv(f"{PATH}bases/INCC.csv")

In [187]:
incc.head()

Unnamed: 0,Ano,janeiro,fevereiro,março,abril,maio,junho,julho,agosto,setembro,outubro,novembro,dezembro
0,1980,4.7,9.53,12.8,3.13,4.86,5.73,6.33,10.28,8.02,3.1,5.06,4.98
1,1981,4.48,14.89,8.06,3.39,3.5,2.35,1.96,12.39,5.54,1.72,3.28,3.11
2,1982,3.8,10.87,12.55,4.28,5.25,3.67,5.5,16.94,4.02,3.24,4.06,2.37
3,1983,3.86,12.65,8.29,4.07,7.16,5.05,6.65,16.87,8.91,5.06,12.05,4.94
4,1984,5.94,21.67,9.36,4.36,7.99,8.93,5.26,27.61,5.6,8.64,8.64,8.16


In [188]:
incc = pd.melt(incc, id_vars=["Ano"], value_vars=incc.columns[1:], var_name="mes", value_name="incc")

In [189]:
meses = ['janeiro', 'fevereiro', 'março', 'abril',
         'maio', 'junho', 'julho', 'agosto', 'setembro', 
         'outubro', 'novembro', 'dezembro']

def categorize (x):
    for i in range(12):
        if x == meses[i]:
            return f"{i+1:02d}"

categorize("fevereiro")
incc["ind_mes"] = incc["mes"].apply(categorize)
#incc[incc["mes"] == "janeiro"]["data"] = pd.to_datetime(f"{str(incc['Ano'])}01", format="%Y%m")

In [190]:
incc["Ano"]=incc["Ano"].astype('str')
incc["anomes"] = incc["Ano"]+incc["ind_mes"]
incc["data"] = pd.to_datetime(incc["anomes"], format="%Y%m")

In [191]:
incc =incc[["data", "incc"]]

In [192]:
df = pd.merge(df, incc, how="left", on=["data"])#.fillna(method="ffill")

In [193]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 8)
Duplicados? False


# IPCA

In [194]:
ipca = pd.read_csv(f"{PATH}bases/ipca.csv")

In [195]:
ipca.head()

Unnamed: 0,Ano,janeiro,fevereiro,março,abril,maio,junho,julho,agosto,setembro,outubro,novembro,dezembro
0,1980,6.62,11.54,18.27,24.53,31.63,38.61,46.3,53.53,60.03,75.2,86.89,99.25
1,1981,6.84,13.68,19.33,27.04,34.1,41.5,50.35,58.62,66.95,75.43,84.68,95.62
2,1982,6.97,14.06,20.58,27.68,36.18,45.86,55.13,64.39,72.75,80.42,89.96,104.79
3,1983,8.64,17.18,25.78,34.06,42.75,56.86,72.66,88.4,107.8,126.23,142.93,164.01
4,1984,9.67,20.09,30.82,43.31,56.28,72.03,88.76,106.41,130.65,154.73,181.54,215.26


In [196]:
ipca = pd.melt(ipca, id_vars=["Ano"], value_vars=ipca.columns[1:], var_name="mes", value_name="ipca")

In [197]:
ipca["ind_mes"] = ipca["mes"].apply(categorize)

In [198]:
ipca["anomes"] = ipca["Ano"].astype('str')+ipca["ind_mes"]
ipca["data"] = pd.to_datetime(ipca["anomes"], format="%Y%m")

In [199]:
ipca=ipca[["data", "ipca"]]

In [200]:
df = pd.merge(df, ipca, how="left", on=["data"])#.fillna(method="ffill")

In [201]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 9)
Duplicados? False


# IGP

In [202]:
igp = pd.read_csv(f"{PATH}bases/igp.csv").dropna(axis="columns", how="any")
igp.columns = ["data", "igp"]
igp["data"]=pd.to_datetime(igp["data"], format="%Y%m")

In [203]:
df = pd.merge(df, igp, how="left", on=["data"])

In [204]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 10)
Duplicados? False


In [205]:
igp

Unnamed: 0,data,igp
0,1944-02-01,1.45
1,1944-03-01,1.43
2,1944-04-01,1.41
3,1944-05-01,1.39
4,1944-06-01,1.37
...,...,...
939,2022-05-01,0.69
940,2022-06-01,0.62
941,2022-07-01,-0.38
942,2022-08-01,-0.55


# SELIC

In [206]:
selic = pd.read_csv(f"{PATH}bases/SELIC.csv")

In [207]:
selic.head()

Unnamed: 0,Ano,janeiro,fevereiro,março,abril,maio,junho,julho,agosto,setembro,outubro,novembro,dezembro
0,1986,,,,,,0.066482,0.083826,0.120788,0.131875,0.084264,0.117146,0.242219
1,1987,0.498423,0.899355,0.566052,0.752127,1.107117,0.792399,0.37165,0.370975,0.366568,0.430937,0.609279,0.612633
2,1988,0.778818,0.890577,0.700062,0.975415,0.858712,0.838859,1.056574,0.891064,1.11606,1.381863,1.197748,1.208201
3,1989,1.039475,0.968531,0.888536,0.575459,0.516613,1.103163,1.372702,1.329308,1.644868,1.874483,1.993559,2.511088
4,1990,2.375185,3.384407,1.440015,0.218263,0.252015,0.41945,0.589091,0.475493,0.74821,0.729572,0.90865,1.034617


In [208]:
selic = pd.melt(selic, id_vars=["Ano"], value_vars=selic.columns[1:], var_name="mes", value_name="selic")

In [209]:
selic["ind_mes"] = selic["mes"].apply(categorize)

In [210]:
selic["anomes"] = selic["Ano"].astype('str') + selic["ind_mes"]
selic["data"] = pd.to_datetime(selic["anomes"], format="%Y%m")

In [211]:
selic = selic[["data", "selic"]]

In [212]:
selic.head()

Unnamed: 0,data,selic
0,1986-01-01,
1,1987-01-01,0.498423
2,1988-01-01,0.778818
3,1989-01-01,1.039475
4,1990-01-01,2.375185


In [213]:
df = pd.merge(df, selic, how="left", on=["data"])

In [214]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 11)
Duplicados? False


# IDH

In [215]:
def map_sigla (x):
    if x == "Acre":
        return "AC"
    else:
        return "outros"

In [216]:
SIGLAS = ["AC","AL","AP","AM","BA","CE",
          "DF","ES","GO","MA","MS","MT",
          "MG","PB","PN","PA","PE","PI",
          "RJ","RN","RS","RO","RR","SC",
          "SP","SE","TO"]

## IDH Longevidade

In [217]:
idh_l = pd.read_csv(f"{PATH}bases/idh_longevidade.csv")
idh_l["estados"] = SIGLAS

In [218]:
idh_l = idh_l.melt(id_vars=["Territorialidades","estados"], value_vars=idh_l.columns[1:-1], value_name="idh_l", var_name="anos")

In [219]:
idh_l['data'] = pd.to_datetime(idh_l['anos'], format='%Y')

In [220]:
idh_l = idh_l[["data", "estados", "idh_l"]]

In [221]:
idh_l.head()

Unnamed: 0,data,estados,idh_l
0,1991-01-01,AC,0.645
1,1991-01-01,AL,0.552
2,1991-01-01,AP,0.668
3,1991-01-01,AM,0.645
4,1991-01-01,BA,0.582


In [222]:
df=pd.merge(df, idh_l, how="left", on=["data", "estados"])
df["idh_l"] = df["idh_l"].fillna(method="ffill")

In [223]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 12)
Duplicados? False


## IDH Renda

In [224]:
idh_r = pd.read_csv(f"{PATH}bases/idh_renda.csv")
idh_r["estados"] = ["BR"] + SIGLAS
idh_r.drop("Territorialidades", axis=1, inplace=True)

In [225]:
idh_r=idh_r.melt(id_vars=["estados"], value_vars=idh_r.columns[:-1], var_name="Anos", value_name="idh_r") 

In [226]:
idh_r['data'] = pd.to_datetime(idh_r['Anos'], format="%Y")
idh_r.drop('Anos', axis=1, inplace=True)

In [227]:
df = pd.merge(df, idh_r, how="left", on=["data", "estados"])
df["idh_r"] = df["idh_r"].fillna(method="ffill")

In [228]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 13)
Duplicados? False


## IDH Educação

In [229]:
idh_e = pd.read_csv(f"{PATH}bases/idh_educacao.csv")
idh_e["estados"] = SIGLAS
idh_e.drop("Territorialidades", axis=1, inplace=True)

In [230]:
idh_e=idh_e.melt(id_vars=["estados"], value_vars=idh_e.columns[:-1], var_name="Anos", value_name="idh_e") 

In [231]:
idh_e['data'] = pd.to_datetime(idh_e['Anos'], format="%Y")
idh_e.drop('Anos', axis=1, inplace=True)

In [232]:
df = pd.merge(df, idh_e, how="left", on=["data", "estados"])
df['idh_e'] = df['idh_e'].fillna(method="ffill")

In [233]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 14)
Duplicados? False


# NFSP

- Fonte: BACEN
- Granularidade: mensal para o Brasil

In [234]:
nfsp = pd.read_csv(PATH+"bases/nfsp.csv", names=["data", "nfsp"], skiprows=1)
nfsp=nfsp.dropna()

In [235]:
nfsp.head()

Unnamed: 0,data,nfsp
12,1991.01,11.17
13,1991.02,11.17
14,1991.03,11.17
15,1991.04,11.17
16,1991.05,11.17


In [236]:
nfsp["data"] = nfsp["data"] * 100

In [237]:
nfsp['data'] = pd.to_datetime(nfsp['data'], format="%Y%m")

In [238]:
df = pd.merge(df, nfsp, how="left", on=["data"])

In [239]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 15)
Duplicados? False


# Estoque

- Fonte: IPEA
- Granularidade: mensal para o Brasil

In [240]:
estoque = pd.read_csv(PATH+"bases/estoque.csv")

In [241]:
estoque.columns=["data", 'estoque', 'trash']
estoque.drop('trash', axis=1, inplace=True)
estoque['data'] = pd.to_datetime(estoque['data'], format="%Y")

In [242]:
estoque

Unnamed: 0,data,estoque
0,1947-01-01,1.906553e+05
1,1948-01-01,2.025084e+05
2,1949-01-01,2.173332e+05
3,1950-01-01,2.328778e+05
4,1951-01-01,2.552773e+05
...,...,...
68,2015-01-01,7.751228e+06
69,2016-01-01,7.806242e+06
70,2017-01-01,7.820758e+06
71,2018-01-01,7.813724e+06


In [243]:
df = pd.merge(df, estoque, how="left", on=["data"])
df["estoque"] = df["estoque"].fillna(method="ffill")

In [244]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 16)
Duplicados? False


# Desemprego

In [245]:
# de 1992 a 2019
init=pd.to_datetime('011992', format='%m%Y')
end=pd.to_datetime('122019', format='%m%Y')
desemprego_geral=pd.DataFrame(SIGLAS * 336, columns=["estados"]).sort_values("estados").reset_index(drop=True)
desemprego_geral["data"] = pd.Series(pd.date_range(start=init, end=end, freq='MS')).to_list() * 27

In [246]:
init=pd.to_datetime('011992', format='%m%Y')
end=pd.to_datetime('022012', format='%m%Y')
desemprego_geral_anual=pd.DataFrame(SIGLAS * 242, columns=["estados"]).sort_values("estados").reset_index(drop=True)
desemprego_geral_anual["data"] = pd.Series(pd.date_range(start=init, end=end, freq='MS')).to_list() * 27

In [247]:
desemprego_anual =pd.read_csv(PATH+"bases/desemprego_b2015.csv", sep=";", skiprows=1, decimal=",")
desemprego_anual = desemprego_anual.drop(["Código", "Estado","Unnamed: 23"], axis=1)

In [248]:
desemprego_anual.head()

Unnamed: 0,Sigla,1992,1993,1995,1996,1997,1998,1999,2001,2002,...,2004,2005,2006,2007,2008,2009,2011,2012,2013,2014
0,AC,9.216314,10.05649,9.80071,8.437029,8.887605,8.53551,12.203437,8.558123,6.582788,...,10.247832,11.327006,8.581848,5.786489,7.743653,7.556724,5.357073,8.002315,9.56258,9.808001
1,AL,8.980849,11.086582,8.312264,8.011287,7.972121,12.287512,14.748984,12.065698,9.080515,...,10.209505,9.775723,10.637167,8.359978,8.005988,12.126772,10.407524,12.241125,13.191182,13.133901
2,AM,11.55573,16.032611,11.311703,10.816192,15.131507,18.481736,19.326194,10.684138,12.966133,...,12.448917,12.828997,9.430801,12.23877,10.012385,12.44182,9.27554,8.995633,10.19457,10.835797
3,AP,5.92294,8.304413,13.067481,7.528157,10.932264,10.538872,15.278558,19.057839,20.539464,...,14.564569,11.961929,6.666667,16.331604,15.193255,13.756124,14.214812,10.91379,13.103432,10.325027
4,BA,8.211435,7.681225,7.45973,8.524951,8.366323,8.734137,9.92229,10.603811,10.735432,...,11.530353,11.071503,10.248704,10.34383,10.533664,10.669944,10.509659,10.181658,9.865157,9.984575


In [249]:
desemprego_anual = desemprego_anual.melt(id_vars=["Sigla"], value_vars=desemprego_anual[1:], value_name="desemprego", var_name="data")

In [250]:
desemprego_anual

Unnamed: 0,Sigla,data,desemprego
0,AC,1992,9.216314
1,AL,1992,8.980849
2,AM,1992,11.555730
3,AP,1992,5.922940
4,BA,1992,8.211435
...,...,...,...
535,RS,2014,5.004448
536,SC,2014,3.133842
537,SE,2014,8.652161
538,SP,2014,7.896507


In [251]:
desemprego_anual['data'] = pd.to_datetime(desemprego_anual['data'], format="%Y")

In [252]:
desemprego_anual

Unnamed: 0,Sigla,data,desemprego
0,AC,1992-01-01,9.216314
1,AL,1992-01-01,8.980849
2,AM,1992-01-01,11.555730
3,AP,1992-01-01,5.922940
4,BA,1992-01-01,8.211435
...,...,...,...
535,RS,2014-01-01,5.004448
536,SC,2014-01-01,3.133842
537,SE,2014-01-01,8.652161
538,SP,2014-01-01,7.896507


In [253]:
desemprego_anual=desemprego_anual.rename({"Sigla":"estados"}, axis=1)

In [254]:
desemprego_geral_anual=pd.merge(desemprego_geral_anual, desemprego_anual, how="left", on=["data", "estados"]).fillna(method="ffill")

In [255]:
desemprego_geral_anual

Unnamed: 0,estados,data,desemprego
0,AC,1992-01-01,9.216314
1,AC,1992-02-01,9.216314
2,AC,1992-03-01,9.216314
3,AC,1992-04-01,9.216314
4,AC,1992-05-01,9.216314
...,...,...,...
6529,TO,2011-10-01,7.328652
6530,TO,2011-11-01,7.328652
6531,TO,2011-12-01,7.328652
6532,TO,2012-01-01,6.470417


In [256]:
desemprego_mensal = pd.read_csv(PATH+"bases/desemprego_a2015.csv", sep=";", skiprows=1, decimal=",")
desemprego_mensal=desemprego_mensal.dropna()

In [257]:
desemprego_mensal.head()

Unnamed: 0,Unnamed: 1,jan-fev-mar 2012,fev-mar-abr 2012,mar-abr-mai 2012,abr-mai-jun 2012,mai-jun-jul 2012,jun-jul-ago 2012,jul-ago-set 2012,ago-set-out 2012,set-out-nov 2012,...,ago-set-out 2021,set-out-nov 2021,out-nov-dez 2021,nov-dez-jan 2022,dez-jan-fev 2022,jan-fev-mar 2022,fev-mar-abr 2022,mar-abr-mai 2022,abr-mai-jun 2022,mai-jun-jul 2022
0,Brasil,8.0,7.8,7.7,7.6,7.5,7.4,7.1,7.0,6.8,...,12.1,11.6,11.1,11.2,11.2,11.1,10.5,9.8,9.3,9.1


In [258]:
desemprego_mensal = desemprego_mensal.melt(id_vars=[], value_vars=desemprego_mensal[:], value_name="desemprego", var_name="data")

In [259]:
desemprego_mensal.head()

Unnamed: 0,data,desemprego
0,,Brasil
1,jan-fev-mar 2012,8.0
2,fev-mar-abr 2012,7.8
3,mar-abr-mai 2012,7.7
4,abr-mai-jun 2012,7.6


In [260]:
desemprego_mensal=desemprego_mensal.iloc[1:]

In [261]:
desemprego_mensal.tail()

Unnamed: 0,data,desemprego
121,jan-fev-mar 2022,11.1
122,fev-mar-abr 2022,10.5
123,mar-abr-mai 2022,9.8
124,abr-mai-jun 2022,9.3
125,mai-jun-jul 2022,9.1


In [262]:
init = pd.to_datetime("032012", format='%m%Y')
end = pd.to_datetime("072022", format='%m%Y')
desemprego_mensal['data'] = pd.date_range(start=init, end=end, freq='MS')

In [263]:
desemprego_mensal.tail()

Unnamed: 0,data,desemprego
121,2022-03-01,11.1
122,2022-04-01,10.5
123,2022-05-01,9.8
124,2022-06-01,9.3
125,2022-07-01,9.1


In [264]:
desemprego_geral = pd.merge(desemprego_geral,desemprego_geral_anual, how="left", on=["estados", "data"])

In [265]:
desemprego_geral

Unnamed: 0,estados,data,desemprego
0,AC,1992-01-01,9.216314
1,AC,1992-02-01,9.216314
2,AC,1992-03-01,9.216314
3,AC,1992-04-01,9.216314
4,AC,1992-05-01,9.216314
...,...,...,...
9067,TO,2019-08-01,
9068,TO,2019-09-01,
9069,TO,2019-10-01,
9070,TO,2019-11-01,


In [266]:
desemprego_geral = pd.merge(desemprego_geral,desemprego_mensal, how="left", on=["data"]).fillna(0)

In [267]:
desemprego_geral.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9072 entries, 0 to 9071
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   estados       9072 non-null   object        
 1   data          9072 non-null   datetime64[ns]
 2   desemprego_x  9072 non-null   float64       
 3   desemprego_y  9072 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 354.4+ KB


In [268]:
desemprego_geral["desemprego_y"] = desemprego_geral["desemprego_y"].astype("float")

In [269]:
desemprego_geral["desemprego"] = desemprego_geral["desemprego_x"] + desemprego_geral["desemprego_y"]

In [270]:
desemprego_geral.head()

Unnamed: 0,estados,data,desemprego_x,desemprego_y,desemprego
0,AC,1992-01-01,9.216314,0.0,9.216314
1,AC,1992-02-01,9.216314,0.0,9.216314
2,AC,1992-03-01,9.216314,0.0,9.216314
3,AC,1992-04-01,9.216314,0.0,9.216314
4,AC,1992-05-01,9.216314,0.0,9.216314


In [271]:
desemprego_geral=desemprego_geral.drop(["desemprego_x","desemprego_y"],axis=1)

In [272]:
desemprego_geral.shape

(9072, 3)

In [273]:
desemprego_geral[desemprego_geral['estados'] == "TO"].sort_values("data").drop_duplicates()

Unnamed: 0,estados,data,desemprego
8736,TO,1992-01-01,6.100175
8737,TO,1992-02-01,6.100175
8738,TO,1992-03-01,6.100175
8739,TO,1992-04-01,6.100175
8740,TO,1992-05-01,6.100175
...,...,...,...
9067,TO,2019-08-01,11.900000
9068,TO,2019-09-01,11.900000
9069,TO,2019-10-01,11.800000
9070,TO,2019-11-01,11.300000


In [274]:
df = pd.merge(df, desemprego_geral.drop_duplicates(), how="left", on=["estados", "data"])

In [275]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 17)
Duplicados? False


## Produção

In [276]:
REGIONS = [
    "REGIÃO NORTE",
    "REGIÃO  NORDESTE",
    "REGIÃO SUDESTE",
    "REGIÃO  SUL",
    "CENTRO-OESTE",
    "TOTAL  BRASIL",
]

STATES = [
    "ACRE",
    "ALAGOAS",
    "AMAPÁ",
    "AMAZONAS",
    "BAHIA",
    "CEARÁ",
    #"CENTRO-OESTE",
    "DISTRITO FEDERAL",
    "ESPÍRITO SANTO",
    "GOIÁS",
    "MARANHÃO",
    "MATO GROSSO",
    "MATO GROSSO DO SUL",
    "MINAS GERAIS",
    "PARANÁ",
    "PARAÍBA",
    "PARÁ",
    "PERNAMBUCO",
    "PIAUÍ",
    #"REGIÃO  NORDESTE",
    #"REGIÃO  SUL",
    #"REGIÃO NORTE",
    #"REGIÃO SUDESTE",
    "RIO DE JANEIRO",
    "RIO GRANDE DO NORTE",
    "RIO GRANDE DO SUL",
    "RONDÔNIA",
    "RORAIMA",
    "SANTA CATARINA",
    "SERGIPE",
    "SÃO PAULO",
    "TOCANTINS",
]

STATE_INITIALS = [
    "AC",
    "AL",
    "AM",
    "AP",
    "BA",
    "CE",
    "DF",
    "ES",
    "GO",
    "MA",
    "MG",
    "MS",
    "MT",
    "PA",
    "PB",
    "PE",
    "PI",
    "PR",
    "RJ",
    "RN",
    "RO",
    "RR",
    "RS",
    "SC",
    "SE",
    "SP",
    "TO",
]

MONTHS = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]

In [277]:
import numpy as np

In [278]:
def read_tab_and_preprocess(tab):
    # Lê a aba 'tab' do excel e ignore as 4 primeiras linhas onde se encontra os cabeçalhos
    df = pd.read_excel(xls, tab, skiprows=4)
    df = df.rename({"Unnamed: 0": "Estado", "e ESTADOS": "Estado", "e REGIÕES": "Estado"}, axis=1)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.replace({"...": np.nan})
    df = df.replace({"....": np.nan})
    df = df.replace({"-": np.nan})
    
    df.head()
   
    # Remove total por região
    df["STATES_KEEP"] = df["Estado"].apply(lambda x: x not in REGIONS)
    df = df[df["STATES_KEEP"]]
    
    # Remove outros valores que não são dos estados e.g. valor de ajuste
    df["STATES_KEEP"] = df["Estado"].apply(lambda x: x in STATES)
    df = df[df["STATES_KEEP"]]
    df = df.drop(["STATES_KEEP"], axis=1)
    df = df.drop(["TOTAL"], axis=1)
    
    # Ordena todo o DF pelo nome do estado
    df = df.sort_values(by="Estado")
    df = df.infer_objects()
    df = df.reset_index(drop=True)
    
    # Troca nome do estado por suas iniciais
    state_to_initials = dict(zip(STATES, STATE_INITIALS))
    df["Estado"] = df["Estado"].apply(lambda x: state_to_initials[x])
    return df

In [279]:
def transpose_df(df):
    # Cria Dataframe com duas colunas contendo estados e ano-mês
    data = data = [year + month for month in MONTHS]
    estados = df["Estado"].values
    datas = np.array([[d] * len(estados) for d in data]).flatten()
    estados = np.array([estados] * len(data)).flatten()
    df_tmp = pd.DataFrame({"data": datas, "estados": estados})
    df_tmp["data"] = df_tmp["data"].astype("int")
    
    # Extraí os valores de produção do cimento que estão horizontais
    # e os coloca na posição vertocal no novo dataframe
    producao = []
    for i in range(df.shape[0]):
        producao.append(df.drop("Estado", axis=1).T[i].values)
    producao = np.array(producao).reshape(-1, 1).squeeze()
    df_tmp["producao"] = producao
    producao = np.array(producao).flatten()
    df_tmp["producao"] = producao
    
    return df_tmp

In [280]:
xls = pd.ExcelFile("../dados/bases/producao.xlsx", engine="openpyxl")

In [281]:
# Converte todos as abas de anos do excel em dataframes
# Empilha todos os dataframes
# Salva no formato .csv
dataframes = []
for year in xls.sheet_names:
    prod = read_tab_and_preprocess(year)
    prod = transpose_df(prod)
    dataframes.append(prod)

prod = pd.concat(dataframes)
prod = prod.reset_index(drop=True)
prod.to_csv("producao_mensal_cimento_2003_2022.csv", index=False)

In [282]:
prod["data"] = pd.to_datetime(prod["data"], format='%Y%m')

In [283]:
# prod["producao"] = prod["producao"].fillna(-999)

In [284]:
prod.head(15)

Unnamed: 0,data,estados,producao
0,2003-01-01,AC,
1,2003-01-01,AL,
2,2003-01-01,AM,
3,2003-01-01,AP,
4,2003-01-01,BA,
5,2003-01-01,CE,
6,2003-01-01,DF,
7,2003-01-01,ES,
8,2003-01-01,GO,
9,2003-01-01,MA,


In [285]:
df = pd.merge(df, prod, how="left", on=["data", "estados"])

## Preços

In [286]:
mes = ['JAN', 'FEV', 'MAR', 'ABR', 'MAI', 'JUN', 'JUL', 'AGO', 'SET',
       'OUT', 'NOV', 'DEZ']
ind = [f"{i+1:02d}" for i in range(12)]
map_mes = dict(zip(mes, ind))

### Preços tonelada

In [287]:
xls = pd.ExcelFile("../dados/bases/preco_ton.xlsx", engine="openpyxl")

In [288]:
xls.sheet_names

['tabela_07.A.07']

In [289]:
preco_ton = pd.read_excel(xls, skiprows=4)

In [290]:
preco_ton

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
0,2000,JAN,88.706548,110.883185,94.250707,94.250707,105.339025,99.794866,116.427344,110.883185,...,83.162388,94.250707,94.250707,99.794866,138.603981,110.883185,121.971503,88.706548,99.794866,102.69895
1,,FEV,90.125613,112.657016,98.574889,95.758463,107.024165,100.264744,112.657016,112.657016,...,93.505323,97.448319,98.236918,101.391314,140.821270,112.657016,123.922717,95.758463,101.503971,104.787118
2,,MAR,91.848450,114.810563,100.459242,97.588978,109.070034,103.329506,120.551091,109.070034,...,96.440873,97.014925,100.574053,103.329506,149.253731,120.551091,126.291619,97.588978,105.281286,107.446285
3,,ABR,90.487501,113.109377,98.970705,96.142970,107.453908,100.667345,113.109377,101.798439,...,95.011876,95.577423,99.536252,101.798439,147.042190,118.764846,124.420314,96.142970,103.834408,105.557979
4,,MAI,82.061382,114.885935,96.613600,93.002900,103.944417,97.379507,114.885935,103.944417,...,89.720444,92.455824,98.254828,98.473658,142.239729,109.415176,125.827452,93.002900,101.099623,102.433446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,,DEZ,,,,,,,,,...,,,,,,,,,,...
276,Fonte: Sinduscons Estaduais e Banco de Dados-C...,,,,,,,,,,...,,,,,,,,,,
277,Elaboração: Banco de Dados-CBIC,,,,,,,,,,...,,,,,,,,,,
278,(*) Taxa de Câmbio utilizada: Média mensal.,,,,,,,,,,...,,,,,,,,,,


In [291]:
preco_ton = preco_ton.rename({'Unnamed: 0': 'ano', 
              'Unnamed: 1': 'mes'},
            axis=1)

preco_ton = preco_ton.drop("Brasil", axis=1)

In [292]:
preco_ton.head(10)

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PA,PB,PE,PR,RJ,RO,RS,SC,SE,SP
0,2000.0,JAN,88.706548,110.883185,94.250707,94.250707,105.339025,99.794866,116.427344,110.883185,...,66.529911,83.162388,94.250707,94.250707,99.794866,138.603981,110.883185,121.971503,88.706548,99.794866
1,,FEV,90.125613,112.657016,98.574889,95.758463,107.024165,100.264744,112.657016,112.657016,...,67.594209,93.505323,97.448319,98.236918,101.391314,140.82127,112.657016,123.922717,95.758463,101.503971
2,,MAR,91.84845,114.810563,100.459242,97.588978,109.070034,103.329506,120.551091,109.070034,...,68.886338,96.440873,97.014925,100.574053,103.329506,149.253731,120.551091,126.291619,97.588978,105.281286
3,,ABR,90.487501,113.109377,98.970705,96.14297,107.453908,100.667345,113.109377,101.798439,...,67.865626,95.011876,95.577423,99.536252,101.798439,147.04219,118.764846,124.420314,96.14297,103.834408
4,,MAI,82.061382,114.885935,96.6136,93.0029,103.944417,97.379507,114.885935,103.944417,...,65.649106,89.720444,92.455824,98.254828,98.473658,142.239729,109.415176,125.827452,93.0029,101.099623
5,,JUN,88.480894,116.131173,94.01095,94.01095,110.601117,98.434994,110.601117,105.071061,...,66.36067,88.480894,94.01095,100.536415,99.541005,143.781452,116.131173,127.191285,94.01095,102.637837
6,,JUL,88.997664,116.809434,97.89743,94.560018,111.24708,100.678607,111.24708,111.24708,...,66.748248,88.997664,95.672489,106.797197,100.122372,139.05885,116.809434,127.934142,94.560018,103.904772
7,,AGO,99.491488,116.073403,101.481318,99.491488,110.546098,100.818041,110.546098,116.073403,...,66.327659,96.175105,102.807871,106.124254,99.491488,138.182622,116.073403,127.128012,93.964183,103.028963
8,,SET,97.868639,119.617225,99.826011,97.868639,108.742932,99.173554,108.742932,114.180078,...,65.245759,94.606351,100.587212,104.393214,97.868639,141.365811,114.180078,125.054371,97.868639,101.348412
9,,OUT,95.765056,117.04618,101.085337,95.765056,106.405618,95.765056,106.405618,111.725899,...,63.843371,95.765056,101.085337,101.085337,95.765056,138.327304,111.725899,122.366461,95.765056,101.085337


In [293]:
preco_ton['ano'] = preco_ton['ano'].fillna(method="ffill")

In [294]:
preco_ton.head(10)

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PA,PB,PE,PR,RJ,RO,RS,SC,SE,SP
0,2000,JAN,88.706548,110.883185,94.250707,94.250707,105.339025,99.794866,116.427344,110.883185,...,66.529911,83.162388,94.250707,94.250707,99.794866,138.603981,110.883185,121.971503,88.706548,99.794866
1,2000,FEV,90.125613,112.657016,98.574889,95.758463,107.024165,100.264744,112.657016,112.657016,...,67.594209,93.505323,97.448319,98.236918,101.391314,140.82127,112.657016,123.922717,95.758463,101.503971
2,2000,MAR,91.84845,114.810563,100.459242,97.588978,109.070034,103.329506,120.551091,109.070034,...,68.886338,96.440873,97.014925,100.574053,103.329506,149.253731,120.551091,126.291619,97.588978,105.281286
3,2000,ABR,90.487501,113.109377,98.970705,96.14297,107.453908,100.667345,113.109377,101.798439,...,67.865626,95.011876,95.577423,99.536252,101.798439,147.04219,118.764846,124.420314,96.14297,103.834408
4,2000,MAI,82.061382,114.885935,96.6136,93.0029,103.944417,97.379507,114.885935,103.944417,...,65.649106,89.720444,92.455824,98.254828,98.473658,142.239729,109.415176,125.827452,93.0029,101.099623
5,2000,JUN,88.480894,116.131173,94.01095,94.01095,110.601117,98.434994,110.601117,105.071061,...,66.36067,88.480894,94.01095,100.536415,99.541005,143.781452,116.131173,127.191285,94.01095,102.637837
6,2000,JUL,88.997664,116.809434,97.89743,94.560018,111.24708,100.678607,111.24708,111.24708,...,66.748248,88.997664,95.672489,106.797197,100.122372,139.05885,116.809434,127.934142,94.560018,103.904772
7,2000,AGO,99.491488,116.073403,101.481318,99.491488,110.546098,100.818041,110.546098,116.073403,...,66.327659,96.175105,102.807871,106.124254,99.491488,138.182622,116.073403,127.128012,93.964183,103.028963
8,2000,SET,97.868639,119.617225,99.826011,97.868639,108.742932,99.173554,108.742932,114.180078,...,65.245759,94.606351,100.587212,104.393214,97.868639,141.365811,114.180078,125.054371,97.868639,101.348412
9,2000,OUT,95.765056,117.04618,101.085337,95.765056,106.405618,95.765056,106.405618,111.725899,...,63.843371,95.765056,101.085337,101.085337,95.765056,138.327304,111.725899,122.366461,95.765056,101.085337


In [295]:
preco_ton.tail()

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PA,PB,PE,PR,RJ,RO,RS,SC,SE,SP
275,2022,DEZ,,,,,,,,,...,,,,,,,,,,
276,Fonte: Sinduscons Estaduais e Banco de Dados-C...,,,,,,,,,,...,,,,,,,,,,
277,Elaboração: Banco de Dados-CBIC,,,,,,,,,,...,,,,,,,,,,
278,(*) Taxa de Câmbio utilizada: Média mensal.,,,,,,,,,,...,,,,,,,,,,
279,(...) Dado não Disponível,,,,,,,,,,...,,,,,,,,,,


In [296]:
preco_ton = preco_ton.iloc[:-4]

In [297]:
preco_ton = preco_ton[preco_ton["ano"] < 2020]

In [298]:
def convert_months (x):

    return map_mes[x]

In [299]:
preco_ton["mes"] = preco_ton["mes"].apply(convert_months)

In [300]:
preco_ton["data"] = pd.to_datetime(preco_ton["mes"].astype('int64') + preco_ton["ano"] * 100 , format='%Y%m')

In [301]:
preco_ton = preco_ton.drop(["ano", 'mes'], axis=1)

In [302]:
preco_ton.columns

Index(['AL', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA',
       'PB', 'PE', 'PR', 'RJ', 'RO', 'RS', 'SC', 'SE', 'SP', 'data'],
      dtype='object')

In [303]:
preco_ton = pd.melt(preco_ton, id_vars=["data"], value_vars=preco_ton.columns[:-1], var_name='estados', value_name='preco_ton')

In [304]:
preco_ton['preco_ton'] = preco_ton['preco_ton'].replace({"...": np.nan})

In [305]:
df = pd.merge(df, preco_ton, how="left", on=["data", "estados"])

### Preço saco

In [307]:
xls = pd.ExcelFile("../dados/bases/preco_saco.xlsx", engine="openpyxl")

In [308]:
xls.sheet_names

['tabela_07.A.06']

In [309]:
preco_saco = pd.read_excel(xls, skiprows=3)

In [310]:
preco_saco.head(10)

Unnamed: 0,ANO/MÊS,Unnamed: 1,UNIDADES DA FEDERAÇÃO,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Média
0,,,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
1,1994.0,JUN,...,6.834909,6.4,6.6,5.204727,5.1,5.627273,7,...,6.5,6.536727,5.5,5.6,7.041636,5.725455,6,...,4.699091,5.880201
2,,JUL,...,7,6.5,6.5,6,5,5.5,7,...,6.5,6.5,5.5,5.5,8.5,5.5,6,...,5,6.105263
3,,AGO,...,6.5,6,6.5,6,5,5.5,6.5,...,6.5,6.5,5.5,5.5,8.5,5.5,6,...,5,5.973684
4,,SET,...,6.5,6.5,6.5,5.5,5,5.5,6.5,...,6.5,6.5,5.5,5.5,8.5,5.5,5.5,...,5,5.947368
5,,OUT,...,6.5,5.5,6.5,5.5,5,6,6.5,...,6.5,6,5,5.5,8.5,5.5,6,...,5,5.894737
6,,NOV,...,6.5,5.5,6.5,5.5,5,6,7,...,6.5,6.5,5,5.5,8,5.5,6,...,5,5.921053
7,,DEZ,...,7,5.5,6.5,5.5,5,5.5,7,...,6.5,6.5,5,5.5,7.5,5.5,6,...,5,5.921053
8,1995.0,JAN,...,7,5.5,6.5,5.5,5,5.5,7,...,6.5,6.5,5.5,5.5,7.5,5.5,5.5,...,5,5.894737
9,,FEV,...,7,5.5,6.5,5.5,5,5.5,7,...,6.5,6.5,5,5.5,8,5.5,5.5,...,5,5.921053


In [311]:
preco_saco = preco_saco.rename({'ANO/MÊS':'ano', 'Unnamed: 1':'mes'}, axis=1)

In [312]:
preco_saco.iloc[0].values

array([nan, nan, 'AL', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG',
       'MS', 'MT', 'PA', 'PB', 'PE', 'PR', 'RJ', 'RO', 'RS', 'SC', 'SE',
       'SP', 'Brasil'], dtype=object)

In [313]:
preco_saco.columns=['ano', 'mes'] + list(preco_saco.iloc[0].values[2:])
preco_saco = preco_saco.iloc[1:]

In [314]:
preco_saco = preco_saco.iloc[:-3]

In [315]:
preco_saco

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
1,1994,JUN,...,6.834909,6.4,6.6,5.204727,5.1,5.627273,7,...,6.5,6.536727,5.5,5.6,7.041636,5.725455,6,...,4.699091,5.880201
2,,JUL,...,7,6.5,6.5,6,5,5.5,7,...,6.5,6.5,5.5,5.5,8.5,5.5,6,...,5,6.105263
3,,AGO,...,6.5,6,6.5,6,5,5.5,6.5,...,6.5,6.5,5.5,5.5,8.5,5.5,6,...,5,5.973684
4,,SET,...,6.5,6.5,6.5,5.5,5,5.5,6.5,...,6.5,6.5,5.5,5.5,8.5,5.5,5.5,...,5,5.947368
5,,OUT,...,6.5,5.5,6.5,5.5,5,6,6.5,...,6.5,6,5,5.5,8.5,5.5,6,...,5,5.894737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,,AGO,32.5,39.75,33.74,33,31.9,29.78,31.65,34,...,26.85,34.5,33,34,48.5,37,31.6,37,35.62,...
340,,SET,,,,,,,,,...,,,,,,,,,,...
341,,OUT,,,,,,,,,...,,,,,,,,,,...
342,,NOV,,,,,,,,,...,,,,,,,,,,...


In [316]:
preco_saco['ano'] = preco_saco['ano'].fillna(method='ffill')

In [317]:
preco_saco = preco_saco.replace(f"...", np.nan)

In [318]:
preco_saco

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
1,1994,JUN,,6.834909,6.40,6.6,5.204727,5.10,5.627273,7.0,...,6.50,6.536727,5.5,5.6,7.041636,5.725455,6.0,,4.699091,5.880201
2,1994,JUL,,7.000000,6.50,6.5,6.000000,5.00,5.500000,7.0,...,6.50,6.500000,5.5,5.5,8.500000,5.500000,6.0,,5.000000,6.105263
3,1994,AGO,,6.500000,6.00,6.5,6.000000,5.00,5.500000,6.5,...,6.50,6.500000,5.5,5.5,8.500000,5.500000,6.0,,5.000000,5.973684
4,1994,SET,,6.500000,6.50,6.5,5.500000,5.00,5.500000,6.5,...,6.50,6.500000,5.5,5.5,8.500000,5.500000,5.5,,5.000000,5.947368
5,1994,OUT,,6.500000,5.50,6.5,5.500000,5.00,6.000000,6.5,...,6.50,6.000000,5.0,5.5,8.500000,5.500000,6.0,,5.000000,5.894737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,2022,AGO,32.5,39.750000,33.74,33.0,31.900000,29.78,31.650000,34.0,...,26.85,34.500000,33.0,34.0,48.500000,37.000000,31.6,37.0,35.620000,
340,2022,SET,,,,,,,,,...,,,,,,,,,,
341,2022,OUT,,,,,,,,,...,,,,,,,,,,
342,2022,NOV,,,,,,,,,...,,,,,,,,,,


In [319]:
preco_saco["mes"] = preco_saco["mes"].apply(convert_months)

In [320]:
preco_saco = preco_saco[preco_saco['ano'] < 2020]

In [321]:
preco_saco["data"] = pd.to_datetime(preco_saco["mes"].astype('int64') + preco_saco["ano"] * 100 , format='%Y%m')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preco_saco["data"] = pd.to_datetime(preco_saco["mes"].astype('int64') + preco_saco["ano"] * 100 , format='%Y%m')


In [322]:
preco_saco = preco_saco.drop(['ano', 'mes', 'Brasil'], axis=1)

In [323]:
preco_saco = pd.melt(preco_saco, id_vars=['data'], value_vars=preco_saco.iloc[:-1], var_name='estados', value_name='preco_saco')

In [324]:
df=pd.merge(df, preco_saco, how="left", on=["data", "estados"])

In [325]:
search_duplicates(df)

Todas as datas repetem 27 vezes? True
Estados unicos: 27
Shape: (11340, 20)
Duplicados? False


### Preço kg

In [326]:
xls = pd.ExcelFile("../dados/bases/preco_kg.xlsx", engine="openpyxl")

In [327]:
xls.sheet_names

['tabela_07.A.05']

In [328]:
preco_kg = pd.read_excel(xls, skiprows=3)

In [329]:
preco_kg.head()

Unnamed: 0,ANO/MÊS,Unnamed: 1,Unnamed: 2,UNIDADES DA FEDERAÇÃO,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Média
0,,,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
1,1994.0,JUN,...,0.136698,0.128,0.132,0.104095,0.102,0.112545,0.14,...,0.13,0.130735,0.11,0.112,0.140833,0.114509,0.12,...,0.093982,0.117604
2,,JUL,...,0.14,0.13,0.13,0.12,0.1,0.11,0.14,...,0.13,0.13,0.11,0.11,0.17,0.11,0.12,...,0.1,0.122105
3,,AGO,...,0.13,0.12,0.13,0.12,0.1,0.11,0.13,...,0.13,0.13,0.11,0.11,0.17,0.11,0.12,...,0.1,0.119474
4,,SET,...,0.13,0.13,0.13,0.11,0.1,0.11,0.13,...,0.13,0.13,0.11,0.11,0.17,0.11,0.11,...,0.1,0.118947


In [330]:
preco_kg.columns=['ano', 'mes'] + list(preco_kg.iloc[0].values[2:])
preco_kg = preco_kg.iloc[1:]

In [331]:
preco_kg.tail()

Unnamed: 0,ano,mes,AL,AM,BA,CE,DF,ES,GO,MA,...,PB,PE,PR,RJ,RO,RS,SC,SE,SP,Brasil
342,,NOV,,,,,,,,,...,,,,,,,,,,...
343,,DEZ,,,,,,,,,...,,,,,,,,,,...
344,Fonte: Sinduscons Estaduais e Banco de Dados-C...,,,,,,,,,,...,,,,,,,,,,
345,Elaboração: Banco de Dados-CBIC,,,,,,,,,,...,,,,,,,,,,
346,(...) Dado não disponível.,,,,,,,,,,...,,,,,,,,,,


In [332]:
preco_kg = preco_kg.iloc[:-3]

In [333]:
preco_kg["mes"] = preco_kg["mes"].apply(convert_months)

In [334]:
preco_kg["ano"] = preco_kg["ano"].fillna(method='ffill')
preco_kg = preco_kg[preco_kg['ano'] < 2020]
preco_kg["data"] = pd.to_datetime(preco_kg["mes"].astype('int64') + preco_kg["ano"] * 100 , format='%Y%m')

In [335]:
preco_kg = preco_kg.drop(['ano', 'mes','Brasil'], axis=1)

In [336]:
preco_kg = pd.melt(preco_kg, id_vars=['data'], value_vars=preco_kg.iloc[:-1], var_name='estados', value_name='preco_kg')

In [337]:
preco_kg = preco_kg.replace(f"...", np.nan)

In [338]:
df = pd.merge(df, preco_kg, how="left", on=["data", "estados"])

# Salvando os dados

In [339]:
df

Unnamed: 0,estados,data,pib_pc,pib_pmc,pib_pcpt,pib_cc,populacao,incc,ipca,igp,...,idh_l,idh_r,idh_e,nfsp,estoque,desemprego,producao,preco_ton,preco_saco,preco_kg
0,AC,1985-01-01,2.719967e+05,5.168208e-05,0.767970,207486.441959,,7.53,11.76,12.64,...,,,,,3.799125e+06,,,,,
1,AC,1985-02-01,2.719967e+05,5.168208e-05,0.767970,207486.441959,,13.05,23.90,10.16,...,,,,,3.799125e+06,,,,,
2,AC,1985-03-01,2.719967e+05,5.168208e-05,0.767970,207486.441959,,11.59,36.49,12.71,...,,,,,3.799125e+06,,,,,
3,AC,1985-04-01,2.719967e+05,5.168208e-05,0.767970,207486.441959,,8.80,47.68,7.22,...,,,,,3.799125e+06,,,,,
4,AC,1985-05-01,2.719967e+05,5.168208e-05,0.767970,207486.441959,,22.42,58.31,7.78,...,,,,,3.799125e+06,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11335,TO,2019-08-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.42,2.54,-0.51,...,0.81,0.7,0.73,0.62,7.827728e+06,11.9,,,,
11336,TO,2019-09-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.46,2.49,0.50,...,0.81,0.7,0.73,0.59,7.827728e+06,11.9,114896.0,,,
11337,TO,2019-10-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.18,2.60,0.55,...,0.81,0.7,0.73,0.57,7.827728e+06,11.8,,,,
11338,TO,2019-11-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.04,3.12,0.85,...,0.81,0.7,0.73,0.54,7.827728e+06,11.3,166611.0,,,


In [340]:
df=df[df["data"] >= pd.to_datetime("1991-01-01")].reset_index()

In [341]:
df = df.drop("index", axis=1)

In [342]:
df.to_csv("../dados/input/input-new.csv", index=False)

In [606]:
df

Unnamed: 0,estados,data,pib_pc,pib_pmc,pib_pcpt,pib_cc,populacao,incc,ipca,igp,...,idh_l,idh_r,idh_e,nfsp,estoque,desemprego,producao,preco_ton,preco_saco,preco_kg
0,AC,1991-01-01,3.130869e+05,6.734017e+00,0.726939,213061.839872,417102.0,17.03,20.75,19.93,...,0.645,0.574,0.176,11.17,4.782147e+06,,,,,
1,AC,1991-02-01,3.130869e+05,6.734017e+00,0.726939,213061.839872,417102.0,15.50,45.77,21.11,...,0.645,0.574,0.176,11.17,4.782147e+06,,,,,
2,AC,1991-03-01,3.130869e+05,6.734017e+00,0.726939,213061.839872,417102.0,8.33,63.15,7.25,...,0.645,0.574,0.176,11.17,4.782147e+06,,,,,
3,AC,1991-04-01,3.130869e+05,6.734017e+00,0.726939,213061.839872,417102.0,6.77,71.29,8.74,...,0.645,0.574,0.176,11.17,4.782147e+06,,,,,
4,AC,1991-05-01,3.130869e+05,6.734017e+00,0.726939,213061.839872,417102.0,13.19,84.02,6.53,...,0.645,0.574,0.176,11.17,4.782147e+06,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9391,TO,2019-08-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.42,2.54,-0.51,...,0.810,0.700,0.730,0.62,7.827728e+06,11.9,-999.0,,,
9392,TO,2019-09-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.46,2.49,0.50,...,0.810,0.700,0.730,0.59,7.827728e+06,11.9,114896.0,,,
9393,TO,2019-10-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.18,2.60,0.55,...,0.810,0.700,0.730,0.57,7.827728e+06,11.8,-999.0,,,
9394,TO,2019-11-01,1.685142e+06,2.961535e+06,1.071383,890283.634075,1572866.0,0.04,3.12,0.85,...,0.810,0.700,0.730,0.54,7.827728e+06,11.3,166611.0,,,


# Consumo

In [None]:
PATH = "https://raw.githubusercontent.com/LeiteJu/TCC/main/dados/bases/"

SIGLAS = ["AC","AL","AM","AP","BA","CE",
          "DF","ES","GO","MA","MG","MS",
          "MT","PA","PB","PE","PI","PR",
          "RJ","RN","RO","RR","RS","SC",
          "SE","SP","TO"]

ESTADOS = ['Acre', 'Alagoas', 'Amazonas', 'Amapá', 'Bahia', 'Ceará',
  'Distrito Federal', 'Espírito Santo', 'Goiás', 'Maranhão',
  'Minas Gerais', 'Mato Grosso do Sul', 'Mato Grosso', 'Pará', 'Paraíba',
  'Pernambuco', 'Piauí', 'Paraná', 'Rio de Janeiro',
  'Rio Grande do Norte', 'Rondônia', 'Roraima', 'Rio Grande do Sul',
  'Santa Catarina', 'Sergipe', 'São Paulo', 'Tocantins']

MAP = dict(zip(ESTADOS, SIGLAS))

In [408]:
df_91_00.columns.to_list()

['estados', 'data', 'consumo']

### 2003 em diante

In [11]:
ESTADOS=['ACRE', 'ALAGOAS', 'AMAZONAS', 'AMAPÁ', 'BAHIA', ' CEARÁ',
         'DISTRITO FEDERAL', 'ESPÍRITO SANTO', 'GOIÁS', 'MARANHÃO', 'MINAS GERAIS', 'MATO GROSSO DO SUL', 
         'MATO GROSSO', 'PARÁ', 'PARAÍBA', 'PERNAMBUCO', 'PIAUÍ', 'PARANÁ',
         'RIO DE JANEIRO', 'RIO GRANDE DO NORTE', 'RONDÔNIA',  'RORAIMA', 'RIO GRANDE DO SUL', 'SANTA CATARINA', 
         'SERGIPE', 'SÃO PAULO', 'TOCANTINS', 'CEARÁ']

SIGLAS = ["AC","AL","AM","AP","BA","CE",
          "DF","ES","GO","MA","MG","MS",
          "MT","PA","PB","PE","PI","PR",
          "RJ","RN","RO","RR","RS","SC",
          "SE","SP","TO", 'CE']

MAP = dict(zip(ESTADOS, SIGLAS))

def estados2sigla(x):
    return MAP[x]

In [12]:
def read_table(file, ano):
    
    # ler o arquivo 
    df = pd.read_excel(file, ano, skiprows=3, header=1)
    
    # renomear coluna dos estados e dropar o total
    df = df.rename({'Unnamed: 0':  'estados'}, axis=1)
    df = df.drop("TOTAL", axis=1)
    
    # converte de estados pra siglas
    df = df[df['estados'].isin(ESTADOS)]
    df['estados'] = df['estados'].apply(estados2sigla)
    
    df = df.dropna()
    
    # cria coluna com meses
    df = pd.melt(df, id_vars="estados", var_name="meses", value_name='consumo')
    
    MESES = ['JAN', 'FEV', 'MAR', 'ABR', 'MAI', 'JUN', 
         'JUL', 'AGO', 'SET','OUT', 'NOV', 'DEZ']
    
    def categorize (x):
        return f"{1+MESES.index(x):02d}"
    
    # converte mes para numero
    df["meses"] = df["meses"].apply(categorize)
    
    # obtem a data 
    df['data'] = pd.to_datetime(df['meses'].astype('int64') + int(ano) * 100, format="%Y%m")
    
    return df[['estados', 'data', 'consumo']]
    

In [13]:
xls = pd.ExcelFile(f"{PATH}Consumo_18_anos_mensais.xlsx")

In [14]:
xls.sheet_names

['2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021']

In [24]:
y=pd.DataFrame(columns=['estados', 'data', 'consumo'])

for table in xls.sheet_names[:-2]:
    df_ano = read_table(xls, table)
    y=y.append(df_ano)

  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)
  y=y.append(df_ano)


In [25]:
y=y.sort_values(["data", "estados"])

In [26]:
y[y["consumo"] == "..."]["data"].unique()

array(['2014-03-01T00:00:00.000000000', '2014-04-01T00:00:00.000000000',
       '2014-05-01T00:00:00.000000000', '2014-06-01T00:00:00.000000000',
       '2014-07-01T00:00:00.000000000', '2014-08-01T00:00:00.000000000',
       '2014-09-01T00:00:00.000000000', '2014-10-01T00:00:00.000000000',
       '2014-11-01T00:00:00.000000000', '2014-12-01T00:00:00.000000000',
       '2015-01-01T00:00:00.000000000', '2015-02-01T00:00:00.000000000',
       '2015-03-01T00:00:00.000000000', '2015-04-01T00:00:00.000000000',
       '2015-05-01T00:00:00.000000000', '2015-06-01T00:00:00.000000000',
       '2015-07-01T00:00:00.000000000', '2015-08-01T00:00:00.000000000',
       '2015-09-01T00:00:00.000000000', '2015-10-01T00:00:00.000000000',
       '2015-11-01T00:00:00.000000000', '2015-12-01T00:00:00.000000000',
       '2016-01-01T00:00:00.000000000', '2016-02-01T00:00:00.000000000',
       '2016-03-01T00:00:00.000000000', '2016-04-01T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-06-01

## Dados do consumo de 2014 a 2016

In [50]:
y_2014 = pd.read_csv(f"{PATH}consumo_2014_mensais.csv")

HTTPError: HTTP Error 404: Not Found

In [None]:
y_2014=y_2014.infer_objects()

In [47]:
y_2014.head()

Unnamed: 0,REGIÕES_E_ESTADOS,Jan,Fev,Mar,Abr,Mai,Jun,Jul,Ago,Set,Out,Nov,Dez,TOTAL
0,REGIÃO_NORTE,419.454,369.631,385.252,433.438,397.871,450.903,449.819,486.738,477.607,515.475,458.656,421.411,5.266.256
1,Rondônia,54.117,59.15,57.345,56.187,63.403,73.497,73.696,72.366,75.142,73.101,62.809,51.082,771.895
2,Acre,14.602,17.681,6.222,18.829,16.499,15.178,22.065,19.242,23.464,19.835,21.601,15.832,211.050
3,Amazonas,63.088,80.43,81.788,72.995,46.056,96.525,58.639,96.194,59.788,94.092,53.72,57.359,860.674
4,Roraima,10.334,9.723,10.85,11.072,9.501,11.617,10.944,11.447,11.006,13.322,11.245,8.592,129.653


In [48]:
y_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   REGIÕES_E_ESTADOS  36 non-null     object
 1   Jan                36 non-null     object
 2   Fev                36 non-null     object
 3   Mar                36 non-null     object
 4   Abr                36 non-null     object
 5   Mai                36 non-null     object
 6   Jun                36 non-null     object
 7   Jul                36 non-null     object
 8   Ago                36 non-null     object
 9   Set                36 non-null     object
 10  Out                36 non-null     object
 11  Nov                36 non-null     object
 12  Dez                36 non-null     object
 13  TOTAL              36 non-null     object
dtypes: object(14)
memory usage: 4.1+ KB


In [422]:
consumo.to_csv('../dados/input/target.csv', index=False)