In [1]:
import warnings
import gc
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.quantile_regression import QuantReg
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import glob

In [None]:
pd.set_option('display.max_columns', None)

## Funções

In [2]:
def pipeline(data):

    data['NAO_BRANCO'] = np.where(~data['TP_COR_RACA'].isin([1,4]), 1, 0)
    data['MULHER'] = np.where(data['TP_SEXO'] == 'F', 1, 0)
    data.drop(columns=['TP_SEXO'], inplace=True)

    data.Q002 = data.Q002.fillna('H')
    data.Q001 = data.Q001.fillna('H')
    #data.Q006 = data.Q006.fillna(0)

    data.IND_CASA = data.IND_CASA.fillna(0)
    data.IND_CASA = data.IND_CASA.apply(lambda x: np.round(x, 2))
    data['SOLTEIRO'] = np.where(data['TP_ESTADO_CIVIL'].isin([0,1]), 1, 0)

    le = LabelEncoder()
    for col in ['Q001', 'Q002', 'Q006', 'Q022', 'Q024']:
        arr = data[col].values
        le.fit(arr)
        data[col] = le.transform(arr)
    data.Q001 = data.Q001 + 1
    data.Q002 = data.Q002 + 1
    data.Q001 = data.Q001.replace(8,0)
    data.Q002 = data.Q002.replace(8,0) 

    data.dropna(subset=['NT_MEDIA'], inplace=True)

    return data

In [5]:
def return_nan(df):
    na_cols = list()
    for col in df.columns:
        if df[col].isna().any():
            print(col, df[col].isna().sum())
            na_cols.append(col)
    return na_cols

In [34]:
def fit_model(model, q):
    res = model.fit(
        q=q, 
        vcov='robust',
        kernel='epa',
        bandwidth='hsheather', 
        max_iter=1000,
        p_tol=1e-06
    )

    return res.summary()

In [33]:
def create_model(df, cols):
  mod = smf.quantreg(cols, data=df, missing='drop')
  return mod

# Modelo

In [16]:
data = pd.read_parquet('../../Resultados/Agg/ENEM/aux/dados_para_modelagem.parquet')

In [18]:
data = pipeline(data)

In [21]:
na_cols = return_nan(data)
data.dropna(subset=['CO_MUNICIPIO_ESC'], inplace=True)

CO_MUNICIPIO_ESC 812110
NO_MUNICIPIO_ESC 812110
SG_UF_ESC 812110
TP_DEPENDENCIA_ADM_ESC 812110
TP_ESTADO_CIVIL 22365
TP_LOCALIZACAO_ESC 812110
Q005 8133
Q025 8133


In [22]:
data.shape

(2943384, 26)

In [27]:
data.groupby('NU_ANO').size()

NU_ANO
2018.0    1022511
2019.0     879461
2020.0     511180
2021.0     530232
dtype: int64

### Remoção de caracteres proibidos dos nomes das colunas

---



In [28]:
data.head()

Unnamed: 0,NU_ANO,CO_MUNICIPIO_ESC,NO_MUNICIPIO_ESC,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_FAIXA_ETARIA,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ESCOLA,TP_LOCALIZACAO_ESC,...,Q006,Q022,Q024,Q025,IND_CASA,NT_MEDIA,LOG_NT_MEDIA,NAO_BRANCO,MULHER,SOLTEIRO
0,2018.0,1500800,Ananindeua,PA,4.0,2.0,0.0,4.0,4.0,1.0,...,5,3,2,1.0,7.79,503.96,6.222497,0,1,1
2,2018.0,3155603,Rio Pardo de Minas,MG,2.0,3.0,0.0,0.0,2.0,1.0,...,1,3,0,0.0,6.79,497.84,6.210279,1,0,1
3,2018.0,1500800,Ananindeua,PA,4.0,2.0,0.0,3.0,4.0,1.0,...,8,3,0,1.0,7.79,675.64,6.51566,1,1,1
5,2018.0,3161106,São Francisco,MG,2.0,2.0,0.0,3.0,2.0,1.0,...,3,3,0,1.0,5.79,497.78,6.210158,1,1,1
11,2018.0,2927408,Salvador,BA,2.0,2.0,0.0,0.0,2.0,1.0,...,2,3,0,0.0,5.79,576.42,6.356837,1,1,1


 Selecionando alunos de escola pública, de áreas urbanas e rurais separadamente

In [29]:
pub = data[data.TP_ESCOLA==2]

## Regressão

In [31]:
def run_training(df, y_col, file_name=None):
  independent_vars = ['NU_ANO', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NT_MEDIA', 'LOG_NT_MEDIA', 'LOG_NT_MT']
  formula = f"{y_col} ~ {' + '.join(col for col in df.columns if col not in independent_vars)}"
  mod = create_model(df, formula)

  # Fit no modelo
  quantiles = np.arange(0.05, 0.96, 0.1)

  models = {x: fit_model(mod, x) for x in quantiles}

  # Persistindo resultados
  dfs = []
  metadatas = []  

  for k, v in models.items():
    # Note that tables is a list. The table at index 1 is the "core" table. Additionally, read_html puts dfs in a list, so we want index 0
    results_as_html = v.tables[1].as_html()
    metadata = v.tables[0].as_html()
    metadata = pd.read_html(metadata, header=0, index_col=0)[0]
    df = pd.read_html(results_as_html, header=0, index_col=0)[0]
    df['quantile'] = k
    metadata['quantile'] = k
    dfs.append(df)
    metadatas.append(metadata)
  
  all = pd.concat(dfs)
  meta = pd.concat(metadatas, axis=1)

  # Salvando localmente resultados para cada quantil
  #all.to_csv(f'/content/drive/MyDrive/Gabinete/Relatório/Results/resultQReg{file_name}')
  #meta.to_csv(f'/content/drive/MyDrive/Gabinete/Relatório/Results/metaQReg{file_name}')

  return all, meta

In [36]:
pub

Unnamed: 0,NU_ANO,CO_MUNICIPIO_ESC,NO_MUNICIPIO_ESC,SG_UF_ESC,TP_DEPENDENCIA_ADM_ESC,TP_FAIXA_ETARIA,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ESCOLA,TP_LOCALIZACAO_ESC,...,Q006,Q022,Q024,Q025,IND_CASA,NT_MEDIA,LOG_NT_MEDIA,NAO_BRANCO,MULHER,SOLTEIRO
2,2018.0,3155603,Rio Pardo de Minas,MG,2.0,3.0,0.0,0.0,2.0,1.0,...,1,3,0,0.0,6.79,497.84,6.210279,1,0,1
5,2018.0,3161106,São Francisco,MG,2.0,2.0,0.0,3.0,2.0,1.0,...,3,3,0,1.0,5.79,497.78,6.210158,1,1,1
11,2018.0,2927408,Salvador,BA,2.0,2.0,0.0,0.0,2.0,1.0,...,2,3,0,0.0,5.79,576.42,6.356837,1,1,1
13,2018.0,1500107,Abaetetuba,PA,2.0,2.0,0.0,3.0,2.0,1.0,...,5,4,0,0.0,5.79,532.22,6.277057,1,1,1
16,2018.0,2805604,Porto da Folha,SE,2.0,3.0,0.0,4.0,2.0,1.0,...,1,3,0,1.0,5.79,496.96,6.208510,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158197,2021.0,1302603,Manaus,AM,2.0,3.0,1.0,3.0,2.0,1.0,...,3,4,0,1.0,6.79,392.00,5.971262,1,0,1
158204,2021.0,3520905,Ipaussu,SP,2.0,3.0,1.0,1.0,2.0,1.0,...,3,3,1,1.0,9.79,490.68,6.195792,0,1,1
158215,2021.0,3513009,Cotia,SP,2.0,3.0,1.0,1.0,2.0,1.0,...,5,3,0,1.0,7.79,541.80,6.294897,0,0,1
158218,2021.0,2914703,Itaberaba,BA,2.0,5.0,1.0,2.0,2.0,1.0,...,1,1,1,1.0,8.79,459.76,6.130705,1,0,1


In [35]:
all, meta = run_training(pub[pub['NU_ANO']==2020], 'NU_NOTA_MT')

MemoryError: Unable to allocate 29.2 GiB for an array with shape (369108, 10610) and data type float64

In [None]:
for ano in range(2018,2021+1):
    

In [None]:
r19, meta19 = run_training(df=pub19, y_col='LOG_NT_MT', file_name='19-Publicas-Urbanas.csv')

ValueError: ignored

In [None]:
independent_vars = ['NU_ANO', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NT_MEDIA', 'LOG_NT_MEDIA', 'LOG_NT_MT']

In [None]:
ntMatXall = f"LOG_NT_MT ~ {' + '.join(col for col in urb_pub19.columns if col not in independent_vars)}"

In [None]:
mod = create_model(urb_pub19, ntMatXall)

In [None]:
res = mod.fit(
    q=0.5, 
    vcov='robust',
    kernel='gau',
    bandwidth='hsheather', 
    max_iter=1000,
    p_tol=1e-06
)

ValueError: ignored

In [None]:
quantiles = np.arange(0.05, 0.96, 0.1)

models = {x: fit_model(mod, x) for x in quantiles}

ValueError: ignored

In [None]:
dfs = []
metadatas = []

In [None]:
for k, v in models.items():
  # Note that tables is a list. The table at index 1 is the "core" table. Additionally, read_html puts dfs in a list, so we want index 0
  results_as_html = v.tables[1].as_html()
  metadata = v.tables[0].as_html()
  metadata = pd.read_html(metadata, header=0, index_col=0)[0]
  df = pd.read_html(results_as_html, header=0, index_col=0)[0]
  df['quantile'] = k
  metadata['quantile'] = k
  dfs.append(df)
  metadatas.append(metadata)

In [None]:
all = pd.concat(dfs)
meta = pd.concat(metadatas, axis=1)

In [None]:
all.to_csv('/content/drive/MyDrive/Gabinete/Relatório/Results/resultadoQReg19-Publicas-Urbanas.csv')
meta.to_csv('/content/drive/MyDrive/Gabinete/Relatório/Results/metaQReg19-Publicas-Urbanas.csv')
