In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numba import jit
from multiprocessing import Pool
import concurrent.futures

import swifter
from swifter import set_defaults
set_defaults(
    progress_bar=False,
)

In [3]:
N = 100
PATH = "microdados_enem_2022/DADOS/"
PATH_PROCESSED = "dados_processados/" 

In [4]:
questions = pd.read_csv(PATH + "ITENS_PROVA_2022.csv",encoding="latin",sep=";")

In [5]:
prova_azul = questions[questions["TX_COR"] == "AZUL"]

In [6]:
hab_questionary = pd.read_csv(PATH + "QUEST_HAB_ESTUDO.csv",encoding="latin",sep=";", nrows=N)

In [7]:
questions_features = ["NU_INSCRICAO", "TX_RESPOSTAS_CN","TX_RESPOSTAS_CH","TX_RESPOSTAS_LC","TX_RESPOSTAS_MT",'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC',
       'CO_PROVA_MT']

values_to_match = {
    'CO_PROVA_CN': 1085,
    'CO_PROVA_CH': 1055,
    'CO_PROVA_LC': 1065,
    'CO_PROVA_MT': 1075,
}
answers_features = questions_features[1:]

In [8]:
gabarito_CN = prova_azul[prova_azul["CO_PROVA"] == values_to_match['CO_PROVA_CN']].sort_values(by='CO_POSICAO')["TX_GABARITO"]
gabarito_CH = prova_azul[prova_azul["CO_PROVA"] == values_to_match['CO_PROVA_CH']].sort_values(by='CO_POSICAO')["TX_GABARITO"]
gabarito_LC = prova_azul[(prova_azul["CO_PROVA"] == values_to_match['CO_PROVA_LC']) & (prova_azul["TP_LINGUA"] != 1)].sort_values(by='CO_POSICAO')["TX_GABARITO"]
gabarito_MT = prova_azul[prova_azul["CO_PROVA"] == values_to_match['CO_PROVA_MT']].sort_values(by='CO_POSICAO')["TX_GABARITO"]

gabaritos = [gabarito_CN, gabarito_CH, gabarito_LC, gabarito_MT]

## Functions to pre_process the data

In [9]:
def to_json_append(df,file):
    '''
    Load the file with
    pd.read_json(file,orient='records',lines=True)
    '''
    df.to_json('tmp.json',orient='records',lines=True)
    #append
    f = open('tmp.json','r')
    temp = f.read()
    f.close()
    
    f = open(file,'a')
    f.write(temp )
    f.close()

In [10]:
def vectorize_strings(string):
    return np.array(list(string))

In [11]:
def task(feature):
        i,area,microdados,gabaritos = feature
        respostas = microdados[area]
        vectorized_resps   = respostas.swifter.apply(vectorize_strings)
        right_wrong_corr   = vectorized_resps.swifter.apply(lambda x: x == gabaritos[i])
        df       = pd.DataFrame(microdados["NU_INSCRICAO"])
        df[area] = respostas
        outfile_path = PATH_PROCESSED + "" + area + ".csv"
        df.to_csv(outfile_path,index=False, mode="a",sep=";",header=False)

        

        df       = pd.DataFrame(microdados["NU_INSCRICAO"])
        df       = pd.concat((df,right_wrong_corr),axis=1)

        outfile_path = PATH_PROCESSED + "CORRECTED_" + area + ".csv"
        df.to_csv(outfile_path,index=False, mode="a",sep=";",header=False)

def get_gabaritos(answers_features,microdados,gabaritos):
    values = list(enumerate(answers_features))
    values = [x + (microdados,gabaritos) for x in values]
    with Pool() as pool:
        pool.map(task,values)

In [12]:
def process_chunk(chunk):
    microdados = chunk[questions_features].dropna()
    microdados_azul = microdados[(microdados['CO_PROVA_MT'] == values_to_match['CO_PROVA_MT'])]
    get_gabaritos(answers_features[:-4],microdados_azul,gabaritos)

## Pre Processamento dos microdados

In [13]:
N = 10000
num_workers = 1000
microdados_reader = pd.read_csv(PATH + "MICRODADOS_ENEM_2022.csv",encoding="latin",sep=";",chunksize=N)

In [13]:

with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Use the executor to process each chunk in parallel
    futures = [executor.submit(process_chunk, chunk) for chunk in microdados_reader]

    # Wait for all processing tasks to complete
    concurrent.futures.wait(futures)

Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 111729.16it/s]
Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 120235.65it/s]

Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 125568.91it/s]
Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 122421.79it/s]

Pandas Apply: 100%|██████████| 1567/1567 [00:00<00:00, 84637.94it/s]]


Pandas Apply: 100%|██████████| 1567/1567 [00:00<00:00, 101969.97it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 126738.88it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 116690.97it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 99512.50it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 101622.17it/s]
Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 119884.56it/s]

Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 121667.37it/s]
Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 120286.13it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 95731.31it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 100613.51it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 93046.60it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 94336.12it/s]
Pandas Apply: 100%|██████████| 1799/1799 [00:00<00:00, 96209.94it/s]

Pandas Apply: 100%|██

In [14]:
for i,area in enumerate(gabaritos):
    area.to_csv(PATH_PROCESSED + "gabaritos/gabarito_" +  questions_features[i+1] + ".csv")

In [15]:
for chunk in microdados_reader:
    area = "TX_RESPOSTAS_CH"
    microdados = chunk[questions_features].dropna()
    print(chunk)
    microdados_azul = microdados[(microdados['CO_PROVA_MT'] == values_to_match['CO_PROVA_MT'])]
    break

       NU_INSCRICAO  NU_ANO  TP_FAIXA_ETARIA TP_SEXO  TP_ESTADO_CIVIL  \
10000  210056071640    2022                3       F                1   
10001  210056517317    2022                3       F                1   
10002  210057806368    2022                2       F                1   
10003  210054973456    2022                5       M                1   
10004  210055793207    2022                9       F                1   
...             ...     ...              ...     ...              ...   
19995  210054840403    2022                3       F                0   
19996  210055832078    2022                7       F                1   
19997  210057278020    2022                2       F                1   
19998  210054554245    2022                4       M                1   
19999  210055418494    2022                3       M                1   

       TP_COR_RACA  TP_NACIONALIDADE  TP_ST_CONCLUSAO  TP_ANO_CONCLUIU  \
10000            3                 1             