In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numba import jit
from multiprocessing import Pool
import concurrent.futures

import swifter
from swifter import set_defaults
# Turn swifter's progress bar off by default for every later `.swifter` call.
set_defaults(progress_bar=False)

In [15]:
# Row cap used as `nrows` when a CSV is only sampled.
N = 100
# Folder holding the raw ENEM 2022 CSV files read by this notebook.
PATH = "microdados_enem_2022/DADOS/"
# Folder that receives the files written by this notebook.
PATH_PROCESSED = "dados_processados/"

In [16]:
# Item table of the exams (one row per question); `;`-separated, Latin-1 text.
questions = pd.read_csv(
    PATH + "ITENS_PROVA_2022.csv",
    sep=";",
    encoding="latin",
)


In [17]:
# Keep only the items whose TX_COR column equals "AZUL".
prova_azul = questions.loc[questions["TX_COR"].eq("AZUL")]

In [18]:
# First N rows of the QUEST_HAB_ESTUDO file; `;`-separated, Latin-1 text.
hab_questionary = pd.read_csv(
    PATH + "QUEST_HAB_ESTUDO.csv",
    sep=";",
    encoding="latin",
    nrows=N,
)

In [19]:
# Columns selected from each microdata chunk: NU_INSCRICAO, the four
# TX_RESPOSTAS_* answer strings and the four CO_PROVA_* exam codes.
questions_features = [
    "NU_INSCRICAO",
    "TX_RESPOSTAS_CN",
    "TX_RESPOSTAS_CH",
    "TX_RESPOSTAS_LC",
    "TX_RESPOSTAS_MT",
    "CO_PROVA_CN",
    "CO_PROVA_CH",
    "CO_PROVA_LC",
    "CO_PROVA_MT",
]

# CO_PROVA code looked up for each area, both in the item table and in the
# microdata.
values_to_match = dict(
    CO_PROVA_CN=1085,
    CO_PROVA_CH=1055,
    CO_PROVA_LC=1065,
    CO_PROVA_MT=1075,
)

# Every selected column except the leading NU_INSCRICAO (so this still
# contains the four CO_PROVA_* names at its end).
answers_features = questions_features[1:]

In [20]:
def _gabarito_for(co_prova, extra_filter=None):
    """Return the TX_GABARITO column of one exam in `prova_azul`.

    Rows are those whose CO_PROVA equals `co_prova` (and that also satisfy
    `extra_filter`, a boolean mask, when one is given), ordered by CO_POSICAO.
    """
    selected = prova_azul["CO_PROVA"] == co_prova
    if extra_filter is not None:
        selected = selected & extra_filter
    return prova_azul[selected].sort_values(by='CO_POSICAO')["TX_GABARITO"]


gabarito_CN = _gabarito_for(values_to_match['CO_PROVA_CN'])
gabarito_CH = _gabarito_for(values_to_match['CO_PROVA_CH'])
# For LC, rows whose TP_LINGUA equals 1 are left out of the key.
gabarito_LC = _gabarito_for(values_to_match['CO_PROVA_LC'], prova_azul["TP_LINGUA"] != 1)
gabarito_MT = _gabarito_for(values_to_match['CO_PROVA_MT'])

# Same order as the TX_RESPOSTAS_* columns: CN, CH, LC, MT.
gabaritos = [gabarito_CN, gabarito_CH, gabarito_LC, gabarito_MT]

## Functions to pre-process the data

In [21]:
def to_json_append(df,file):
    '''
    Append the rows of ``df`` to ``file`` in JSON Lines format.

    Each row is serialised as one JSON object per line
    (``orient='records', lines=True``) and the resulting text is appended to
    ``file``, which is created if it does not exist yet.

    Load the file with
    pd.read_json(file,orient='records',lines=True)

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to append.
    file : str or path-like
        Destination file; opened in append mode.
    '''
    # Without a path argument ``to_json`` returns the serialised text, so no
    # intermediate 'tmp.json' in the working directory is needed.
    payload = df.to_json(orient='records', lines=True)

    # Make sure the block ends with a newline so that the next append starts
    # on a fresh line instead of being glued to the last record.
    if payload and not payload.endswith('\n'):
        payload += '\n'

    # The context manager closes the handle even if the write fails.
    with open(file, 'a') as handle:
        handle.write(payload)

In [22]:
def vectorize_strings(string):
    """Return a NumPy array holding the characters of ``string``, one per element."""
    characters = [char for char in string]
    return np.array(characters)

In [23]:
def task(feature):
    """Append the raw and the corrected answers of one area to CSV files.

    Parameters
    ----------
    feature : tuple
        ``(i, area, microdados, gabaritos)``: ``area`` names the answer-string
        column of ``microdados`` and ``gabaritos[i]`` is the answer key each
        participant's answers are compared with.

    Two files under ``PATH_PROCESSED`` are appended to, both ``;``-separated
    and written without header or index:

    * ``<area>.csv``: ``NU_INSCRICAO`` plus the answer string;
    * ``CORRECTED_<area>.csv``: ``NU_INSCRICAO`` plus the element-wise
      comparison of the answers with the key.
    """
    position, area, microdados, gabaritos = feature
    answer_key = gabaritos[position]

    answer_strings = microdados[area]
    # One array of single characters per participant.
    answer_arrays = answer_strings.swifter.apply(vectorize_strings)
    is_correct = answer_arrays.swifter.apply(lambda answers: answers == answer_key)

    csv_options = dict(index=False, mode="a", sep=";", header=False)

    raw_frame = pd.DataFrame(microdados["NU_INSCRICAO"])
    raw_frame[area] = answer_strings
    raw_frame.to_csv(PATH_PROCESSED + area + ".csv", **csv_options)

    corrected_frame = pd.concat(
        (pd.DataFrame(microdados["NU_INSCRICAO"]), is_correct),
        axis=1,
    )
    corrected_frame.to_csv(PATH_PROCESSED + "CORRECTED_" + area + ".csv", **csv_options)

def get_gabaritos(answers_features,microdados,gabaritos):
    """Run ``task`` once per answer column through a multiprocessing pool.

    One ``(index, column, microdados, gabaritos)`` tuple is built for every
    entry of ``answers_features`` and ``task`` is mapped over those tuples
    with ``Pool()``. Nothing is returned; the values produced by ``task`` are
    discarded.
    """
    work_items = [
        (index, column, microdados, gabaritos)
        for index, column in enumerate(answers_features)
    ]
    with Pool() as pool:
        pool.map(task, work_items)

In [24]:
def process_chunk(chunk):
    """Correct the answers of one microdata chunk.

    Keeps the ``questions_features`` columns, drops rows with any missing
    value, retains the rows whose ``CO_PROVA_MT`` equals the code stored in
    ``values_to_match`` and passes them to ``get_gabaritos``.
    """
    complete_rows = chunk[questions_features].dropna()
    # NOTE(review): only the MT code is checked here, yet the rows are then
    # corrected for every area — confirm the other CO_PROVA_* codes always match.
    has_expected_mt = complete_rows['CO_PROVA_MT'] == values_to_match['CO_PROVA_MT']
    selected_rows = complete_rows[has_expected_mt]
    # Dropping the last four names leaves the four TX_RESPOSTAS_* columns.
    response_columns = answers_features[:-4]
    get_gabaritos(response_columns, selected_rows, gabaritos)

## Pré-processamento dos microdados

In [25]:
# Rows per chunk when streaming the microdata file. This overwrites the
# value N was given earlier in the notebook.
N = 10000
# Worker-count setting.
# NOTE(review): 1000 is far above typical core counts — confirm it is intended.
num_workers = 1000
# With `chunksize` set, read_csv hands back an iterator of DataFrames of up to
# N rows each instead of loading the whole file at once.
microdados_reader = pd.read_csv(
    PATH + "MICRODADOS_ENEM_2022.csv",
    sep=";",
    encoding="latin",
    chunksize=N,
)

In [26]:

# Process the chunks one after another.
#
# Every chunk appends to the same per-area CSV files, so handling several
# chunks at the same time lets their writes interleave in an unpredictable
# order. get_gabaritos already spreads the work of a chunk over a process
# pool, so the areas of one chunk are still handled in parallel.
#
# Iterating the reader lazily keeps a single chunk in memory at a time, and an
# exception raised while processing a chunk stops the loop right here instead
# of going unnoticed.
for chunk in microdados_reader:
    process_chunk(chunk)

Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 116570.00it/s]
Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 140070.51it/s]
Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 78739.57it/s]
Pandas Apply: 100%|██████████| 1578/1578 [00:00<00:00, 127000.13it/s]


Pandas Apply: 100%|██████████| 1567/1567 [00:00<00:00, 82188.81it/s]

Pandas Apply: 100%|██████████| 1567/1567 [00:00<00:00, 106711.60it/s]
Pandas Apply: 100%|██████████| 1567/1567 [00:00<00:00, 75186.17it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 141090.66it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 97776.94it/s]
Pandas Apply: 100%|██████████| 1616/1616 [00:00<00:00, 117075.95it/s]

Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 116220.53it/s]

Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 128793.29it/s]
Pandas Apply: 100%|██████████| 1738/1738 [00:00<00:00, 123506.10it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 136483.51it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 135282.47it/s]
Pandas Apply: 100%|██████████| 1730/1730 [00:00<00:00, 135857.44it/s]

Pandas Apply: 100%|██████████| 1799/1799 [00:00<00:00, 141358.08it/s]
Pandas Apply: 100%|██████████| 1799/1799 [00:00<00:00, 142897.37it/s]

Pandas Apply: 100%

In [27]:
# Write each answer key to its own CSV, named after the matching
# TX_RESPOSTAS_* column (offset by one to skip NU_INSCRICAO).
for i, gabarito in enumerate(gabaritos):
    feature_name = questions_features[i + 1]
    gabarito.to_csv(f"{PATH_PROCESSED}gabaritos/gabarito_{feature_name}.csv")

In [28]:
# Single-chunk run of the correction for the CH answers only (the loop stops
# after its first iteration).
#
# NOTE(review): `microdados_reader` can be iterated only once; if an earlier
# cell already consumed it, this loop body never runs — recreate the reader
# first if this cell is meant to do work.
area = "TX_RESPOSTAS_CH"

# Pick the answer key by the position of `area` in `answers_features`, the
# same pairing get_gabaritos/task use, rather than through a loop index left
# behind by another cell (which pointed at a different area's key).
answer_key = gabaritos[answers_features.index(area)]

for chunk in microdados_reader:
    microdados = chunk[questions_features].dropna()
    microdados_azul = microdados[(microdados['CO_PROVA_MT'] == values_to_match['CO_PROVA_MT'])]
    respostas = microdados_azul[area]
    vectorized_resps = respostas.swifter.apply(vectorize_strings)
    right_wrong = vectorized_resps.swifter.apply(lambda x: x == answer_key)

    # NU_INSCRICAO plus the per-character answers.
    df = pd.DataFrame(microdados_azul["NU_INSCRICAO"])
    df[area] = vectorized_resps

    outfile_path = PATH_PROCESSED + "" + area + ".csv"
    df.to_csv(outfile_path, index=False, mode="a", sep=";", header=False)

    # NU_INSCRICAO plus the comparison against the answer key.
    df = pd.DataFrame(microdados_azul["NU_INSCRICAO"])
    df = pd.concat((df, right_wrong), axis=1)

    # NOTE(review): `task` writes to "CORRECTED_" + area (with an underscore);
    # this prefix has none — confirm which file name is intended.
    outfile_path = PATH_PROCESSED + "CORRECTED" + area + ".csv"
    df.to_csv(outfile_path, index=False, mode="a", sep=";", header=False)
    break