In [1]:
import concurrent.futures
import os
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numba import jit

import swifter
from swifter import set_defaults
set_defaults(
    progress_bar=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the raw input files and of the processed outputs.
PATH = "microdados_enem_2022/DADOS/"
PATH_PROCESSED = "dados_processados/"

# Row limit passed as `nrows` when sampling auxiliary files.
N = 100

In [3]:
# Item table of the exam; later cells read its TX_COR, CO_PROVA, CO_POSICAO,
# TP_LINGUA and TX_GABARITO columns.
questions = pd.read_csv(
    PATH + "ITENS_PROVA_2022.csv",
    sep=";",
    encoding="latin",
)


In [4]:
# Keep only the items whose TX_COR is "AZUL".
prova_azul = questions.loc[questions["TX_COR"] == "AZUL"]

In [5]:
# First N rows of the QUEST_HAB_ESTUDO file.
hab_questionary = pd.read_csv(
    PATH + "QUEST_HAB_ESTUDO.csv",
    sep=";",
    encoding="latin",
    nrows=N,
)

In [6]:
# Columns read from the microdata: the enrollment id, the four answer strings
# and the four exam codes (in the order CN, CH, LC, MT).
questions_features = [
    "NU_INSCRICAO",
    "TX_RESPOSTAS_CN",
    "TX_RESPOSTAS_CH",
    "TX_RESPOSTAS_LC",
    "TX_RESPOSTAS_MT",
    "CO_PROVA_CN",
    "CO_PROVA_CH",
    "CO_PROVA_LC",
    "CO_PROVA_MT",
]

# Exam code to select for each CO_PROVA_* column.
values_to_match = dict(
    CO_PROVA_CN=1085,
    CO_PROVA_CH=1055,
    CO_PROVA_LC=1065,
    CO_PROVA_MT=1075,
)

# Every selected column except the enrollment id.
answers_features = [column for column in questions_features if column != "NU_INSCRICAO"]

In [7]:
def _gabarito_for(prova_key, extra_filter=None):
    """Return TX_GABARITO of the `prova_azul` rows matching one exam code.

    `prova_key` is a key of `values_to_match`; `extra_filter`, when given, is a
    boolean mask over `prova_azul` that is AND-ed with the exam-code match.
    The rows are ordered by CO_POSICAO before the column is extracted.
    """
    selected = prova_azul["CO_PROVA"] == values_to_match[prova_key]
    if extra_filter is not None:
        selected = selected & extra_filter
    return prova_azul[selected].sort_values(by="CO_POSICAO")["TX_GABARITO"]


gabarito_CN = _gabarito_for("CO_PROVA_CN")
gabarito_CH = _gabarito_for("CO_PROVA_CH")
# LC additionally drops the items whose TP_LINGUA equals 1
# (presumably one of the foreign-language options — confirm in the data dictionary).
gabarito_LC = _gabarito_for("CO_PROVA_LC", prova_azul["TP_LINGUA"] != 1)
gabarito_MT = _gabarito_for("CO_PROVA_MT")

# Same order as the TX_RESPOSTAS_* columns: CN, CH, LC, MT.
gabaritos = [gabarito_CN, gabarito_CH, gabarito_LC, gabarito_MT]

## Functions to preprocess the data

In [8]:
def vectorize_strings(string):
    """Split `string` into a NumPy array holding one element per character."""
    characters = [symbol for symbol in string]
    return np.array(characters)

In [9]:
def _append_csv(frame, path):
    """Append `frame` to the CSV at `path`, writing the header only once."""
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, sep=";", mode="a", header=needs_header)


def task(feature):
    """Score one answer column of a microdata chunk and append the results to disk.

    Parameters
    ----------
    feature : tuple
        ``(i, area, microdados, gabaritos)``: ``area`` is the name of the
        answer-string column of ``microdados`` to process and ``gabaritos[i]``
        is the answer key every answer string is compared with.

    Side effects
    ------------
    Appends the element-wise comparison to ``PATH_PROCESSED + area + ".csv"``
    and the split answers to ``PATH_PROCESSED + "Vectorized_" + area + ".csv"``.
    Both outputs carry the enrollment id in a column named ``NU_INCRICAO``.
    """
    i, area, microdados, gabaritos = feature
    if len(microdados) == 0:
        # Nothing to score; do not append header-only fragments.
        return

    respostas = microdados[area]
    answer_key = gabaritos[i]
    vectorized_resps = respostas.swifter.apply(vectorize_strings)
    right_wrong_corr = vectorized_resps.swifter.apply(lambda x: x == answer_key)

    # Item assignment on a Series targets a row label, not a column, so promote
    # any Series result to a one-column frame before attaching the id column.
    if isinstance(vectorized_resps, pd.Series):
        vectorized_resps = vectorized_resps.to_frame(name=area)
    if isinstance(right_wrong_corr, pd.Series):
        right_wrong_corr = right_wrong_corr.to_frame(name=area)

    # NOTE(review): the column name is spelt "NU_INCRICAO" (missing "S"); kept
    # as-is because readers of the generated files may rely on it.
    vectorized_resps["NU_INCRICAO"] = microdados["NU_INSCRICAO"]
    right_wrong_corr["NU_INCRICAO"] = microdados["NU_INSCRICAO"]

    _append_csv(right_wrong_corr, PATH_PROCESSED + area + ".csv")
    _append_csv(vectorized_resps, PATH_PROCESSED + "Vectorized_" + area + ".csv")

def get_gabaritos(answers_features, microdados, gabaritos):
    """Run `task` once per answer column, in parallel worker processes.

    Parameters
    ----------
    answers_features : sequence of str
        Answer-string column names; the position of each name selects the
        matching entry of ``gabaritos``.
    microdados : DataFrame
        Rows to score; passed unchanged to every `task` call.
    gabaritos : sequence
        Answer keys, indexed by the position of the column in
        ``answers_features``.
    """
    jobs = [
        (position, area, microdados, gabaritos)
        for position, area in enumerate(answers_features)
    ]
    if not jobs or len(microdados) == 0:
        # No work: skip the cost of starting worker processes.
        return

    # One worker per job at most; the default would start one per CPU even
    # though only len(jobs) tasks exist.
    worker_count = min(len(jobs), os.cpu_count() or 1)
    with Pool(processes=worker_count) as pool:
        pool.map(task, jobs)

In [10]:
def process_chunk(chunk):
    """Select the rows of `chunk` that sat the target exams and score them.

    Rows with a missing value in any `questions_features` column are dropped.
    A row is kept only when *every* CO_PROVA_* column equals the code listed in
    `values_to_match`: the answer keys in `gabaritos` were built from those
    codes, so a row matching only one of them would have its other areas
    compared with a key for an exam it did not take.
    """
    microdados = chunk[questions_features].dropna()

    keep = pd.Series(True, index=microdados.index)
    for column, code in values_to_match.items():
        keep &= microdados[column] == code
    microdados_azul = microdados[keep]

    # Answer-string columns only; their order must line up with `gabaritos`.
    response_columns = [
        column for column in answers_features if column.startswith("TX_RESPOSTAS_")
    ]
    get_gabaritos(response_columns, microdados_azul, gabaritos)

## Pré-processamento dos microdados

In [14]:
# Rows per chunk. NOTE(review): this rebinds the N from the configuration cell
# (used there as `nrows`), so re-running earlier cells afterwards reads more rows.
N = 10000
# NOTE(review): no longer used by this cell (see below); kept in case other
# cells read it.
num_workers = 100

# The processing functions append to files inside PATH_PROCESSED.
os.makedirs(PATH_PROCESSED, exist_ok=True)

# Chunks are processed one at a time:
#   * every chunk appends to the same per-area CSV files, so running chunks
#     concurrently lets their writes interleave;
#   * submitting every chunk to an executor up front materialises the whole
#     file in memory;
#   * a plain loop lets an exception from process_chunk surface immediately
#     instead of being left unread inside a Future.
# Each chunk is still fanned out across areas by get_gabaritos.
with pd.read_csv(
    PATH + "MICRODADOS_ENEM_2022.csv",
    encoding="latin",
    sep=";",
    chunksize=N,
) as microdados_reader:
    for chunk in microdados_reader:
        process_chunk(chunk)