In [None]:
# Turn off swifter's progress bar by default before any swifter apply is run.
from swifter import set_defaults
set_defaults(
    progress_bar=False,
)
import pandas as pd
import numpy as np
# Not referenced by name; the notebook uses the `.swifter` accessor on DataFrames.
import swifter
import threading
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
import functools
import os
# NOTE(review): `thread_local` is not referenced by any code visible in this
# part of the notebook; confirm whether it is still needed.
thread_local = threading.local()



In [None]:
# Directory prefix (trailing slash included) that is prepended to every input
# and output file name in this notebook.
PATH_PROCESSED = "dados_processados/"
# Column labels assigned to each answers chunk before it is processed.
columns = [
    "NU_INSCRICAO",
    "TX_RESPOSTAS",
]

In [None]:
# One entry per answers file to process; each name is used as the CSV file stem.
questions_features = [
    "TX_RESPOSTAS_CN",
    "TX_RESPOSTAS_CH",
    "TX_RESPOSTAS_LC",
    "TX_RESPOSTAS_MT",
]

In [None]:
def get_representative_string(arr, boolean_list):
    """Build a ``Q<n>-<answer>`` item string for the positions flagged in a mask.

    Parameters
    ----------
    arr : array-like
        Per-question answers; position ``i`` holds the answer reported as
        question ``i + 1``.
    boolean_list : sequence of bool-like
        Flags parallel to ``arr``. Every entry is interpreted by truthiness,
        so ``[1, 0, 1]`` selects the same positions as ``[True, False, True]``.

    Returns
    -------
    str
        The selected items joined with ``";"`` and no trailing separator,
        e.g. ``"Q1-A;Q3-C"``. An empty string when no position is flagged.

    Raises
    ------
    IndexError
        If a flagged position lies beyond the end of ``arr``.
    """
    answers = np.asarray(arr)
    # Derive the flagged positions once from a real boolean mask and use them
    # for both the question number and the answer lookup. Indexing the array
    # directly with a list of 0/1 integers would make NumPy gather by position
    # instead of masking, so numbers and answers would disagree.
    flagged = np.flatnonzero(np.asarray(boolean_list, dtype=bool))
    return ";".join(f"Q{pos + 1}-{answers[pos]}" for pos in flagged)

def get_itemset(row, df_corrected):
    """Build the two item strings for one answers row.

    Parameters
    ----------
    row : pandas.Series
        One row of the answers frame. ``row["TX_RESPOSTAS"]`` is split into
        one character per question, and ``row.name`` (the row's index label)
        is used to find the matching row of ``df_corrected``.
    df_corrected : pandas.DataFrame
        Per-question flags. The first column of the matched row is skipped and
        the remaining values are interpreted as booleans.
        NOTE(review): presumably True marks a correctly answered question;
        confirm against the code that produces the CORRECTED_* files.

    Returns
    -------
    tuple of (str, str)
        ``(r, w)``: the item string for the positions whose flag is truthy,
        followed by the item string for the remaining positions.
    """
    answers = np.array(list(row["TX_RESPOSTAS"]))

    # Chunks read with ``chunksize`` keep the file-wide running index, so the
    # paired chunk normally contains this row's label and a label lookup is
    # exact. The old ``row.name % len(df_corrected)`` rule breaks on a short
    # final chunk (e.g. label 2000 in a 300-row chunk maps to position 200),
    # so it is kept only as a fallback for frames with unrelated labels.
    if row.name in df_corrected.index:
        corrected_row = df_corrected.loc[row.name]
    else:
        corrected_row = df_corrected.iloc[row.name % len(df_corrected)]

    # Explicit positional slice: drop the first column, keep the flags. Casting
    # to bool makes ``~`` a logical negation even when flags are stored as 0/1.
    flags = corrected_row.iloc[1:].to_numpy(dtype=bool)

    # Called directly rather than through a per-row ThreadPoolExecutor: the two
    # calls are tiny and CPU-bound, so a fresh pool per row only added overhead.
    r = get_representative_string(answers, flags)
    w = get_representative_string(answers, ~flags)
    return r, w

In [None]:
def process_chunks(chunk_pairs, question):
    """Compute the item strings for one chunk and append them to a per-process CSV.

    Parameters
    ----------
    chunk_pairs : tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(answers, answers_corrected)``: a chunk of the answers file and the
        chunk of the corrected file that is paired with it.
    question : str
        Name of the answers file being processed; used in the output file name.

    Returns
    -------
    None
        The result is appended, without header or index, to
        ``PATH_PROCESSED + "threads/test_<question><pid>.csv"``.
    """
    # The process id keeps concurrent pool workers from appending to the same file.
    pid = os.getpid()
    answers, answers_corrected = chunk_pairs
    # Work on a copy so the caller's chunk is not relabelled or extended.
    answers = answers.copy()
    answers.columns = columns

    itemsets = answers.swifter.apply(
        lambda row: get_itemset(row, answers_corrected), axis=1
    )
    # NOTE(review): element 0 of each pair is the string built from the truthy
    # flags and element 1 the one built from the inverted flags; confirm that
    # the WRONG/RIGHT labels match the meaning of the corrected file. The CSV
    # is written without a header, so only the column order reaches the file.
    answers["TX_WRONG"] = itemsets.apply(lambda pair: pair[0])
    answers["TX_RIGHT"] = itemsets.apply(lambda pair: pair[1])

    out_path = PATH_PROCESSED + "threads/" + "test_" + question + str(pid) + ".csv"
    # Create the output directory on first use; safe when several workers race.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    answers.to_csv(out_path, sep=",", header=False, index=False, mode="a")


In [None]:
# Number of rows per chunk read from each CSV file.
N = 1000
# Number of worker processes used to process the chunks of one file.
num_workers = 10
for question in questions_features:
    file1 = PATH_PROCESSED + question + ".csv"
    file2 = PATH_PROCESSED + "CORRECTED_" + question + ".csv"
    print(file1)
    answers_reader = pd.read_csv(file1, sep=";", header=None, chunksize=N)
    answers_corrected_reader = pd.read_csv(file2, sep=";", header=None, chunksize=N)

    # Both files are read with the same chunk size so that chunk i of one file
    # pairs with chunk i of the other. zip stops at the shorter list, so any
    # extra chunks in one file are silently left unprocessed.
    c1 = list(answers_reader)
    c2 = list(answers_corrected_reader)
    chunk_pairs = list(zip(c1, c2))

    f = functools.partial(process_chunks, question=question)
    # Size the pool from num_workers, which was previously defined but unused.
    # NOTE(review): with the "spawn" start method, workers may be unable to
    # import functions defined in a notebook; confirm on the target platform.
    with multiprocessing.Pool(processes=num_workers) as pool:
        results = pool.map(f, chunk_pairs)