# Basic multiprocessing implementation 

In [6]:
import time
import multiprocessing
from multiprocessing import Process, Queue

import pandas as pd
import numpy as np

In [2]:
def do_some_computation(q_in, q_out):
    pid = multiprocessing.current_process().pid

    data_in = q_in.get()

    while data_in:
        q_out.put(data_in + 'UPDATED!')
        
        data_in = q_in.get()
        
        
def orchestrate_processors():
    
    data_queue, return_queue = Queue(1), Queue()
    nproc = multiprocessing.cpu_count()
    print('Number of cpus: ', nproc)
    
    procs = [Process(target=do_some_computation, args=(data_queue, return_queue,)) for x in range(nproc)]

    for p in procs:
        p.start()
    
    process_return_data_structure = []
    running = 0
    counter = 1
    while True:
        if counter > 40 and return_queue.empty():
            break
        if running < nproc:
            data_queue.put(f'String for index {counter}')
            counter += 1
            running += 1
        else:
            time.sleep(0.1)
        while not return_queue.empty():
            return_from_process = return_queue.get()
            running -= 1
            process_return_data_structure.append(return_from_process)

    for x in procs:
        data_queue.put(False)
    for proc in procs:
        proc.join()
        
    return process_return_data_structure


In [3]:
output = orchestrate_processors()

Number of cpus:  32


In [4]:
#output
len(output)

39

# Multiprocessing of Dataframe (via a groupby)

In [13]:
n = 10000

df = pd.DataFrame({
    'id': np.arange(0, n, 1),
    'color': np.random.choice(['red', 'blue', 'green', 'yellow'], size=n),
    'method': np.random.choice(['cash', 'credit'], size=n),
    'price': np.random.normal(size=n) + 100 
})

In [14]:
df

Unnamed: 0,id,color,method,price
0,0,blue,credit,99.418801
1,1,yellow,credit,99.721372
2,2,red,credit,99.088344
3,3,red,credit,102.255027
4,4,red,cash,100.042368
...,...,...,...,...
9995,9995,green,credit,100.066886
9996,9996,blue,cash,100.491765
9997,9997,yellow,credit,99.962566
9998,9998,green,credit,100.524023


In [15]:
df2 = df.groupby('color').agg({'price': 'mean', 'method': lambda x:x.value_counts().index[0]})
test1 = df2.reset_index()
test1

Unnamed: 0,color,price,method
0,blue,100.010925,cash
1,green,99.997698,cash
2,red,99.994406,cash
3,yellow,100.023259,credit


In [16]:
def process_group(q_in, q_out):
    pid = multiprocessing.current_process().pid

    data_in = q_in.get()
    
    while type(data_in) == pd.DataFrame:
        group = data_in
        
        mean_price = group.price.mean()
        mode_method = group.method.value_counts().index[0]
        row = pd.DataFrame({'color': [group.color.iloc[0]], 'mean_price': [mean_price], 'mode_method': [mode_method]})
        
        q_out.put(row)
            
        data_in = q_in.get()

    
def orchestrate_df_creation(df):
    
    data_queue, return_queue = Queue(1), Queue()
    
    nproc = multiprocessing.cpu_count()
    
    procs = [Process(target=process_group, args=(data_queue, return_queue,)) for x in range(nproc)]

    for p in procs:
        p.start()
    
    df_row_list = []
    running = 0    
    
    groups = df.groupby('color')
    num_groups = len(groups)
    groups_processed_index = 0
    
    while True:
        
        if groups_processed_index == num_groups and running == 0:
            break

        if running < nproc and groups_processed_index < num_groups:
            group_key = list(groups.groups.keys())[groups_processed_index]
            data_queue.put(groups.get_group(group_key))
            running += 1
            groups_processed_index +=1
        else:
            time.sleep(0.1)

        while not return_queue.empty():
            output_row = return_queue.get()
            running -= 1
            df_row_list.append(output_row)

    for x in procs:
        data_queue.put(False)
    for proc in procs:
        proc.join()

    return df_row_list


In [17]:
%%time
df_row_list = orchestrate_df_creation(df)

CPU times: user 13.2 ms, sys: 86.7 ms, total: 99.9 ms
Wall time: 204 ms


In [18]:
test2 = pd.concat(df_row_list).set_index(pd.Index(np.arange(0,len(df_row_list),1)))
test2

Unnamed: 0,color,mean_price,mode_method
0,blue,100.010925,cash
1,green,99.997698,cash
2,red,99.994406,cash
3,yellow,100.023259,credit
