In [2]:
# %load arima_baseline_para.py
#!/usr/bin/env python3

import time
import datetime
import warnings
import numpy as np
import pandas as pd

from pmdarima.arima import auto_arima
from multiprocessing import Pool
from multiprocessing import cpu_count


def data_process(process_id, city_data_list, date_dt, tmp_df_columns):
    """Fit a seasonal ARIMA per district and metric and forecast ``date_dt``.

    Each metric series is modelled on a ``log(1 + x)`` scale with
    ``auto_arima`` (weekly seasonality, ``m=7``) and the forecast is mapped
    back with ``exp(x) - 1``.

    Args:
        process_id: Label used only in the progress messages.
        city_data_list: Iterable of ``[district_code, sub_df]`` pairs.
            ``sub_df`` must provide the ``city_code``, ``dwell``,
            ``flow_in`` and ``flow_out`` columns for that district.
        date_dt: Sequence of forecast date labels. Its length is the
            forecast horizon (one prediction per label).
        tmp_df_columns: Six output column names, applied positionally to
            date, city code, district code and the three forecast metrics.

    Returns:
        A DataFrame with ``len(date_dt)`` rows per district, or an empty
        DataFrame when ``city_data_list`` is empty.
    """
    print("process {} start".format(process_id))
    predict_columns = ['dwell', 'flow_in', 'flow_out']
    # Forecast exactly as many steps as there are date labels so the
    # predictions always line up with the date column.
    horizon = len(date_dt)
    district_frames = []

    for city_data in city_data_list:
        district_code = city_data[0]
        sub_df = city_data[1]
        city_code = sub_df['city_code'].iloc[0]
        tmp_df = pd.DataFrame(data=date_dt, columns=['date_dt'])
        tmp_df['city_code'] = city_code
        tmp_df['district_code'] = district_code

        for column in predict_columns:
            print("process {}: city {}, column {}".format(process_id, district_code, column))
            # Fit on a plain array: the model only needs the ordered values,
            # and the source frame's row index carries no meaning here.
            ts_log = np.log1p(np.asarray(sub_df[column], dtype=float))
            arima_model = auto_arima(ts_log, start_p=1, max_p=9, start_q=1, max_q=9, max_d=5,
                                     start_P=1, max_P=9, start_Q=1, max_Q=9, max_D=5,
                                     m=7, random_state=2018,
                                     trace=False,
                                     seasonal=True,
                                     error_action='ignore',
                                     suppress_warnings=True,
                                     stepwise=True)

            # Coerce to an array and assign positionally; a Series result
            # carrying its own index would misalign with tmp_df's 0..n-1 rows.
            preds = np.asarray(arima_model.predict(n_periods=horizon), dtype=float)
            tmp_df[column] = np.expm1(preds)

        tmp_df.columns = tmp_df_columns
        district_frames.append(tmp_df)

    print("process {} finished".format(process_id))
    if not district_frames:
        # pd.concat rejects an empty list; keep the "empty frame" contract.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(district_frames, axis=0, ignore_index=True)


def multiple_prosess():
    """Forecast every district in parallel and write the result to CSV.

    Reads ``data/filter/flow_train_filter3.csv``, splits the districts into
    contiguous chunks (one per worker process), runs ``data_process`` on each
    chunk, merges the per-chunk forecasts, sorts them by ``date_dt`` and
    writes ``prediction_<timestamp>.csv`` without index or header. The
    forecast covers 15 consecutive days starting on 2018-03-02.

    Returns:
        None. The elapsed wall-clock time is printed in minutes.
    """
    # time.clock() was removed in Python 3.8; perf_counter() measures wall time.
    start_time = time.perf_counter()
    warnings.filterwarnings('ignore')

    flow_df = pd.read_csv('data/filter/flow_train_filter3.csv')
    flow_df = flow_df.sort_values(by=['city_code', 'district_code', 'date_dt'])

    init_date = datetime.date(2018, 3, 2)
    date_dt = [(init_date + datetime.timedelta(days=delta)).strftime('%Y%m%d')
               for delta in range(15)]

    # Leave a few cores free, but never ask for fewer than one worker
    # (Pool raises ValueError for a non-positive process count).
    process_num = max(1, cpu_count() - 3)
    tmp_df_columns = ['date_dt', 'city_code', 'district_code', 'dwell', 'flow_in', 'flow_out']

    # One pass over the frame; sort=False keeps districts in order of first
    # appearance, matching the order unique() would give.
    city_data_total = [[district_code, sub_df]
                       for district_code, sub_df in flow_df.groupby('district_code', sort=False)]
    # Evenly spaced chunk boundaries: chunk i is [city_index[i], city_index[i + 1]).
    city_index = np.linspace(0, len(city_data_total), process_num + 1, dtype=np.int32)

    # The context manager guarantees the pool is torn down even if
    # submitting or collecting work raises.
    with Pool(process_num) as pool:
        pending = [pool.apply_async(data_process,
                                    args=(worker_id, city_data_total[int(lo): int(hi)],
                                          date_dt, tmp_df_columns))
                   for worker_id, (lo, hi) in enumerate(zip(city_index[:-1], city_index[1:]), start=1)]
        pool.close()
        pool.join()
        # get() re-raises any exception that occurred in a worker.
        results = [task.get() for task in pending]

    preds_df = pd.concat(results, axis=0, ignore_index=True)
    if not preds_df.empty:
        # An all-empty result has no 'date_dt' column to sort by.
        preds_df = preds_df.sort_values(by=['date_dt'])

    finish_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    preds_df.to_csv('prediction_{}.csv'.format(finish_time), index=False, header=False)
    end_time = time.perf_counter()
    print("used time: {:.2f} minutes".format((end_time - start_time)/60.0))

# Run the parallel forecasting pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    multiple_prosess()


process 1 start
process 1: city 032c75c11f45ab3abf41506d616af280, column dwell
process 1: city 032c75c11f45ab3abf41506d616af280, column flow_in
process 1: city 032c75c11f45ab3abf41506d616af280, column flow_out
process 1: city 077ff638b4c221fc3abb88fa9bd499a7, column dwell
process 1: city 077ff638b4c221fc3abb88fa9bd499a7, column flow_in
process 1: city 077ff638b4c221fc3abb88fa9bd499a7, column flow_out
process 1: city 3a3630930e4f054033e0c64a728f759a, column dwell
process 1: city 3a3630930e4f054033e0c64a728f759a, column flow_in
process 1: city 3a3630930e4f054033e0c64a728f759a, column flow_out
process 1: city 4d3b1591070ad975f4c0b7232485980d, column dwell
process 1: city 4d3b1591070ad975f4c0b7232485980d, column flow_in
process 1: city 4d3b1591070ad975f4c0b7232485980d, column flow_out
process 1: city 4fe4bd4a69e1d7a5550d28ef71c8dcaa, column dwell
process 1: city 4fe4bd4a69e1d7a5550d28ef71c8dcaa, column flow_in
process 1: city 4fe4bd4a69e1d7a5550d28ef71c8dcaa, column flow_out
process 1: cit

process 1: city eb8ef95c3874a09518095f3110c01e2d, column flow_in
process 1: city eb8ef95c3874a09518095f3110c01e2d, column flow_out
process 1: city 05e37aecd15e7e3f7fb6dbb8e4713837, column dwell
process 1: city 05e37aecd15e7e3f7fb6dbb8e4713837, column flow_in
process 1: city 05e37aecd15e7e3f7fb6dbb8e4713837, column flow_out
process 1: city 0b28636d08bff292de12f663d522c55f, column dwell
process 1: city 0b28636d08bff292de12f663d522c55f, column flow_in
process 1: city 0b28636d08bff292de12f663d522c55f, column flow_out
process 1: city 12da6611282638727b4f969b4a2ea373, column dwell
process 1: city 12da6611282638727b4f969b4a2ea373, column flow_in
process 1: city 12da6611282638727b4f969b4a2ea373, column flow_out
process 1: city 1df63b98ae38be96c378a9923b4f8c2a, column dwell
process 1: city 1df63b98ae38be96c378a9923b4f8c2a, column flow_in
process 1: city 1df63b98ae38be96c378a9923b4f8c2a, column flow_out
process 1: city 2e24920cddb34084f14af14a1a430291, column dwell
process 1: city 2e24920cddb340

process 1: city 6ecd1a38b37826ffdb76fab0b744f042, column flow_out
process 1: city 6f8199333234985aceb0f83bc046a918, column dwell
process 1: city 6f8199333234985aceb0f83bc046a918, column flow_in
process 1: city 6f8199333234985aceb0f83bc046a918, column flow_out
process 1: city 7782cd408e0cdc423dc9f75721bb6223, column dwell
process 1: city 7782cd408e0cdc423dc9f75721bb6223, column flow_in
process 1: city 7782cd408e0cdc423dc9f75721bb6223, column flow_out
process 1: city 82271c9f4c9bf3b072ebe43b29d01caf, column dwell
process 1: city 82271c9f4c9bf3b072ebe43b29d01caf, column flow_in
process 1: city 82271c9f4c9bf3b072ebe43b29d01caf, column flow_out
process 1: city 83d680ca232f903aa3402e7618894ec5, column dwell
process 1: city 83d680ca232f903aa3402e7618894ec5, column flow_in
process 1: city 83d680ca232f903aa3402e7618894ec5, column flow_out
process 1: city 8aef48d33924fa6298b9820b9dde869e, column dwell
process 1: city 8aef48d33924fa6298b9820b9dde869e, column flow_in
process 1: city 8aef48d33924fa