In [61]:
# %load_ext autoreload
# # from ggplot import *
# %autoreload 3
from sqlalchemy import Table, Column, String, Integer, Float, Boolean, MetaData, insert, select, BIGINT, Date, DateTime, VARCHAR
from util import *

In [385]:
def process_batch(file):
    """Run the feature / filter / sell-amount pipeline for one pickled batch.

    Steps, in order:

    1. Load the batch DataFrame from ``file`` with ``pd.read_pickle``.
    2. For every ITEM_ID, hand the rows of its first STOCK_ID group to
       ``get_feature_engineered_bundle`` and keep the results that are not None.
    3. Pickle the resulting frame under
       ``data/pickle/ivt_item_feature_engineered/`` using the input's file name.
    4. Filter that frame with ``get_filtered_fg_df``, keep only the batch rows
       whose ITEM_ID appears in the filtered ``item_id`` column, and pass them
       to ``save_img``.
    5. Build sell-amount rows per ITEM_ID with ``get_sell_amount_by_item_id``
       and append them to the ``MWS_COLT_ITEM_SELL_AMT_DEV`` table.

    Parameters
    ----------
    file : str
        Path of the pickled batch; the file name is taken as the part after
        the last ``/``.

    Returns
    -------
    None

    Notes
    -----
    Relies on a module-level ``engine`` being bound before the call.
    """
    batch = pd.read_pickle(file)

    bundles = []
    for item_id, item_rows in batch.groupby('ITEM_ID'):
        # Only the first STOCK_ID group is used; take it lazily instead of
        # materialising every group into a list first.
        first_stock = next(iter(item_rows.groupby('STOCK_ID')), None)
        if first_stock is None:
            logging.warning('no STOCK_ID group for ITEM_ID %s in %s', item_id, file)
            continue
        bundles.append(get_feature_engineered_bundle(first_stock[1]))

    # Identity check: `!= None` goes through __ne__, which is element-wise
    # (and therefore ambiguous in a boolean context) for pandas/numpy objects.
    results = [bundle for bundle in bundles if bundle is not None]
    result_df = pd.DataFrame(results)

    # save feature engineered df
    result_df.to_pickle('data/pickle/ivt_item_feature_engineered/%s' % str(file.split('/')[-1]))

    # filter dataframe
    filtered_df = get_filtered_fg_df(result_df)
    cleaned_item_ids = filtered_df.item_id.values
    cleaned_df = batch[batch['ITEM_ID'].isin(cleaned_item_ids)]

    # save images
    save_img(cleaned_df)

    df_lst = []
    for item_id, group in cleaned_df.groupby('ITEM_ID'):
        try:
            df_lst.append(get_sell_amount_by_item_id(group))
        except Exception:
            # Still best-effort per item, but no longer silent: a bare
            # `except:` would also hide programming errors such as NameError
            # (and swallow KeyboardInterrupt), leaving nothing to insert
            # without any trace of why.
            logging.warning('skipping ITEM_ID %s in %s', item_id, file, exc_info=True)

    if len(df_lst) > 0:
        result = pd.concat(df_lst)
        # `flavor` was deprecated and later removed from DataFrame.to_sql; the
        # SQL dialect is taken from the SQLAlchemy engine.
        result.to_sql(con=engine, name='MWS_COLT_ITEM_SELL_AMT_DEV', if_exists='append')
        logging.warning('done with %s' % str(file))


In [53]:
def map_clean_up_target_df(stock_id, group_df):
    """Clean one stock's rows and return its sell/stock-amount columns.

    ``group_df`` is passed through ``clean_up_target_df``; from its result the
    columns ``sell_impute``, ``STOCK_AMOUNT`` and ``STOCK_AMOUNT_imputed`` are
    kept, ``sell_impute`` is renamed to ``SELL_AMOUNT`` and a constant
    ``STOCK_ID`` column is appended.

    Parameters
    ----------
    stock_id
        Value written to every row of the ``STOCK_ID`` column.
    group_df : pandas.DataFrame
        Rows belonging to ``stock_id``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``SELL_AMOUNT``, ``STOCK_AMOUNT``,
        ``STOCK_AMOUNT_imputed``, ``STOCK_ID``.
    """
    target_cols = ['sell_impute', 'STOCK_AMOUNT', 'STOCK_AMOUNT_imputed']
    cleaned = clean_up_target_df(group_df)[target_cols]

    # rename/assign return new frames, so nothing is written onto a
    # column-subset (which can raise SettingWithCopyWarning), and the rename is
    # by name rather than by position.
    return cleaned.rename(columns={'sell_impute': 'SELL_AMOUNT'}).assign(STOCK_ID=stock_id)

In [59]:
def get_sell_amount_by_item_id(df, add_sell_amount=False):
    """Build the per-stock sell-amount rows for a single item.

    ``df`` is split by ``STOCK_ID``; each group goes through
    ``map_clean_up_target_df`` and the pieces are concatenated. The columns
    ``ITEM_ID``, ``REG_ID``, ``UPT_DT``, ``COLLECT_DAY`` and ``UPT_ID`` are then
    stamped onto every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one item. Must contain ``STOCK_ID``, ``ITEM_ID``, ``REG_ID``
        and ``COLLECT_DAY`` and have at least one row (the scalar metadata is
        read from the first row).
    add_sell_amount : bool, optional
        Currently unused. An earlier, pivot-based implementation used it to
        return the sell amount summed across stocks instead of per stock.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-stock frames plus the metadata columns.
    """
    # Scalar metadata comes from the first row of the item's frame.
    collect_day = df.COLLECT_DAY.values[0]
    reg_id = df.REG_ID.values[0]
    item_id = df.ITEM_ID.values[0]

    # Group the *argument*: the previous code iterated over a global named
    # `group`, which is a NameError unless such a name happens to exist, and
    # otherwise silently processes whatever that global holds.
    per_stock = [
        map_clean_up_target_df(stock_id, stock_df)
        for stock_id, stock_df in df.groupby('STOCK_ID')
    ]
    result = pd.concat(per_stock)

    result['ITEM_ID'] = item_id
    result['REG_ID'] = reg_id
    result['UPT_DT'] = pd.to_datetime('now')
    # NOTE(review): every row receives the same COLLECT_DAY (the first row's).
    # The IntegrityError recorded at the end of this notebook is a duplicate on
    # what looks like an ITEM_ID / COLLECT_DAY / STOCK_ID key -- confirm whether
    # COLLECT_DAY should vary per row instead.
    result['COLLECT_DAY'] = collect_day
    result['UPT_ID'] = 'FILTER ALGO'

    return result


In [60]:
if __name__ == '__main__':
    # glob.glob returns paths in arbitrary order, so sort before slicing to
    # make the one-file sample the same file on every run.
    # NOTE(review): `[:1]` limits the run to a single batch while the engine is
    # created with production=True -- confirm both are intentional.
    files = sorted(glob.glob('data/pickle/ivt_item/*.pkl'))[:1]
    engine = get_engine(production=True)
    add_engine_pidguard(engine)
    # One joblib task per batch file, using all available cores.
    tmp_lst = Parallel(n_jobs=-1)(delayed(process_batch)(path) for path in files)

In [9]:
# result.COLLECT_DAY = pd.to_datetime(result.COLLECT_DAY)

In [48]:
# Take the first element of the Parallel results collected in the __main__ cell.
# NOTE(review): process_batch as defined in this notebook has no return
# statement, so tmp_lst[0] would be None on a fresh run; the frame displayed
# further down presumably came from an earlier version -- confirm.
tmp_df = tmp_lst[0]

In [50]:
# Overwrite COLLECT_DAY with each row's own REG_DT value (mutates tmp_df in place).
tmp_df['COLLECT_DAY'] = tmp_df['REG_DT']

In [51]:
# Display tmp_df.
tmp_df

Unnamed: 0,REG_DT,STOCK_ID,SELL_AMOUNT,ITEM_ID,REG_ID,UPT_DT,COLLECT_DAY,UPT_ID
0,2017-11-04,-5303229960599585150,0.0,5621257,SERVER,2018-02-13 11:43:38,2017-11-04,FILTER ALGO
1,2017-11-05,-5303229960599585150,-0.0,5621257,SERVER,2018-02-13 11:43:38,2017-11-05,FILTER ALGO
2,2017-11-06,-5303229960599585150,-0.0,5621257,SERVER,2018-02-13 11:43:38,2017-11-06,FILTER ALGO
3,2017-11-07,-5303229960599585150,-0.0,5621257,SERVER,2018-02-13 11:43:38,2017-11-07,FILTER ALGO
4,2017-11-08,-5303229960599585150,-0.0,5621257,SERVER,2018-02-13 11:43:38,2017-11-08,FILTER ALGO
...,...,...,...,...,...,...,...,...
315,2018-01-18,-5303229960599649138,-0.0,5621257,SERVER,2018-02-13 11:43:38,2018-01-18,FILTER ALGO
316,2018-01-19,-5303229960599649138,1.0,5621257,SERVER,2018-02-13 11:43:38,2018-01-19,FILTER ALGO
317,2018-01-20,-5303229960599649138,-0.0,5621257,SERVER,2018-02-13 11:43:38,2018-01-20,FILTER ALGO
318,2018-01-21,-5303229960599649138,-0.0,5621257,SERVER,2018-02-13 11:43:38,2018-01-21,FILTER ALGO


In [39]:
# First two rows of tmp_df as a list of {column: value} dicts, the shape used
# for a multi-row insert. orient='records' yields a real list (a dict
# `.values()` view is not a list on Python 3), keeps row order, avoids
# transposing the frame, and does not collapse rows that share an index label.
tmp_dict = tmp_df.iloc[:2, :].to_dict(orient='records')

In [40]:
# Display the sample records held in tmp_dict.
tmp_dict

[{'COLLECT_DAY': '20171104',
  'ITEM_ID': 5621257,
  'REG_DT': Timestamp('2017-11-04 00:00:00'),
  'REG_ID': 'SERVER',
  'SELL_AMOUNT': 0.0,
  'STOCK_ID': '-5303229960599585150',
  'UPT_DT': Timestamp('2018-02-13 11:19:55'),
  'UPT_ID': 'FILTER ALGO'},
 {'COLLECT_DAY': '20171104',
  'ITEM_ID': 5621257,
  'REG_DT': Timestamp('2017-11-05 00:00:00'),
  'REG_ID': 'SERVER',
  'SELL_AMOUNT': -0.0,
  'STOCK_ID': '-5303229960599585150',
  'UPT_DT': Timestamp('2018-02-13 11:19:55'),
  'UPT_ID': 'FILTER ALGO'}]

In [41]:
# SQLAlchemy description of the MWS_COLT_ITEM_SELL_AMT table, used to build the
# INSERT statement further down. Columns are declared as (name, type) pairs and
# expanded into Column objects in the order listed.
metadata = MetaData()

data = Table(
    'MWS_COLT_ITEM_SELL_AMT',
    metadata,
    *(
        Column(col_name, col_type)
        for col_name, col_type in [
            # Left out, as before: Column('ID', Integer(), unique=True, primary_key=True)
            ('ITEM_ID', Integer()),
            ('STOCK_ID', String(255)),
            ('COLLECT_DAY', Date()),
            ('SELL_AMOUNT', Integer()),
            ('REG_ID', String(18)),
            ('REG_DT', DateTime()),
            ('UPT_DT', DateTime()),
            ('UPT_ID', String()),
        ]
    )
)

In [42]:
# Two sample records. Their keys (name/count/amount/valid) do not match the
# columns of the table declared above, and no cell shown here reads this list.
values_list = [
    dict(name='Anna', count=1, amount=1000.00, valid=True),
    dict(name='Taylor', count=1, amount=750.00, valid=False),
]

In [43]:
# Rebind `engine` to the engine returned by get_engine(production=False).
# NOTE(review): the __main__ cell binds the same name with production=True, so
# which engine later cells use depends on execution order.
engine = get_engine(production=False)

In [45]:
# Display the sample records held in tmp_dict again.
tmp_dict

[{'COLLECT_DAY': '20171104',
  'ITEM_ID': 5621257,
  'REG_DT': Timestamp('2017-11-04 00:00:00'),
  'REG_ID': 'SERVER',
  'SELL_AMOUNT': 0.0,
  'STOCK_ID': '-5303229960599585150',
  'UPT_DT': Timestamp('2018-02-13 11:19:55'),
  'UPT_ID': 'FILTER ALGO'},
 {'COLLECT_DAY': '20171104',
  'ITEM_ID': 5621257,
  'REG_DT': Timestamp('2017-11-05 00:00:00'),
  'REG_ID': 'SERVER',
  'SELL_AMOUNT': -0.0,
  'STOCK_ID': '-5303229960599585150',
  'UPT_DT': Timestamp('2018-02-13 11:19:55'),
  'UPT_ID': 'FILTER ALGO'}]

In [44]:
# Insert the sample records into the table described by `data`.
# Runs inside an explicit transaction (committed on success, rolled back on
# error) instead of Engine.execute, which is connectionless execution and was
# removed in SQLAlchemy 2.0. list(...) guarantees the parameters are a real
# list of dicts even when tmp_dict is a dict view.
# NOTE(review): not idempotent -- re-running inserts the same rows again; the
# run recorded below failed with a duplicate-key IntegrityError.
stmt = insert(data)
with engine.begin() as conn:
    results = conn.execute(stmt, list(tmp_dict))

IntegrityError: (_mysql_exceptions.IntegrityError) (1062, "Duplicate entry '5621257-2017-11-04--5303229960599585150' for key 'UIX_MWS_COLT_ITEM_SELL_AMT_1'") [SQL: u'INSERT INTO `MWS_COLT_ITEM_SELL_AMT` (`ITEM_ID`, `STOCK_ID`, `COLLECT_DAY`, `SELL_AMOUNT`, `REG_ID`, `REG_DT`, `UPT_DT`, `UPT_ID`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'] [parameters: ((5621257, '-5303229960599585150', '20171104', 0.0, 'SERVER', Timestamp('2017-11-04 00:00:00'), Timestamp('2018-02-13 11:19:55'), 'FILTER ALGO'), (5621257, '-5303229960599585150', '20171104', -0.0, 'SERVER', Timestamp('2017-11-05 00:00:00'), Timestamp('2018-02-13 11:19:55'), 'FILTER ALGO'))] (Background on this error at: http://sqlalche.me/e/gkpj)