This notebook generates the time series for 1) acquisition and 2) initial spend.

In [1]:
### Using Darts package to generate covariates (of course we can do manually)
from darts.utils.timeseries_generation import datetime_attribute_timeseries, holidays_timeseries, linear_timeseries
from darts import TimeSeries, concatenate
from darts.dataprocessing.transformers import Scaler
from darts.utils.missing_values import fill_missing_values
import darts
print('darts version:', darts.__version__)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import shutil

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

DLNAME='tft_google'
TARGET='acq_initialspend'

## Choosing workspace (which has GPU machine)
# The blocks below are independent `if`s evaluated in order, so when several
# flags are True the last matching block decides the paths.
DGX=True
Databricks=False
AutoDL=False
Colab=False
KKlaptop=False
GTX1080=False

## Set working directories
# Start from None so that a value left in the kernel by an earlier run is never
# reused silently when the selected workspace defines no paths.
CURRENT_PATH = None
READ_DIR = None

if GTX1080:
    ### spec: 4x GeForce GTX 1080 Ti
    CURRENT_PATH = '/home/zhaoqi/cbcv'
    READ_DIR = '/home/zhaoqi/cbcv/data/weekly_cohort_sample100_zeromasked'

if DGX:
    ### spec: 8x Tesla V100
    CURRENT_PATH = '/home/jhan405/cbcv/tft_google_ver4/jianlin-tft'
    READ_DIR = f'{CURRENT_PATH}/data/weekly_cohort_data_1000'

if Databricks:
    ### cluster (no paths configured for this workspace)
    pass

if AutoDL:
    CURRENT_PATH = '/root/autodl-tmp/Boston'
    READ_DIR = f'{CURRENT_PATH}/data/weekly_cohort_sample100_zeromasked'

# Fail here with an explicit message instead of a NameError further down.
if CURRENT_PATH is None or READ_DIR is None:
    raise RuntimeError(
        'No working directory is configured for the selected workspace; '
        'enable a workspace flag whose block sets CURRENT_PATH and READ_DIR.'
    )


### Set where to save
SAVE_DIR =f'{CURRENT_PATH}/data/preprocessed_data/{DLNAME}/{TARGET}' # where to save prediction results


## Create the output directory (no error if it already exists)
import os
os.makedirs(SAVE_DIR, exist_ok=True)


darts version: 0.20.0


In [2]:
# Look-back length in weeks: the data window opens this many weeks before TRAIN_START.
INPUT_CHUNK_LENGTH = 12

## fixed calendar boundaries (ISO date strings)
TRAIN_START = '2017-01-01'       # Q1 2017
TEST_START = '2019-04-01'        # Q2 2019
TEST_END = '2020-02-29'          # Before Covid
TEST_END_EXTEND = '2020-05-31'   # + additional one quarter (for computational issue)

## derived boundaries
# First date to load: INPUT_CHUNK_LENGTH weeks ahead of the training start.
TRAIN_START_INPUTCHUNK = (
    pd.Timestamp(TRAIN_START) - pd.Timedelta(weeks=INPUT_CHUNK_LENGTH)
).strftime('%Y-%m-%d')
# Last training date: the day before the test period begins.
TRAIN_END = (
    pd.Timestamp(TEST_START) - pd.Timedelta(days=1)
).strftime('%Y-%m-%d')

In [3]:
### collect the input files
import glob

# Every entry directly under READ_DIR, in lexicographic path order.
filelist = sorted(glob.glob(f'{READ_DIR}/*'))

# One integer position per file; files are addressed by this index K below.
K_LIST = range(len(filelist))
print(K_LIST)
print(filelist[:5])


range(0, 1075)
['/home/jhan405/cbcv/tft_google_ver4/data/weekly_cohort_data_1000/1000_jam_city_inc_jam_city_inc_cohort.csv', '/home/jhan405/cbcv/tft_google_ver4/data/weekly_cohort_data_1000/1001_heritage_dairy_heritage_dairy_cohort.csv', '/home/jhan405/cbcv/tft_google_ver4/data/weekly_cohort_data_1000/1002_unitedhealth_optumrx_cohort.csv', '/home/jhan405/cbcv/tft_google_ver4/data/weekly_cohort_data_1000/1003_h_r_block_h_r_diy_cohort.csv', '/home/jhan405/cbcv/tft_google_ver4/data/weekly_cohort_data_1000/1004_bmw_payments_bmw_payments_cohort.csv']


In [4]:
def read_file(K):
    """Load one company's cohort file and return its acquisition-week rows.

    Reads the K-th entry of ``filelist`` from ``READ_DIR``, keeps the rows
    with ``tenure == 0`` (one observation per ``acq_week``), derives the
    initial-purchase and overall ratio columns, and restricts ``acq_week`` to
    the window ``[TRAIN_START_INPUTCHUNK, TEST_END]`` (both ends inclusive).

    Parameters
    ----------
    K : int
        Position of the company's file in ``filelist``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by ``acq_week``, with ``acq_week`` converted
        to datetime. Row labels from the CSV are kept. The frame is empty when
        the company has no acquisition week inside the window.

    Notes
    -----
    The ratio columns are plain divisions, so a zero denominator produces
    inf/NaN; nothing is filled or masked here. Missing weeks are not filled
    in either.
    """
    ## choose company
    file_stem = os.path.splitext(os.path.basename(filelist[K]))[0]
    COMPANY_NAME = file_stem[:-len('_cohort')]
    print('K =', K, ', COMPANY_NAME:', COMPANY_NAME)

    ## read company
    raw_df = pd.read_csv(f'{READ_DIR}/{file_stem}.csv')

    # pick one obs per each acq_week; copy so the columns added below are
    # written to an independent frame rather than to a view of the CSV data
    raw_df = raw_df[raw_df['tenure'] == 0].copy()
    print(raw_df.shape[0], 'rows')

    ### create initial spend
    # presumably rpt_* holds the repeat-purchase part, so total minus rpt_* is
    # the initial purchase -- TODO confirm against the data dictionary
    raw_df['initial_order'] = raw_df['orders'] - raw_df['rpt_orders']
    raw_df['initial_order_per_cust'] = raw_df['initial_order'] / raw_df['N_week_cohort']
    raw_df['initial_spend'] = raw_df['spend'] - raw_df['rpt_spend']
    raw_df['initial_spend_per_cust'] = raw_df['initial_spend'] / raw_df['N_week_cohort']
    raw_df['initial_aov'] = raw_df['initial_spend'] / raw_df['initial_order']

    # ignore initial vs repeat
    raw_df['aov'] = raw_df['spend'] / raw_df['orders']
    raw_df['orders_per_cust'] = raw_df['orders'] / raw_df['N_week_cohort']

    # Convert before filtering so the window test compares real dates, and keep
    # the result of sort_values (it returns a new frame; it does not sort in place).
    raw_df['acq_week'] = pd.to_datetime(raw_df['acq_week'])
    raw_df = raw_df.sort_values('acq_week')

    ## limit data time range (inclusive on both ends)
    in_window = raw_df['acq_week'].between(
        pd.Timestamp(TRAIN_START_INPUTCHUNK), pd.Timestamp(TEST_END)
    )
    df = raw_df[in_window].copy()

    return df

In [5]:
def generate_covariates(df):
    """Combine one company's observed series with calendar and trend covariates.

    Parameters
    ----------
    df : pandas.DataFrame
        One company's rows, as produced by ``read_file``: an ``acq_week`` time
        column, the nine value columns listed below, and the descriptive
        columns ``merchant_index``, ``merchant``, ``category`` and
        ``subcategory``.

    Returns
    -------
    pandas.DataFrame
        One row per time step with ``acq_week`` as a regular column, the value
        columns, the covariates ``year`` (rescaled by a darts ``Scaler``),
        ``holidays`` (US), ``weekofyear``, ``linear`` and the squared trend
        (shown as ``linear_1`` in this notebook's saved output), plus the four
        descriptive columns copied from the first row of ``df``.

    Notes
    -----
    NOTE(review): the covariates always run up to ``TEST_END`` while the
    observed series ends at the last ``acq_week`` in ``df``; stacking them
    presumably requires both to cover the same dates -- confirm for companies
    whose data stops before ``TEST_END``.
    """
    value_cols = ['N_week_cohort', # prediction target
                  'initial_order', # = N_week_cohort
                  'initial_order_per_cust', # = 1 always
                  'initial_spend', # initial_aov * initial order
                  'initial_aov', # prediction target
                  'orders',
                  'orders_per_cust',
                  'spend',
                  'aov'] # prediction target

    actual_one = TimeSeries.from_dataframe(df,
                                           time_col='acq_week',
                                           value_cols=value_cols)

    ### generate covariates (for continuous covariates, applying scaler)
    # Shared time axis: from the first observed week to TEST_END, at the
    # frequency of the observed series.
    covariate_index = pd.date_range(start=actual_one.start_time(),
                                    end=pd.Timestamp(TEST_END),
                                    freq=actual_one.freq_str)

    ## year (numerical), rescaled with the Scaler defaults
    year_series = datetime_attribute_timeseries(
        covariate_index,
        attribute="year",
        one_hot=False, # consider it as numerical
    )
    year_series = Scaler().fit_transform(year_series) # n_jobs=-1 for large data

    ## week of year, kept as one numeric column (one_hot=False)
    weekofyear_series = datetime_attribute_timeseries(
        covariate_index,
        attribute="weekofyear",
        one_hot=False,
    )

    ## US holiday indicator
    holiday_series = holidays_timeseries(
        covariate_index,
        country_code='US',
        column_name='holidays',
    )

    ## linear trend over the same span, and its square
    linear_trend = linear_timeseries(
        start=actual_one.start_time(),
        end=pd.Timestamp(TEST_END),
        freq=actual_one.freq_str,
        column_name='linear')

    quadratic_trend = linear_trend * linear_trend

    covariates = (year_series
                  .stack(holiday_series)
                  .stack(weekofyear_series)
                  .stack(linear_trend)
                  .stack(quadratic_trend))
    data_df = actual_one.stack(covariates).pd_dataframe()

    # Descriptive columns are constant per company: broadcast the first row's value.
    for static_col in ('merchant_index', 'merchant', 'category', 'subcategory'):
        data_df[static_col] = df[static_col].iloc[0]

    # Move the time index back into an ordinary column.
    return data_df.reset_index()


In [6]:
### build one table per company
data_df_list = []
for K in K_LIST:
    df = read_file(K)

    # Skip companies without any acquisition week in the test period.
    if not (df['acq_week'] >= TEST_START).any():
        continue
    data_df_list.append(generate_covariates(df))

### combine everything into one frame with a fresh 0..n-1 index
df_combined = pd.concat(data_df_list, ignore_index=True)

# ### for company_0_100.csv, please remove merchant_index=1028, merchant=lyft_bikes_scooters
# indexdrop = df_combined[df_combined['merchant_index'] == 1028].index
# df_combined.drop(indexdrop , inplace=True)

### save the combined table
df_combined.to_csv(f'{SAVE_DIR}/company_0_1000.csv')


K = 0 , COMPANY_NAME: 1000_jam_city_inc_jam_city_inc
249 rows
K = 1 , COMPANY_NAME: 1001_heritage_dairy_heritage_dairy
312 rows
K = 2 , COMPANY_NAME: 1002_unitedhealth_optumrx
312 rows
K = 3 , COMPANY_NAME: 1003_h_r_block_h_r_diy
312 rows
K = 4 , COMPANY_NAME: 1004_bmw_payments_bmw_payments
312 rows
K = 5 , COMPANY_NAME: 1005_west_marine_west_marine
312 rows
K = 6 , COMPANY_NAME: 1006_unilever_ben_and_jerries
312 rows
K = 7 , COMPANY_NAME: 1007_albertsons_adjusted_haggen
299 rows
K = 8 , COMPANY_NAME: 1008_albertsons_adjusted_star_market
312 rows
K = 9 , COMPANY_NAME: 1009_marriott_gaylord_hotels
312 rows
K = 10 , COMPANY_NAME: 100_ahold_delhaize_adjusted_giant_landover
312 rows
K = 11 , COMPANY_NAME: 1010_aldo_aldo
312 rows
K = 12 , COMPANY_NAME: 1011_carnival_carnival_cruises
312 rows
K = 13 , COMPANY_NAME: 1012_foodtown_foodtown
312 rows
K = 14 , COMPANY_NAME: 1013_ross_dd_discount
312 rows
K = 15 , COMPANY_NAME: 1014_jane_com_jane_com
312 rows
K = 16 , COMPANY_NAME: 1015_lord_taylo

In [7]:
### sanity check: number of rows per merchant
total_df = pd.concat(data_df_list, ignore_index=True)
print(total_df['merchant_index'].groupby(total_df['merchant_index']).count())
total_df

merchant_index
100     177
101     177
102     177
103     177
104     177
       ... 
1086    177
1087    177
1088    177
1089    177
1090    177
Name: merchant_index, Length: 100, dtype: int64


component,acq_week,N_week_cohort,initial_order,initial_order_per_cust,initial_spend,initial_aov,orders,orders_per_cust,spend,aov,year,holidays,weekofyear,linear,linear_1,merchant_index,merchant,category,subcategory
0,2017-03-12,534.0,534.0,1.0,2788.401,5.221725,931.0,1.743446,4827.361,5.185135,0.0,0.0,10.0,0.000000,0.000000,1000,jam_city_inc,Home Entertainment,Gaming
1,2017-03-19,419.0,419.0,1.0,1742.820,4.159475,635.0,1.515513,2590.120,4.078929,0.0,0.0,11.0,0.006494,0.000042,1000,jam_city_inc,Home Entertainment,Gaming
2,2017-03-26,203.0,203.0,1.0,1044.471,5.145177,320.0,1.576355,1505.301,4.704066,0.0,0.0,12.0,0.012987,0.000169,1000,jam_city_inc,Home Entertainment,Gaming
3,2017-04-02,182.0,182.0,1.0,685.540,3.766703,272.0,1.494505,1125.640,4.138382,0.0,0.0,13.0,0.019481,0.000379,1000,jam_city_inc,Home Entertainment,Gaming
4,2017-04-09,118.0,118.0,1.0,457.820,3.879831,156.0,1.322034,652.440,4.182308,0.0,0.0,14.0,0.025974,0.000675,1000,jam_city_inc,Home Entertainment,Gaming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17437,2020-01-26,405.0,405.0,1.0,29391.245,72.570975,415.0,1.024691,29971.675,72.220904,1.0,0.0,4.0,0.977273,0.955062,1090,piercing_pagoda,Apparel & Accessories,Jewelry & Watches
17438,2020-02-02,451.0,451.0,1.0,32238.135,71.481452,461.0,1.022173,33040.795,71.672007,1.0,0.0,5.0,0.982955,0.966200,1090,piercing_pagoda,Apparel & Accessories,Jewelry & Watches
17439,2020-02-09,635.0,635.0,1.0,51340.315,80.850890,654.0,1.029921,52836.305,80.789457,1.0,0.0,6.0,0.988636,0.977402,1090,piercing_pagoda,Apparel & Accessories,Jewelry & Watches
17440,2020-02-16,590.0,590.0,1.0,39305.815,66.620025,602.0,1.020339,39840.125,66.179610,1.0,0.0,7.0,0.994318,0.988669,1090,piercing_pagoda,Apparel & Accessories,Jewelry & Watches


In [9]:
### why 177 rows -- look at the rows of a single merchant
total_df.loc[total_df['merchant_index'] == 99]

component,acq_week,N_week_cohort,initial_order,initial_order_per_cust,initial_spend,initial_aov,orders,orders_per_cust,spend,aov,year,holidays,weekofyear,linear,linear_1,merchant_index,merchant,category,subcategory
185116,2016-10-09,1769.0,1769.0,1.0,82120.0025,46.421709,2006.0,1.133974,89787.2925,44.759368,0.0,0.0,40.0,0.000000,0.000000,99,giant_carlisle,Grocers,Supermarkets
185117,2016-10-16,1718.0,1718.0,1.0,69498.2325,40.452987,1998.0,1.162980,80214.1500,40.147222,0.0,0.0,41.0,0.005682,0.000032,99,giant_carlisle,Grocers,Supermarkets
185118,2016-10-23,1616.0,1616.0,1.0,66779.4225,41.323900,1872.0,1.158416,76603.8000,40.920833,0.0,0.0,42.0,0.011364,0.000129,99,giant_carlisle,Grocers,Supermarkets
185119,2016-10-30,1546.0,1546.0,1.0,66481.5400,43.002290,1748.0,1.130660,72643.5000,41.558066,0.0,0.0,43.0,0.017045,0.000291,99,giant_carlisle,Grocers,Supermarkets
185120,2016-11-06,1642.0,1642.0,1.0,64692.1375,39.398379,1859.0,1.132156,71081.6275,38.236486,0.0,0.0,44.0,0.022727,0.000517,99,giant_carlisle,Grocers,Supermarkets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185288,2020-01-26,416.0,416.0,1.0,25152.6050,60.462993,455.0,1.093750,26612.8150,58.489703,1.0,0.0,4.0,0.977273,0.955062,99,giant_carlisle,Grocers,Supermarkets
185289,2020-02-02,412.0,412.0,1.0,16633.9875,40.373756,460.0,1.116505,24246.5000,52.709783,1.0,0.0,5.0,0.982955,0.966200,99,giant_carlisle,Grocers,Supermarkets
185290,2020-02-09,431.0,431.0,1.0,15475.4175,35.905841,478.0,1.109049,17257.9075,36.104409,1.0,0.0,6.0,0.988636,0.977402,99,giant_carlisle,Grocers,Supermarkets
185291,2020-02-16,472.0,472.0,1.0,21682.0825,45.936615,530.0,1.122881,23671.3225,44.662873,1.0,0.0,7.0,0.994318,0.988669,99,giant_carlisle,Grocers,Supermarkets


In [7]:
## sanity check
# this company is dropped because it starts after the end of period
K = 277 # COMPANY_NAME: 267_irs_payments_adjusted_stimulus_payouts

## choose company
COMPANY_NAME = os.path.splitext(os.path.basename(filelist[K]))[0][:-len('_cohort')]
FILEPATH = os.path.splitext(os.path.basename(filelist[K]))[0] + '.csv'
print('K =', K, ', COMPANY_NAME:', COMPANY_NAME)

## read company
# Column subset kept for reference; the read below loads every column.
COL_NAMES_TO_READ = [
    'acq_week',                 # group identifier
    'week', 'tenure',           # time identifier
    # 'spend',                  # target
    'N_week_cohort', 'acq_quarter', 'merchant_index', 'merchant',
]

raw_df = pd.read_csv(f'{READ_DIR}/{FILEPATH}')

# pick one obs per each acq_week
raw_df = raw_df.loc[raw_df['tenure'] == 0]
print(len(raw_df), 'rows')
raw_df.sort_values(by='acq_week')

K = 277 , COMPANY_NAME: 267_irs_payments_adjusted_stimulus_payouts
88 rows


Unnamed: 0,acq_week,week,tenure,acq_quarter,merchant_index,merchant,parent_merchant,category,subcategory,N_week_cohort,active_users,orders,rpt_orders,spend,rpt_spend,initial_order,initial_spend,initial_aov
0,2020-04-05,2020-04-05,0,2020-04-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,190,190,191,1,4.221000e+05,2200.0,190,4.199000e+05,2210.000000
88,2020-04-12,2020-04-12,0,2020-04-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,1258432,1258432,1296899,38467,2.813221e+09,73419800.0,1258432,2.739801e+09,2177.154507
175,2020-04-19,2020-04-19,0,2020-04-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,18279,18279,18349,70,3.378490e+07,121800.0,18279,3.366310e+07,1841.627004
261,2020-04-26,2020-04-26,0,2020-04-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,95657,95657,96399,742,1.828341e+08,1330700.0,95657,1.815034e+08,1897.439811
346,2020-05-03,2020-05-03,0,2020-04-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,49210,49210,49306,96,1.127640e+08,174000.0,49210,1.125900e+08,2287.949604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3901,2021-11-07,2021-11-07,0,2021-10-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,449,449,449,0,1.315261e+06,0.0,449,1.315261e+06,2929.312405
3906,2021-11-14,2021-11-14,0,2021-10-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,191,191,192,1,5.372639e+05,1400.0,191,5.358639e+05,2805.570262
3910,2021-11-21,2021-11-21,0,2021-10-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,141,141,141,0,4.484992e+05,0.0,141,4.484992e+05,3180.845390
3913,2021-11-28,2021-11-28,0,2021-10-01,267,stimulus_payouts,irs_payments_adjusted,Finance,Money Transfer,254,254,254,0,7.050901e+05,0.0,254,7.050901e+05,2775.945354


In [11]:
## sanity check
# this company is dropped because it has no data in prediction period
K = 369 # COMPANY_NAME: 353_kroger_ex_fuel_adjusted_turkey_hill

## choose company
COMPANY_NAME = os.path.splitext(os.path.basename(filelist[K]))[0][:-len('_cohort')]
FILEPATH = os.path.splitext(os.path.basename(filelist[K]))[0] + '.csv'
print('K =', K, ', COMPANY_NAME:', COMPANY_NAME)

## read company
# Column subset kept for reference; the read below loads every column.
COL_NAMES_TO_READ = [
    'acq_week',                 # group identifier
    'week', 'tenure',           # time identifier
    # 'spend',                  # target
    'N_week_cohort', 'acq_quarter', 'merchant_index', 'merchant',
]

raw_df = pd.read_csv(f'{READ_DIR}/{FILEPATH}')

# pick one obs per each acq_week
raw_df = raw_df.loc[raw_df['tenure'] == 0]
print(len(raw_df), 'rows')
raw_df.sort_values(by='acq_week')

K = 369 , COMPANY_NAME: 353_kroger_ex_fuel_adjusted_turkey_hill
121 rows


Unnamed: 0,acq_week,week,tenure,acq_quarter,merchant_index,merchant,parent_merchant,category,subcategory,N_week_cohort,active_users,orders,rpt_orders,spend,rpt_spend
0,2015-12-27,2015-12-27,0,2015-10-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,5688,5688,7237,1549,148601.19,26159.36
121,2016-01-03,2016-01-03,0,2016-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,10505,10505,16045,5540,346715.25,104244.56
241,2016-01-10,2016-01-10,0,2016-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,5872,5872,7472,1600,150998.60,28063.58
360,2016-01-17,2016-01-17,0,2016-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,3921,3921,4787,866,86976.57,14175.15
478,2016-01-24,2016-01-24,0,2016-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,2686,2686,3154,468,57607.15,8856.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7366,2018-03-18,2018-03-18,0,2018-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,461,461,508,47,11984.66,785.77
7371,2018-03-25,2018-03-25,0,2018-01-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,524,524,586,62,15825.21,1227.40
7375,2018-04-01,2018-04-01,0,2018-04-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,510,510,585,75,14377.47,1152.73
7378,2018-04-08,2018-04-08,0,2018-04-01,353,turkey_hill,kroger_ex_fuel_adjusted,Grocers,Specialty Grocers,408,408,452,44,12720.26,655.34


In [4]:
# Display the input directory currently held in READ_DIR.
READ_DIR

'/home/jhan405/cbcv/tft_google_ver4/jianlin-tft/data/weekly_cohort_data_1000'

In [4]:
import pandas as pd
raw_df = pd.read_csv('/home/jhan405/cbcv/tft_google_ver4/jianlin-tft/data/preprocessed_data/tft_google/company_0_100_acq_initaov.csv') 
# raw_df = raw_df.drop(columns = ['Unnamed: 0'])
raw_df = raw_df.drop(columns = ['V1'])
raw_df = raw_df.sort_values(['merchant_index', 'acq_week'])

# Propagate non-null values forward *within each merchant*. A plain forward
# fill over the sorted frame would carry the last value of one merchant into
# the leading gaps of the next merchant. Gaps at the start of a merchant's
# series therefore stay NaN.
raw_df[['aov', 'initial_aov']] = (
    raw_df.groupby('merchant_index')[['aov', 'initial_aov']].ffill()
)
raw_df
# raw_df.to_csv('/home/jhan405/cbcv/tft_google_ver4/jianlin-tft/data/preprocessed_data/tft_google/company_0_100_acq_initaov.csv')


In [5]:
import pandas as pd
raw_df = pd.read_csv('/home/jhan405/cbcv/tft_google_ver4/jianlin-tft/data/preprocessed_data/tft_google/company_0_1000_acq_initaov.csv') 
raw_df = raw_df.drop(columns = ['Unnamed: 0'])
# raw_df = raw_df.drop(columns = ['V1'])
raw_df = raw_df.sort_values(['merchant_index', 'acq_week'])

# Propagate non-null values forward *within each merchant*. A plain forward
# fill over the sorted frame would carry the last value of one merchant into
# the leading gaps of the next merchant. Gaps at the start of a merchant's
# series therefore stay NaN.
raw_df[['aov', 'initial_aov']] = (
    raw_df.groupby('merchant_index')[['aov', 'initial_aov']].ffill()
)
raw_df
# raw_df.to_csv('/home/jhan405/cbcv/tft_google_ver4/jianlin-tft/data/preprocessed_data/tft_google/company_0_1000_acq_initaov.csv')
