### Import

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

### Read Data

In [2]:
# Load the raw observations, parsing both date columns, and order the rows
# by ticker and then by observation date with a fresh 0..n-1 index.
data = (
    pd.read_csv("cars.csv", parse_dates=["date", "retrain_frequency"])
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

### Data Transformation

In [3]:
# Derive total_locations = total_observations / (percent_of_locations_observed / 100).
# A zero percentage produces +/-inf (or NaN for 0/0); those values are treated
# as missing and replaced by the last valid value seen for the same ticker.
cars = data.copy()
cars["total_locations"] = cars["total_observations"] / (cars["percent_of_locations_observed"] / 100)
cars["total_locations"] = cars["total_locations"].replace([np.inf, -np.inf], np.nan)
# GroupBy.ffill() replaces the deprecated groupby(...).fillna(method="ffill").
cars["total_locations"] = cars.groupby("ticker")["total_locations"].ffill()

# Forward fill cannot repair a ticker whose first rows are missing. The integer
# cast below would fail on those NaNs anyway, so fail early with a message that
# names the affected tickers.
unfilled = cars["total_locations"].isna()
if unfilled.any():
    bad_tickers = sorted(cars.loc[unfilled, "ticker"].astype(str).unique())
    raise ValueError(
        "total_locations is still missing after forward fill for ticker(s): "
        + ", ".join(bad_tickers)
    )

# NOTE(review): astype(int) truncates toward zero rather than rounding --
# confirm that is intended for values such as 49.999999.
cars["total_locations"] = cars["total_locations"].astype(int)

In [4]:
# Keep only observations dated at least 7 days before the report date
# (retrain_frequency), then move total_locations from the last column
# position to position 6.
reduced_cars = cars.loc[
    lambda frame: frame["date"] <= frame["retrain_frequency"] - dt.timedelta(days=7)
].reset_index(drop=True)
col = list(reduced_cars.columns)
# The final column is dropped by the [6:-1] slice and re-inserted by name.
new_col = [*col[:6], "total_locations", *col[6:-1]]
reduced_cars = reduced_cars.loc[:, new_col]

### Daily Features

In [5]:
# Helper lookup:
#   key   -> ticker
#   value -> list of the distinct report dates (retrain_frequency) for that ticker,
#            in order of first appearance
ticker_date = {
    tic: list(
        reduced_cars.loc[reduced_cars["ticker"] == tic, "retrain_frequency"].unique()
    )
    for tic in reduced_cars["ticker"].unique()
}

In [6]:
# Reshape to one wide row per (ticker, report date):
#   ticker, date, target, consensus, day1_<feature> ... day90_<feature>
# The features are the columns at positions 2..6 of reduced_cars, and rows are
# taken in reduced_cars order. Groups with fewer than DAILY_WINDOW_DAYS rows
# are right-padded with empty strings.
DAILY_WINDOW_DAYS = 90
feature_names = list(reduced_cars.columns[2:7])

# The header is identical for every group, so build it once up front. This also
# keeps `cols` defined when ticker_date is empty.
cols = ["ticker", "date", "target", "consensus"]
for i in range(DAILY_WINDOW_DAYS):
    cols = cols + ["day" + str(i + 1) + "_" + name for name in feature_names]

daily = []
for tic in ticker_date.keys():
    for date in ticker_date[tic]:
        df = reduced_cars.query("ticker == @tic and retrain_frequency == @date")
        vals = [tic, date, df.iloc[0]["target"], df.iloc[0]["consensus"]]
        n_rows = len(df)
        for i in range(DAILY_WINDOW_DAYS):
            # An explicit bounds check replaces the former bare `except:`, which
            # also hid unrelated errors (and KeyboardInterrupt) behind padding.
            if i < n_rows:
                vals = vals + list(df.iloc[i][feature_names].values)
            else:
                vals = vals + [""] * len(feature_names)
        daily.append(vals)

daily_df = pd.DataFrame(daily, columns=cols)

### Quarterly Features (Min, Max, Sum, Mean)

In [7]:
# Per (ticker, report date) summary statistics -- sum, min, max, mean -- for each
# feature column (positions 2..6 of reduced_cars). For the "observed" column only
# sum and mean are kept, as in the original feature set.
quarterly_df_1 = daily_df[["ticker", "date", "consensus", "target"]].copy()
for col in reduced_cars.columns[2:7]:
    stats = ["sum", "mean"] if col == "observed" else ["sum", "min", "max", "mean"]
    # Select the column BEFORE aggregating. Aggregating the whole frame and then
    # indexing [col] reduces every column, which is wasteful and raises under the
    # pandas 2.x default numeric_only=False when a non-numeric column is present.
    df = reduced_cars.groupby(["ticker", "retrain_frequency"])[col].agg(stats)
    df.columns = [stat + "_" + col for stat in stats]
    df = df.reset_index().rename(columns={"retrain_frequency": "date"})
    # The only shared columns are the keys; naming them makes the join explicit.
    quarterly_df_1 = quarterly_df_1.merge(df, on=["ticker", "date"])

### Quarterly Features (Kurtosis, Skew, Median, Standard Deviation, 25% Quantile, 75% Quantile)

In [8]:
# Per (ticker, report date) distribution statistics -- median, kurtosis, skew,
# standard deviation, 25% and 75% quantiles -- for each feature column
# (positions 2..6 of reduced_cars) except "observed".
quarterly_df_2 = daily_df[["ticker", "date", "consensus", "target"]].copy()
for col in reduced_cars.columns[2:7]:
    if col == "observed":
        continue
    # Aggregate the single selected column. Reducing the whole frame (and calling
    # DataFrame.kurt on every group) touches non-numeric columns, which is slow
    # and raises under the pandas 2.x default numeric_only=False.
    grouped = reduced_cars.groupby(["ticker", "retrain_frequency"])[col]
    df = pd.DataFrame(
        {
            "mid_" + col: grouped.median(),
            "kur_" + col: grouped.apply(pd.Series.kurt),
            "ske_" + col: grouped.skew(),
            "std_" + col: grouped.std(),
            "quant_25_" + col: grouped.quantile(q=0.25),
            "quant_75_" + col: grouped.quantile(q=0.75),
        }
    )
    df = df.reset_index().rename(columns={"retrain_frequency": "date"})
    # The only shared columns are the keys; naming them makes the join explicit.
    quarterly_df_2 = quarterly_df_2.merge(df, on=["ticker", "date"])

### Write to CSV

In [9]:
# Daily features: one file with the consensus column, one without it.
daily_df.to_csv("daily_feature_with_cons.csv", index=False)
daily_df.drop(columns=["consensus"]).to_csv("daily_feature_no_cons.csv", index=False)

In [10]:
# All quarterly features: join both feature sets on their shared columns,
# then write one file with the consensus column and one without it.
quarterly_df = pd.merge(quarterly_df_1, quarterly_df_2)
quarterly_df.to_csv("quarterly_feature_with_cons.csv", index=False)
quarterly_df.drop(columns=["consensus"]).to_csv("quarterly_feature_no_cons.csv", index=False)

In [11]:
# Quarterly feature set 1 (sum, min, max, mean), with and without consensus.
quarterly_df_1.to_csv("quarterly_feature_1_with_cons.csv", index=False)
quarterly_df_1.drop(columns=["consensus"]).to_csv("quarterly_feature_1_no_cons.csv", index=False)

In [12]:
# Quarterly feature set 2 (median, 25%/75% quantiles, kurtosis, skew,
# standard deviation), with and without consensus.
quarterly_df_2.to_csv("quarterly_feature_2_with_cons.csv", index=False)
quarterly_df_2.drop(columns=["consensus"]).to_csv("quarterly_feature_2_no_cons.csv", index=False)

### Result Evaluation

In [41]:
# Load the model result files produced by xyme.
result_path = "./results/"


def _read_result(file_name):
    """Read one result CSV from result_path.

    Parses the "date" column, keeps ticker/date/preds/truths, and returns the
    rows sorted by ticker then date with a fresh 0..n-1 index.
    """
    raw = pd.read_csv(result_path + file_name, parse_dates=["date"])
    kept = raw[["ticker", "date", "preds", "truths"]]
    return kept.sort_values(["ticker", "date"]).reset_index(drop=True)


q1_xgb_cons = _read_result("josh_cars_q_1_xgb_cons.csv")
q2_xgb_cons = _read_result("josh_cars_q_2_xgb_cons.csv")
q1_xgb_no_cons = _read_result("josh_cars_q_1_xgb_no_cons.csv")
q2_xgb_no_cons = _read_result("josh_cars_q_2_xgb_no_cons.csv")

In [42]:
# Attach the consensus value to every result row. This is a left join on the
# shared columns, so result rows without a matching consensus are kept.
consensus = (
    quarterly_df.loc[:, ["ticker", "date", "consensus"]]
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)
q1_xgb_cons_reslt = pd.merge(q1_xgb_cons, consensus, how="left")
q2_xgb_cons_reslt = pd.merge(q2_xgb_cons, consensus, how="left")
q1_xgb_no_cons_reslt = pd.merge(q1_xgb_no_cons, consensus, how="left")
q2_xgb_no_cons_reslt = pd.merge(q2_xgb_no_cons, consensus, how="left")

In [43]:
# directional accuracy
def dir_acc(x):
    """Return 1 if the prediction and the truth fall on the same side of consensus.

    x is a row-like mapping with "preds", "truths" and "consensus" entries.
    The signs of (preds - consensus) and (truths - consensus) are compared;
    two zero signs count as a match. Returns 0 otherwise.
    """
    pred_side = np.sign(x["preds"] - x["consensus"])
    true_side = np.sign(x["truths"] - x["consensus"])
    return 1 if pred_side == true_side else 0

def err_acc(x):
    """Return 1 if the prediction is strictly closer to the truth than consensus is.

    x is a row-like mapping with "preds", "truths" and "consensus" entries.
    Ties (equal absolute errors) return 0.
    """
    model_error = abs(x["preds"] - x["truths"])
    consensus_error = abs(x["consensus"] - x["truths"])
    return 1 if model_error < consensus_error else 0
    
# Score every result frame in place: add one 0/1 column per metric, computed row by row.
for df in (q1_xgb_cons_reslt, q2_xgb_cons_reslt, q1_xgb_no_cons_reslt, q2_xgb_no_cons_reslt):
    df["dir_accuracy"] = df.apply(dir_acc, axis=1)
    df["err_accuracy"] = df.apply(err_acc, axis=1)

In [44]:
# q_1_xgb_cons: (mean directional accuracy, mean error accuracy)
tuple(q1_xgb_cons_reslt[metric].mean() for metric in ("dir_accuracy", "err_accuracy"))

(0.5490196078431373, 0.4803921568627451)

In [45]:
# q_1_xgb_no_cons: (mean directional accuracy, mean error accuracy)
tuple(q1_xgb_no_cons_reslt[metric].mean() for metric in ("dir_accuracy", "err_accuracy"))

(0.5882352941176471, 0.5098039215686274)

In [46]:
# q_2_xgb_cons: (mean directional accuracy, mean error accuracy)
tuple(q2_xgb_cons_reslt[metric].mean() for metric in ("dir_accuracy", "err_accuracy"))

(0.5686274509803921, 0.47058823529411764)

In [47]:
# q_2_xgb_no_cons: (mean directional accuracy, mean error accuracy)
tuple(q2_xgb_no_cons_reslt[metric].mean() for metric in ("dir_accuracy", "err_accuracy"))

(0.5294117647058824, 0.39215686274509803)