# Backup of Code

In [None]:
# Use Random Forest and other models to forecast revenues more accurately
# Main values are: levels, diffs, diff of diffs

# Forecast based on moving average? Apply seasonal patterns later on!



# Work on a trimmed copy so the raw fundamentals frame is never modified
df_model = df_fundamentals[['Symbol', 'Y_Q', 'Revenue']].copy()

# The rolling mean below (and the lag features built later) assume that the
# rows of each symbol are in chronological order, so make that explicit
df_model = df_model.sort_values(['Symbol', 'Y_Q'])

# Four-quarter moving average of revenue, computed separately per symbol.
# ``transform`` returns a Series carrying the original row index, so every
# average lands on the row it was computed for. (Resetting the index of the
# grouped result and assigning it back only lines up when the frame happens to
# be sorted by Symbol with a 0..n-1 index.)
df_model['Rev_MA'] = df_model.groupby('Symbol')['Revenue'].transform(
    lambda revenue: revenue.rolling(4, min_periods=4).mean()
)

# The first three quarters of each symbol have no complete 4-quarter window
df_model.dropna(subset=['Rev_MA'], inplace=True)

# Calculate features
def calc_features(df_model, calc_levels = True):
    """Add lag, difference and relative-difference columns to ``df_model``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df_model : pandas.DataFrame
        Must contain ``Rev_MA`` and ``Symbol`` when ``calc_levels`` is true;
        otherwise it must already contain the lag columns ``L1`` .. ``L4``.
    calc_levels : bool, default True
        When true, ``L1`` .. ``L4`` are (re)built as ``Rev_MA`` shifted by
        1 .. 4 rows within each ``Symbol`` group. When false, the existing
        ``L1`` .. ``L4`` columns are used as they are.

    Returns
    -------
    pandas.DataFrame
        ``df_model`` itself, with these columns added or overwritten:
        ``L<i>_L<j>`` = ``L<i> - L<j>`` and ``Rel_L<i>_L<j>`` =
        ``L<i>_L<j> / L<j>`` for every pair ``i < j`` in 1 .. 4.
    """
    lags = (1, 2, 3, 4)

    if calc_levels:
        ## Levels: Rev_MA of the previous 1..4 rows of the same symbol
        rev_ma_by_symbol = df_model.groupby(['Symbol'])['Rev_MA']
        for lag in lags:
            df_model[f'L{lag}'] = rev_ma_by_symbol.shift(lag)

    # Every (nearer, farther) lag pair: (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)
    lag_pairs = [(near, far) for near in lags for far in lags if near < far]

    ## Differences: all absolute differences are created first, then all
    ## relative ones, so the resulting column order stays fixed
    for near, far in lag_pairs:
        df_model[f'L{near}_L{far}'] = df_model[f'L{near}'] - df_model[f'L{far}']

    # Relative differences are expressed as a fraction of the farther lag
    for near, far in lag_pairs:
        df_model[f'Rel_L{near}_L{far}'] = df_model[f'L{near}_L{far}'] / df_model[f'L{far}']

    return df_model

# Build the lag/difference features, discard every row that still contains a
# missing value (observations without enough lagged data), and renumber the
# remaining rows from zero
df_model = (
    calc_features(df_model, calc_levels = True)
    .dropna()
    .reset_index(drop=True)
)




In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error

In [None]:
# Chronological split: quarters before 2018-01-01 are used for fitting,
# quarters from 2018-01-01 onwards for evaluation. Both parts are indexed by
# (Symbol, Y_Q) so that only numeric columns remain as data.
train, test = (
    df_model[mask].set_index(['Symbol', 'Y_Q'])
    for mask in (df_model.Y_Q < '2018-01-01', df_model.Y_Q >= '2018-01-01')
)

# Target is the moving average; every other remaining column is a feature.
# NOTE(review): 'Revenue' of the same quarter is still among the features even
# though Rev_MA is a rolling mean that includes it - confirm this is intended.
X_train, Y_train = train.drop(columns=['Rev_MA']), train['Rev_MA']
X_test, Y_test = test.drop(columns=['Rev_MA']), test['Rev_MA']

In [None]:
# Baseline model

# Create model class for baseline model
class model_baseline:
    """Naive benchmark whose prediction is simply the ``L1`` column.

    It exposes the same ``predict(X)`` call as the fitted regressors, so it
    can be used wherever a model object is expected.
    """

    def __init__(self):
        # Label identifying this model
        self.name = "baseline"

    def predict(self, X_):
        """Return the ``L1`` entry of ``X_`` unchanged."""
        lag_one = X_['L1']
        return lag_one

model_base = model_baseline()

# Benchmark error: mean absolute percentage error of the baseline prediction
# (the L1 column of the test set) against the actual test targets
score_base = mean_absolute_percentage_error(Y_test, model_base.predict(test))
score_base

In [None]:
# First, simple model: a 10-tree forest split on absolute error, with a fixed
# seed and all available cores
model_rf = RandomForestRegressor(
    n_estimators=10,
    criterion='absolute_error',
    n_jobs=-1,
    random_state=0,
).fit(X_train, Y_train)

# Predictions on the test set and on the training set
p, p_train = model_rf.predict(X_test), model_rf.predict(X_train)

# Test-set mean absolute percentage error of this first attempt
score_rf_first_guess = mean_absolute_percentage_error(Y_test, p)
score_rf_first_guess

In [None]:
# One flat frame per split: the (Symbol, Y_Q) index becomes ordinary columns
# next to the actual Rev_MA, the model prediction is added as 'pred', and
# 'Set' records which split each row came from
plot_test = Y_test.reset_index().assign(pred=p, Set='test')
plot_train = Y_train.reset_index().assign(pred=p_train, Set='train')

In [None]:
# Stack train and test rows into one timeline per symbol.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
plot = (
    pd.concat([plot_train, plot_test])
    .sort_values(['Symbol', 'Y_Q'])
    .reset_index(drop=True)
)

sym='XOM'

# Actual moving average and model prediction for the chosen symbol
sns.lineplot(data=plot[plot.Symbol==sym], x='Y_Q', y='Rev_MA')
sns.lineplot(data=plot[plot.Symbol==sym], x='Y_Q', y='pred')
# plot[plot.Symbol=='AAPL']

In [None]:
# Translate Y_Q to Date_IDs: one row per calendar quarter from 2000Q1 through
# 2099Q4, keyed by the first day of the quarter and numbered consecutively
# from 0. The quarters are generated as periods directly, which avoids the
# 'Q' offset alias in date_range (deprecated in pandas 2.2).
y_q_id = pd.period_range(start="2000Q1", end="2099Q4", freq='Q').to_timestamp()
df_date_id = pd.DataFrame({'Y_Q': y_q_id})
df_date_id['Y_Q_ID'] = df_date_id.index
df_date_id.head()

In [None]:
# Predict multiple periods !!!!

def predict_mult_periods(model, y_q_start, y_q_end, n_periods):
    """Roll ``model`` forward recursively from every base quarter in a range.

    Uses the module-level ``df_model`` (features), ``df_date_id`` (quarter
    start date -> integer ID) and ``calc_features``.

    For each base quarter between ``y_q_start`` and ``y_q_end`` (inclusive),
    the rows of that quarter are taken as the starting point. Each following
    quarter is then forecast from the previous one: the lag columns are
    shifted by one step (``L1`` takes the previous ``Rev_MA``), the difference
    features are recomputed, and the model's prediction becomes the new
    ``Rev_MA``. All other columns are carried forward unchanged from the base
    row.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method accepting the feature columns
        (all columns except ``Symbol``, ``Y_Q_ID``, ``Y_Q_ID_Base``,
        ``Rev_MA``).
    y_q_start, y_q_end : str or timestamp
        First and last base quarter, given as quarter start dates that exist
        in ``df_date_id`` (e.g. ``"2020-01-01"``).
    n_periods : int
        Forecast horizon counted including the base quarter, i.e.
        ``n_periods - 1`` forecast rows are produced per base row.

    Returns
    -------
    pandas.DataFrame
        The observed rows (``Y_Q`` replaced by ``Y_Q_ID``, plus
        ``Y_Q_ID_Base == Y_Q_ID``) followed by the forecast rows, which keep
        the ``Y_Q_ID_Base`` of the quarter they were started from.

    Raises
    ------
    ValueError
        If ``y_q_start`` or ``y_q_end`` is not present in ``df_date_id``.
    """

    # Example arguments:
    # model, y_q_start, y_q_end, n_periods = model_rf, "2020-01-01", "2020-10-01", 8

    def _period_id(y_q):
        """Translate a quarter start date into its integer Y_Q_ID."""
        match = df_date_id.loc[df_date_id.Y_Q == pd.Timestamp(y_q), 'Y_Q_ID']
        if match.empty:
            raise ValueError(f"{y_q!r} is not a quarter start date in df_date_id")
        # .iloc[0] instead of int(Series), which is deprecated in pandas 2.1
        return int(match.iloc[0])

    # Columns that identify a row or hold the target; everything else is a feature
    non_feature_cols = ['Symbol', 'Y_Q_ID', 'Y_Q_ID_Base', 'Rev_MA']

    df_pred = df_model.copy()

    # Map Date IDs: replace the quarter date by its consecutive integer ID
    df_pred = df_pred.join(df_date_id.set_index('Y_Q'), on='Y_Q')
    df_pred = df_pred.drop(['Y_Q'], axis=1)

    # Observed rows are their own base quarter
    df_pred['Y_Q_ID_Base'] = df_pred['Y_Q_ID']

    # Determine the range of base quarters
    y_q_id_start = _period_id(y_q_start)
    y_q_id_end = _period_id(y_q_end)

    # Forecast frames are collected here and concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0)
    forecasts = []

    for y_q_i in range(y_q_id_start, y_q_id_end + 1):

        # Observed rows of this base quarter (one per symbol)
        previous = df_pred[df_pred.Y_Q_ID_Base == y_q_i].copy()
        if previous.empty:
            # No data for this quarter - nothing to roll forward
            continue

        for y_q_pred_i in range(y_q_i + 1, y_q_i + n_periods):

            current = previous.copy()
            current['Y_Q_ID'] = y_q_pred_i

            # Shift the lag levels by one step; the previous target becomes L1
            current['L4'] = previous['L3']
            current['L3'] = previous['L2']
            current['L2'] = previous['L1']
            current['L1'] = previous['Rev_MA']

            # Recalculate the difference features from the shifted levels
            current = calc_features(current, calc_levels=False)

            # Predict only AFTER the features describe the quarter being
            # forecast; predicting from the previous row's features would
            # yield an estimate of the previous quarter and lag every
            # forecast by one period
            X_ = current.drop(non_feature_cols, axis=1)
            current['Rev_MA'] = model.predict(X_)

            forecasts.append(current)
            previous = current

    return pd.concat([df_pred, *forecasts])


In [None]:
# Forecast up to 39 quarters ahead from every base quarter in 2020-2021
df_plot = predict_mult_periods(model_rf, "2020-01-01", "2021-10-01", 40)

sym = 'MSFT'

df_plot_i = df_plot[df_plot.Symbol == sym].reset_index(drop=True).copy()

# Observed history: rows that are their own base quarter
df_plot_base = df_plot_i[df_plot_i.Y_Q_ID == df_plot_i.Y_Q_ID_Base].copy()

# # Reduce time horizon
# df_plot_base = df_plot_base[df_plot_base.Year >= (min_year_base - 4)]

# ID of the base quarter whose forecast path is drawn.
# .iloc[0] replaces int(Series), which is deprecated in pandas 2.1.
y_q_id_base = int(df_date_id.loc[df_date_id.Y_Q == "2021-01-01", 'Y_Q_ID'].iloc[0])

# ax_x, ax_y = ax_loc[i_]
fig, ax = plt.subplots(figsize=(20,10))
# Black: observed moving average; red: path started from the chosen base quarter
sns.lineplot(data=df_plot_base, x='Y_Q_ID', y='Rev_MA', color='black', ax=ax)
sns.lineplot(data=df_plot_i[df_plot_i.Y_Q_ID_Base==y_q_id_base], x='Y_Q_ID', y='Rev_MA', color='red', ax=ax)

# ax[ax_x,ax_y].plot(df_plot_base['Year'], df_plot_base['Dist_Amount'], color='black')

# for i__, year_base_i in enumerate(range(min_year_base, min_year_base + 4)):
#     df_plot_i_ = df_plot_i[df_plot_i.Year_Base == year_base_i]
#     ax[ax_x,ax_y].plot(df_plot_i_['Year'], df_plot_i_['Dist_Amount'], color=colors[i__])