In [1]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Train a simple baseline (linear regression models)

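Metric: [MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error) (mean absolute percentage error), reported per target variable and averaged over all target variables.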

In [2]:
# Read the input table from disk.
df = pd.read_csv(
    'ephemerides.csv',
    index_col=0,  # the first CSV column becomes the DataFrame index
)

In [3]:
# Columns the models have to predict.
target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
# Every remaining column except 'time' is used as a model input.
feature_cols = [name for name in df.columns
                if name != 'time' and name not in target_cols]

# Row budget: rows whose first target is NaN have no label and are counted as
# the prediction (test) set; 20% of the labelled rows are held out for validation.
# NOTE(review): the split below is purely positional, so it presumes the
# NaN-target rows are the last n_test rows of df -- confirm against the data.
n_rows = df.shape[0]
n_labelled = n_rows - np.isnan(df[target_cols[0]]).sum()
n_val = int(0.2 * n_labelled)
n_train = n_labelled - n_val
n_test = n_rows - n_labelled

print(f"n_train: {n_train}\nn_val: {n_val}\nn_test: {n_test}")
assert n_train + n_val + n_test == df.shape[0]

# Consecutive row ranges: [train | validation | test].
train_rows = slice(None, n_train)
val_rows = slice(n_train, n_train + n_val)
test_rows = slice(n_train + n_val, None)

# Feature matrices, plus one target array per target column (keyed by column name).
X_train = df[feature_cols].values[train_rows]
y_train = {col: df[col].values[train_rows] for col in target_cols}
X_val = df[feature_cols].values[val_rows]
y_val = {col: df[col].values[val_rows] for col in target_cols}
X_test = df[feature_cols].values[test_rows]

# Sanity checks: every array has the row count computed above.
assert len(X_train) == len(y_train[target_cols[0]]) == n_train
assert len(X_val) == len(y_val[target_cols[0]]) == n_val
assert len(X_test) == n_test

n_train: 17520
n_val: 4380
n_test: 9400


In [7]:
def _mape(y_true, y_pred):
    """Return the mean absolute percentage error of y_pred w.r.t. y_true, in percent."""
    return np.mean(np.abs((y_pred - y_true) / y_true)) * 100


# Maps each target variable to the linear-regression predictions for X_test.
y_test = {}

_mape_train_list = []
_mape_val_list = []
for target, y_tr in y_train.items():
    y_va = y_val[target]

    # Score a model that has only seen the training rows.
    scorer = LinearRegression().fit(X_train, y_tr)
    mape_train = _mape(y_tr, scorer.predict(X_train))
    _mape_train_list.append(mape_train)
    mape_val = _mape(y_va, scorer.predict(X_val))
    _mape_val_list.append(mape_val)

    # Refit on train + validation rows before predicting the unlabelled rows.
    model = LinearRegression().fit(
        np.concatenate([X_train, X_val], axis=0),
        np.concatenate([y_tr, y_va]),
    )
    y_test[target] = model.predict(X_test)

    print(target)
    print("MAPE train: ", mape_train)
    print("MAPE val: ", mape_val, "\n")

# Summary: MAPE averaged over all target variables.
print("------")
print("mean MAPE train: ", np.mean(_mape_train_list))
print("mean MAPE val: ", np.mean(_mape_val_list))

x
MAPE train:  2.7567727600464633
MAPE val:  9.020985138294844 

y
MAPE train:  1.9390680432773442
MAPE val:  0.9039356458499591 

z
MAPE train:  3.0091830469552856
MAPE val:  0.9205662275527899 

Vx
MAPE train:  1.4180215037857933
MAPE val:  2.366280615350265 

Vy
MAPE train:  1.787123028626488
MAPE val:  1.6511369096904502 

Vz
MAPE train:  1.9287174769356301
MAPE val:  1.8758708530039234 

------
mean MAPE train:  2.139814309937834
mean MAPE val:  2.7897958982903717


In [5]:
# Collect the per-target test predictions into one table (one column per
# target variable) and write it to disk.
# The index is taken from the last n_test rows of df. The slice starts at
# `len(df) - n_test` rather than `-n_test` because `df.index[-0:]` is the
# WHOLE index, not an empty one, so the negative form breaks when n_test == 0.
predictions = pd.DataFrame(y_test, index=df.index[len(df) - n_test:])
predictions.to_csv('predictions.csv')