In [1]:
from sklearn.linear_model import LinearRegression as LR
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR as LSVR

# Train a simple baseline (linear models: linear regression and linear SVR)

Metric: [MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error) (mean absolute percentage error)

In [2]:
# Read the ephemerides table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='ephemerides.csv', index_col=0)

In [3]:
# Columns to predict.
target_cols = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
# Every other column except 'time' is treated as a feature.
feature_cols = [col for col in df.columns
                if not (col in target_cols or col == 'time')]

# Row counts. Rows whose first target is NaN are the ones to predict;
# the last 20% of the labelled rows are held out for validation.
# NOTE(review): the positional slicing below assumes the unlabelled rows are
# the trailing rows of df - confirm against the data file.
n_total = df.shape[0]
n_labeled = n_total - np.isnan(df[target_cols[0]]).sum()
n_val = int(0.2 * n_labeled)
n_train = n_labeled - n_val
n_test = n_total - n_labeled

print(f"n_train: {n_train}\nn_val: {n_val}\nn_test: {n_test}")
assert n_train + n_val + n_test == df.shape[0]

# Positional split: [0, n_train) train, [n_train, val_end) validation, rest test.
val_end = n_train + n_val
features = df[feature_cols].values
X_train = features[:n_train]
X_val = features[n_train:val_end]
X_test = features[val_end:]

# One target vector per target column, keyed by column name.
y_train = {name: df[name].values[:n_train] for name in target_cols}
y_val = {name: df[name].values[n_train:val_end] for name in target_cols}

assert X_train.shape[0] == y_train[target_cols[0]].shape[0] == n_train
assert X_val.shape[0] == y_val[target_cols[0]].shape[0] == n_val
assert X_test.shape[0] == n_test

n_train: 17520
n_val: 4380
n_test: 9400


In [4]:
from sklearn.base import BaseEstimator, RegressorMixin

class SplitModel(BaseEstimator, RegressorMixin):
    """Fit one independent regressor per group id.

    The first column of ``X`` is cast to ``int32`` and used as the group id;
    the remaining columns are the features handed to that group's model.

    Parameters
    ----------
    model : callable
        Estimator class (or factory). It is called as ``model(**params)`` once
        per distinct group id during :meth:`fit`.
    **params
        Keyword arguments forwarded to ``model`` on every construction.

    NOTE(review): the constructor arguments are stored under private names
    (``_model``, ``_params``). scikit-learn's ``get_params``/``clone``
    machinery normally expects attributes named after the ``__init__``
    parameters - confirm before using this class with ``clone`` or a
    hyper-parameter search.
    """

    def __init__(self, model, **params):
        self._model = model
        self._params = params
        # Maps group id -> fitted model for that group.
        self._id_to_model = {}

    def fit(self, X, y):
        """Fit a fresh model for every group id found in ``X[:, 0]``.

        Parameters
        ----------
        X : 2-D array; column 0 holds the group id, columns 1.. the features.
        y : 1-D array of targets aligned with the rows of ``X``.

        Returns
        -------
        self
        """
        # Cast the id column once instead of once per group.
        group_ids = X[:, 0].astype("int32")

        # Drop models from any previous fit so ids that are absent from this
        # training set cannot be served by a stale model in predict().
        self._id_to_model = {}

        for sid in np.unique(group_ids):
            mask = group_ids == sid
            base = self._model(**self._params)
            self._id_to_model[sid] = base.fit(X[:, 1:][mask], y[mask])

        return self

    def predict(self, X):
        """Predict each row with the model fitted for that row's group id.

        Parameters
        ----------
        X : 2-D array laid out as in :meth:`fit`.

        Returns
        -------
        1-D float array with one prediction per row of ``X``.

        Raises
        ------
        AssertionError
            If ``X`` contains a group id for which no model was fitted.
        """
        group_ids = X[:, 0].astype("int32")
        ids = np.unique(group_ids)

        unknown = [sid for sid in ids if sid not in self._id_to_model]
        assert not unknown, f"no fitted model for group id(s): {unknown}"

        prediction = np.zeros(X.shape[0])
        for sid in ids:
            mask = group_ids == sid
            prediction[mask] = self._id_to_model[sid].predict(X[:, 1:][mask])

        return prediction

In [5]:
from sklearn.preprocessing import StandardScaler as SS

# Fit the scaler on the training rows only, then apply that same fitted
# transform to every split.
scaler = SS().fit(X_train)
X_train, X_val, X_test = [scaler.transform(split)
                          for split in (X_train, X_val, X_test)]

In [6]:
# Test-set predictions per target: {target name: predicted values}
y_test = {}

# Per-target MAPE scores, in the iteration order of y_train.
_mape_train_list = []
_mape_val_list = []

# Estimator class per target: 'x' uses LinearSVR, all others LinearRegression.
models = {
    "x": LSVR,
    "y": LR,
    "z": LR,
    "Vx": LR,
    "Vy": LR,
    "Vz": LR,
}

# Constructor keyword arguments per target (empty dict -> library defaults).
params = {
    "x": {"C": 50., "random_state": 146},
    "y": {},
    "z": {},
    "Vx": {},
    "Vy": {},
    "Vz": {},
}


def _mape(y_true, y_pred):
    """Mean absolute percentage error of ``y_pred`` against ``y_true``, in percent."""
    return np.mean(np.abs((y_pred - y_true) / y_true)) * 100


for target, y_tr in y_train.items():
    y_va = y_val[target]
    estimator_cls = models[target]
    estimator_kwargs = params[target]

    # Score a model fitted on the training split only.
    model = estimator_cls(**estimator_kwargs).fit(X_train, y_tr)
    mape_train = _mape(y_tr, model.predict(X_train))
    _mape_train_list.append(mape_train)
    mape_val = _mape(y_va, model.predict(X_val))
    _mape_val_list.append(mape_val)

    # Refit on train + validation rows before predicting the test rows.
    model = estimator_cls(**estimator_kwargs).fit(
        np.concatenate([X_train, X_val], axis=0),
        np.concatenate([y_tr, y_va]))
    y_test[target] = model.predict(X_test)

    print(target)
    print("MAPE train: ", mape_train)
    print("MAPE val: ", mape_val, "\n")

print("------")
print("mean MAPE train: ", np.mean(_mape_train_list))
print("mean MAPE val: ", np.mean(_mape_val_list))

x
MAPE train:  2.864334376253391
MAPE val:  4.303477513120269 

y
MAPE train:  1.9390680432773817
MAPE val:  0.9039356458500325 

z
MAPE train:  3.0091830469543233
MAPE val:  0.9205662275528902 

Vx
MAPE train:  1.4180215037853052
MAPE val:  2.3662806153444893 

Vy
MAPE train:  1.7871230285990598
MAPE val:  1.6511369096539334 

Vz
MAPE train:  1.9287174769308062
MAPE val:  1.8758708529952959 

------
mean MAPE train:  2.157741245966711
mean MAPE val:  2.0035446274194855


In [7]:
# Write the test-set predictions to CSV, indexed like the last n_test rows of df.
# The slice uses an explicit start offset: `df.index[-n_test:]` would select the
# WHOLE index when n_test == 0 (because -0 == 0) and make the DataFrame
# constructor fail on a length mismatch.
predictions = pd.DataFrame(y_test, index=df.index[df.shape[0] - n_test:])
predictions.to_csv('predictions.csv')