In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import time
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'pandas.core.config_init'

In [None]:
import utils

# Read data

In [None]:
# Load the PLAsTiCC training metadata and light-curve tables from
# gzip-compressed CSV files (paths are relative to the notebook's directory).
metadata = pd.read_csv('../data/plasticc/plasticc_train_metadata.csv.gz')
data = pd.read_csv('../data/plasticc/plasticc_train_lightcurves.csv.gz')

In [None]:
# Keep only light-curve rows with detected_bool == 1 and only metadata rows
# with ddf_bool == 1.
# NOTE(review): `data` is not restricted to the object ids that remain in
# `metadata` -- confirm whether the ddf_bool filter was meant to apply to the
# light curves as well.
data = data.loc[data['detected_bool'] == 1]
metadata = metadata.loc[metadata['ddf_bool'] == 1]

In [None]:
# Preview the first rows of the metadata table.
metadata.head()

In [None]:
# Preview the first rows of the light-curve table.
data.head()

In [None]:
# Sorted array of the distinct object ids present in `data`.
object_ids = np.unique(data['object_id'])

In [None]:
# Integer passband code -> filter name.
passband2name = dict(enumerate(['u', 'g', 'r', 'i', 'z', 'y']))

# Integer passband code -> log10 of the wavelength associated with that filter
# (values presumably in Angstroms -- TODO confirm).
passband2lam = {
    passband: np.log10(wavelength)
    for passband, wavelength in enumerate(
        [3751.36, 4741.64, 6173.23, 7501.62, 8679.19, 9711.53]
    )
}

In [None]:
# Module-level scaler shared by scaler_transform(), which refits it on every call.
ss = StandardScaler()

# Visual analysis

In [None]:
def get_object(data, object_id):
    """Return the rows of ``data`` whose ``object_id`` column equals ``object_id``.

    The original row index is preserved; an unknown id yields an empty frame.
    """
    belongs_to_object = data['object_id'] == object_id
    return data[belongs_to_object]

In [None]:
def approximate_model(reg, anobject):
    """Fit ``reg`` on one light curve and evaluate it on a regular time grid.

    Negative flux values are replaced by zero (``flux_not_negativ``) before
    fitting. The evaluation grid is built by ``create_approx_object``; its
    ``flux`` and ``flux_err`` columns are overwritten with the two arrays
    returned by ``reg.predict``.

    Parameters
    ----------
    reg : object exposing ``fit(mjd, flux, flux_err, passband)`` and
        ``predict(mjd, passband)`` returning a ``(flux, flux_err)`` pair.
    anobject : DataFrame with ``mjd``, ``flux``, ``flux_err`` and ``passband``
        columns.

    Returns
    -------
    DataFrame holding the grid with the predicted flux and flux error.
    """
    observed = flux_not_negativ(anobject)
    reg.fit(
        observed['mjd'].values,
        observed['flux'].values,
        observed['flux_err'].values,
        observed['passband'].values,
    )

    approx_object = create_approx_object(observed)
    flux_pred, flux_err_pred = reg.predict(
        approx_object['mjd'].values,
        approx_object['passband'].values,
    )
    approx_object['flux'] = flux_pred
    approx_object['flux_err'] = flux_err_pred

    return approx_object

In [None]:
def plot_light_curves_compare(orig_object, approx_objects, titles=None):
    """Plot the original light curve, then one panel per approximation.

    The original object is drawn in its own figure via ``plot_light_curves``.
    A second, wide figure then shows each entry of ``approx_objects`` in a
    side-by-side subplot, one line plus scatter per passband.

    Parameters
    ----------
    orig_object : DataFrame with the observed light curve.
    approx_objects : sequence of DataFrames with ``mjd``, ``flux`` and
        ``passband`` columns.
    titles : optional sequence of ``1 + len(approx_objects)`` strings; the
        first titles the original figure, the rest title the panels in order.
        Defaults to empty titles.
    """
    if titles is None:
        titles = [""] * (len(approx_objects) + 1)
    plot_light_curves(orig_object, titles[0])

    plt.figure(figsize=(20, 4))
    n_panels = len(approx_objects)
    for panel, approx in enumerate(approx_objects, start=1):
        plt.subplot(1, n_panels, panel)
        ordered = approx.sort_values('mjd')
        for passband in range(6):
            band = get_passband(ordered, passband)
            mjd = band['mjd'].values
            flux = band['flux'].values
            plt.plot(mjd, flux, linewidth=0.5)
            plt.scatter(mjd, flux, label=passband2name[passband], linewidth=1)
        plt.xlabel('Modified Julian Date', size=14)
        plt.xticks(size=14, rotation=45)
        plt.ylabel('Flux', size=14)
        plt.yticks(size=14)
        plt.legend(loc='best', ncol=3, fontsize=14)
        plt.title(titles[panel], size=14)
    plt.show()

    

In [None]:
def get_passband(anobject, passband):
    """Select the rows of ``anobject`` whose ``passband`` column equals ``passband``."""
    in_band = anobject['passband'] == passband
    return anobject[in_band]

In [None]:
def add_log_lam(anobject):
    """Return a copy of ``anobject`` with a ``log_lam`` column.

    Each row's value is looked up in the module-level ``passband2lam`` mapping
    by the row's ``passband``; a passband missing from the mapping raises
    ``KeyError``. The input frame itself is not modified.
    """
    log_lam = np.array([passband2lam[band] for band in anobject['passband'].values])
    with_log_lam = anobject.copy()
    with_log_lam.loc[:, 'log_lam'] = log_lam
    return with_log_lam

In [None]:
def create_approx_object(anobject, n=1000):
    """Build a placeholder light curve on a regular time grid.

    For each of the six passbands (0..5) a frame with ``n`` evenly spaced
    ``mjd`` values between the minimum and maximum ``mjd`` of ``anobject`` is
    created, with ``object_id``, ``flux`` and ``flux_err`` set to 0 and
    ``detected_bool`` set to 1. The six frames are stacked (so the row index
    repeats 0..n-1 once per passband) and a ``log_lam`` column is appended via
    ``add_log_lam``.

    Parameters
    ----------
    anobject : DataFrame with an ``mjd`` column.
    n : number of grid points per passband.

    Returns
    -------
    DataFrame with ``6 * n`` rows.
    """
    mjd = anobject['mjd'].values
    per_band = [
        pd.DataFrame({
            'mjd': np.linspace(mjd.min(), mjd.max(), n),
            'object_id': 0,
            'passband': passband,
            'flux': 0,
            'flux_err': 0,
            'detected_bool': 1,
        })
        for passband in range(6)
    ]
    stacked = pd.concat(per_band, axis=0)
    return add_log_lam(stacked)

In [None]:
def is_good(anobject, min_obs=10, min_passbands=3, max_gap=50):
    """Decide whether a light curve is suitable for the experiments.

    An object is accepted (returns 1) only if all of the following hold:

    * it has no negative flux values;
    * at least ``min_passbands`` of the six passbands (0..5) contain at least
      ``min_obs`` observations each;
    * no two time-consecutive observations are more than ``max_gap`` apart
      in ``mjd``.

    Unlike the earlier version, every failed check returns immediately, so an
    object with zero or one observation is rejected instead of raising
    ``ValueError`` from a reduction over an empty array.

    Parameters
    ----------
    anobject : DataFrame with ``flux``, ``passband`` and ``mjd`` columns.
    min_obs : minimum number of observations for a passband to count.
    min_passbands : minimum number of passbands that must reach ``min_obs``.
    max_gap : largest allowed difference between consecutive ``mjd`` values.

    Returns
    -------
    int : 1 if the object passes every check, 0 otherwise.
    """
    # Nothing to evaluate; also protects the reductions below.
    if len(anobject) == 0:
        return 0

    # remove all objects with negative flux values
    if anobject['flux'].values.min() < 0:
        return 0

    # keep only objects with enough observations in enough passbands
    passbands = anobject['passband'].values
    n_dense = sum(
        int(np.count_nonzero(passbands == passband) >= min_obs)
        for passband in range(6)
    )
    if n_dense < min_passbands:
        return 0

    # keep only objects without large breaks in observations
    gaps = np.diff(np.sort(anobject['mjd'].values))
    # gaps is empty for a single observation; there is no break to measure then.
    if gaps.size > 0 and gaps.max() > max_gap:
        return 0

    return 1

In [None]:
def plot_light_curves(anobject, title="", size=(9, 4)):
    """Draw one light curve in a new figure, one line plus scatter per passband.

    Parameters
    ----------
    anobject : DataFrame with ``mjd``, ``flux`` and ``passband`` columns.
    title : figure title.
    size : ``(width, height)`` passed to ``plt.figure`` as ``figsize``.
    """
    ordered = anobject.sort_values('mjd')
    plt.figure(figsize=size)
    for passband in range(6):
        band = get_passband(ordered, passband)
        mjd = band['mjd'].values
        flux = band['flux'].values
        plt.plot(mjd, flux, linewidth=0.5)
        plt.scatter(mjd, flux, label=passband2name[passband], linewidth=1)
    plt.xlabel('Modified Julian Date', size=14)
    plt.xticks(size=14)
    plt.ylabel('Flux', size=14)
    plt.yticks(size=14)
    plt.legend(loc='best', ncol=3, fontsize=14)
    plt.title(title, size=14)
    plt.show()

In [None]:
def scaler_transform(anobject):
    """Return scaled features and the flux target for one object.

    The ``mjd`` and ``log_lam`` columns are passed through the module-level
    scaler ``ss`` with ``fit_transform``, so the scaler is refit on every call.

    Returns
    -------
    (X, y) : the transformed two-column feature array and the ``flux`` values.
    """
    features = anobject[['mjd', 'log_lam']].values
    X = ss.fit_transform(features)
    y = anobject['flux'].values
    return X, y

In [None]:
def compile_obj(t, flux, flux_err, passband):
    """Assemble parallel arrays into a light-curve DataFrame.

    Parameters
    ----------
    t : observation times, stored in the ``mjd`` column.
    flux : flux values, stored in the ``flux`` column.
    flux_err : flux uncertainties, stored in the ``flux_err`` column.
    passband : passband codes, stored in the ``passband`` column.

    Returns
    -------
    DataFrame with columns ``mjd``, ``flux``, ``flux_err``, ``passband``.
    """
    obj = pd.DataFrame()
    obj['mjd']      = t
    obj['flux']     = flux
    # Previously this column was filled from `flux`, silently discarding flux_err.
    obj['flux_err'] = flux_err
    obj['passband'] = passband
    return obj

In [None]:
def flux_not_negativ(anobject):
    """Return a copy of ``anobject`` in which negative ``flux`` values are set to 0.

    Non-negative values are kept as they are; the input frame is not modified.
    """
    clipped = anobject.copy()
    flux = clipped['flux']
    clipped['flux'] = np.where(flux < 0, 0, flux)
    return clipped

In [None]:
# Plot every 50th object that passes the is_good() quality filter,
# printing its id first.
for i in object_ids[::50]:
    anobject = get_object(data, i)

    if is_good(anobject):
        print(i)

        plot_light_curves(anobject)
        plt.show()

# Single-object test

In [None]:
# Object used for the single-object experiment. The ids 43812 and 34299 were
# noted next to this one in the original cell (presumably alternative
# candidates -- confirm).
anobject = add_log_lam(get_object(data, 745))

plot_light_curves(anobject)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the object's observations 50/50 into train and test rows; the fixed
# random_state makes the split reproducible.
anobject_train, anobject_test = train_test_split(anobject, test_size=0.5, random_state=11)

In [None]:
# Regressor variant handed to LR_aug.LinearRegressionAugmentation through its `mod` argument.
kernel = 'LR'

In [None]:
import LR_aug

# Start the wall-clock timer that the metrics cell later reports as "Work sec".
timer = time.time()
# Replace negative flux values in the training half with zero before fitting.
anobject_train = flux_not_negativ(anobject_train)
model = LR_aug.LinearRegressionAugmentation(passband2lam, mod=kernel)
model.fit(anobject_train['mjd'].values, anobject_train['flux'].values, 
          anobject_train['flux_err'].values, anobject_train['passband'].values)

# Predict flux and its error at the (mjd, passband) pairs of the held-out half.
flux_pred, flux_err_pred = model.predict(anobject_test['mjd'].values, anobject_test['passband'].values, copy=True)

# Generate an augmented light curve between the object's first and last mjd.
# NOTE(review): n_obs presumably sets the number of generated time points -- confirm in LR_aug.
t_aug, flux_aug, flux_err_aug, passband_aug = model.augmentation(anobject['mjd'].min(), 
                                                                 anobject['mjd'].max(), n_obs=1000)

In [None]:
# Score the fitted model on the training half (the same arrays passed to fit).
# NOTE(review): what `score` returns is defined in LR_aug and not visible here.
model.score(anobject_train['mjd'].values, anobject_train['flux'].values, 
            anobject_train['flux_err'].values, anobject_train['passband'].values)

In [None]:
# Wrap the test-set predictions and the augmented curve into DataFrames so
# they can be drawn with plot_light_curves().
anobject_test_pred = compile_obj(anobject_test['mjd'].values, flux_pred, 
                                 flux_err_pred, anobject_test['passband'].values)
anobject_aug = compile_obj(t_aug, flux_aug, flux_err_aug, passband_aug)

In [None]:
# Show the held-out observations for visual comparison with the predictions.
plot_light_curves(anobject_test, "Test observations")

In [None]:
# Show the model output: predictions at the test points, then the augmented curve.
plot_light_curves(anobject_test_pred, "Predictions on test")
plot_light_curves(anobject_aug, "Approximation")

In [None]:
# Compare the true test fluxes with the predicted ones and print each metric.
rmse, mae, rse, rae, mape = utils.regression_quality_metrics_report(
    anobject_test['flux'].values,
    anobject_test_pred['flux'].values,
)
for label, value in (
    ("RMSE: ", rmse),
    ("MAE: ", mae),
    ("RSE: ", rse),
    ("RAE: ", rae),
    ("MAPE: ", mape),
):
    print(label, value)
# Elapsed time is measured from `timer`, which was set just before fitting.
print("Work sec: %f" % (time.time() - timer))

# Test on many objects

In [None]:
# Evaluate the `kernel` regressor on every object that passes is_good():
# fit on a random half of the observations, predict the other half, and
# collect one row of quality metrics per object.
report_rows = []

for i in object_ids: 
    # get an object
    anobject_i = get_object(data, i)
    anobject_i = add_log_lam(anobject_i)

    if not is_good(anobject_i): continue

    print("Object ", i, " with ", len(anobject_i), " observations")

    # train / test split
    anobject_train, anobject_test = train_test_split(anobject_i, test_size=0.5, random_state=11)

    anobject_train = flux_not_negativ(anobject_train)
    model = LR_aug.LinearRegressionAugmentation(passband2lam, mod=kernel)
    model.fit(anobject_train['mjd'].values, anobject_train['flux'].values, 
              anobject_train['flux_err'].values, anobject_train['passband'].values)

    flux_pred, flux_err_pred = model.predict(anobject_test['mjd'].values, anobject_test['passband'].values, copy=True)
    
    anobject_test_pred = compile_obj(anobject_test['mjd'].values, flux_pred, flux_err_pred, anobject_test['passband'].values)
    
    metric = utils.regression_quality_metrics_report(anobject_test['flux'].values, anobject_test_pred['flux'].values)
    # list() keeps the row well-formed even if the metrics come back as a tuple or array.
    report_rows.append([i] + list(metric))

# Build the frame once from the collected rows instead of enlarging it
# row by row with .loc inside the loop.
report = pd.DataFrame(report_rows, columns=["ID", 'RMSE', 'MAE', 'RSE', 'RAE', 'MAPE'])


In [None]:
# Column-wise mean of the per-object metrics, rounded to two decimals.
# NOTE(review): `report` also contains the `ID` column, so its mean may show
# up here although it carries no meaning.
np.round(report.mean(), 2)

In [None]:
# Display the per-object metrics table.
report

# Comparison of models

In [None]:
# For every object that passes is_good(), fit each regressor variant on the
# same train half and record its MAPE on the test half.
key = ["Ridge", "Lasso", "LR", "ElasticNet"]
mape_rows = []

for i in object_ids: 
    # get an object
    anobject_i = get_object(data, i)
    anobject_i = add_log_lam(anobject_i)

    if not is_good(anobject_i): continue
    for name in key:
        # train / test split (fixed random_state, so every model sees the same split)
        anobject_train, anobject_test = train_test_split(anobject_i, test_size=0.5, random_state=11)

        anobject_train = flux_not_negativ(anobject_train)
        model = LR_aug.LinearRegressionAugmentation(passband2lam, mod=name)
        model.fit(anobject_train['mjd'].values, anobject_train['flux'].values, 
                  anobject_train['flux_err'].values, anobject_train['passband'].values)

        flux_pred, flux_err_pred = model.predict(anobject_test['mjd'].values, anobject_test['passband'].values, copy=True)

        anobject_test_pred = compile_obj(anobject_test['mjd'].values, flux_pred, flux_err_pred, anobject_test['passband'].values)

        [rmse, mae, rse, rae, mape] = utils.regression_quality_metrics_report(anobject_test['flux'].values, anobject_test_pred['flux'].values)
        mape_rows.append([i, name, mape])

# Build the frame once from the collected rows instead of enlarging it
# row by row with .loc inside the loop.
mape_line = pd.DataFrame(mape_rows, columns=["ID", 'model', 'MAPE'])
    

In [None]:
# Display the per-object, per-model MAPE table.
mape_line

In [None]:
# Split the MAPE table into one frame per model, in the order
# Ridge, Lasso, LR, ElasticNet.
mape_line_ridge, mape_line_lasso, mape_line_lr, mape_line_en = (
    mape_line[mape_line['model'] == model_name]
    for model_name in ("Ridge", "Lasso", "LR", "ElasticNet")
)

# Give every per-model frame a fresh 0..n-1 index so the four frames can be
# plotted against a common x axis.
for per_model in (mape_line_ridge, mape_line_lasso, mape_line_lr, mape_line_en):
    per_model.index = np.arange(len(per_model))

In [None]:
# Plot MAPE per model in windows of 50 rows that start every 42 rows, so
# consecutive figures overlap by 8 rows.
# NOTE(review): the loop stops at len - 43, so nothing is drawn for 43 or
# fewer rows and the last rows may never be plotted -- confirm this is intended.
# NOTE(review): the x values are the positional 0..n-1 index, not object ids,
# although the axis is labelled 'ID'.
curves = (
    (mape_line_ridge, ':b', 'Ridge'),
    (mape_line_lasso, '--r', 'Lasso'),
    (mape_line_lr, 'g', 'LR'),
    (mape_line_en, '-.y', 'ElasticNet'),
)
for i in range(0, len(mape_line_ridge)-43, 42):
    window = slice(i, i + 50)
    plt.figure(figsize=(6, 4))
    for frame, fmt, curve_label in curves:
        plt.plot(frame[window].index, frame['MAPE'][window].values, fmt, label=curve_label)
    plt.xlabel('ID')
    plt.ylabel('MAPE')
    plt.title('Mean absolute percentage error model')
    plt.legend()
    plt.show()

In [None]:
# Print the mean MAPE over all evaluated objects for each model.
for label, frame in (
    ("MAPE with Ridge:", mape_line_ridge),
    ("MAPE with Lasso:", mape_line_lasso),
    ("MAPE with LR:", mape_line_lr),
    ("MAPE with ElasticNet:", mape_line_en),
):
    print(label, frame['MAPE'].mean())

In [None]:
# One regressor per variant; the same instances are refit for every object.
regs = [
    LR_aug.LinearRegressionAugmentation(passband2lam, mod_name)
    for mod_name in ("Ridge", "Lasso", "LR", "ElasticNet")
]
compare_titles = ['Original', "Ridge", 'Lasso', "LR", "ElasticNet"]

# For every 50th object that passes is_good(), plot the original light curve
# next to the approximation produced by each regressor.
for i in object_ids[::50]:
    anobject = add_log_lam(get_object(data, i))

    if is_good(anobject):
        print(i)

        objects = []
        for reg in regs:
            objects.append(approximate_model(reg, anobject))

        plot_light_curves_compare(anobject, objects, compare_titles)
        plt.show()