In [None]:
import numpy as np
import pandas as pd
import itertools
import importlib
import h5py
from astropy.stats import sigma_clip
import matplotlib.pyplot as plt
from scipy import stats
from scipy import special
from scipy import integrate
from scipy import interpolate
from scipy import linalg
from scipy import signal
from scipy.optimize import curve_fit
import time
from pathlib import Path
import os
import re
import random
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel
from typing import Tuple

# Default figure geometry for every plot in this notebook: 8x6 inches at 120 dpi.
plt.rcParams.update({
    'figure.figsize': (8, 6),
    'figure.dpi': 120,
})
# LaTeX text rendering is not switched on here; uncomment the line below to enable it.
#plt.rcParams['text.usetex'] = True

import sys

sys.path.insert(0, '../..')

from modules import prh_mc_utils as pmu
importlib.reload(pmu)

In [None]:
# Short alias for the project helper that merges two image light curves.
# It is called further down as comb_flux(mag1, mag2, magerr1, magerr2) and its
# result is unpacked into a (combined values, combined errors) pair.
# NOTE(review): judging by its name the helper presumably converts magnitudes to
# fluxes and sums them — confirm against modules/prh_mc_utils.
comb_flux = pmu.mags_to_fluxsum

In [None]:
# Build one "QSO base" HDF5 file per (object, image pair):
#   1. read the light-curve table of the object,
#   2. combine the two selected images with comb_flux,
#   3. fit a Matern-3/2 Gaussian process to the combined curve and evaluate it
#      on a regular time grid,
#   4. estimate the structure function of the GP mean and fit a power law to
#      its short-lag part in log-log space,
#   5. hand everything to pmu.create_qso_base_file.

# --- Configuration ---------------------------------------------------------
# Fraction of the largest structure-function lag that is kept for the fit.
cut_lag_frac = 0.40
# Seed for the random restarts of the GP hyper-parameter optimizer, so that a
# "Restart & Run All" reproduces the same fits.
gp_random_state = 0
# Number of points of the regular prediction grid and optional padding (same
# units as the time column) added on both sides of the observed time range.
N = 2000
dt_extension = 0

# glob() order depends on the filesystem; sorting makes the run deterministic.
files = sorted(Path('../../data/cnn_base_data/original_data/').glob("*.txt"))
outdir = Path('../../data/cnn_base_data/qso_base_data')
outdir.mkdir(parents=True, exist_ok=True)

# Columns named "mag_<LETTER>" identify the lensed images present in a table.
mag_col_pattern = re.compile(r'mag_([A-Z])')

for file in files:
    # Everything in this section depends only on the file, not on the image
    # pair, so it is done once per file instead of once per pair.
    qso_id = file.name.split('_')[0]
    qso_data = pd.read_table(file)
    images = [match.groups()[0]
              for match in map(mag_col_pattern.search, qso_data.columns)
              if match]
    t = qso_data['mhjd'].to_numpy(dtype=np.float64)
    mags = {key: qso_data[f'mag_{key}'] for key in images}
    magerrs = {key: qso_data[f'magerr_{key}'] for key in images}

    # HE0435 is processed for every pair of its four images; all other
    # objects only for the (A, B) pair.
    if 'HE0435' in file.name:
        imgs = list(itertools.combinations(['A', 'B', 'C', 'D'], 2))
    else:
        imgs = [('A', 'B')]

    for img1, img2 in imgs:
        mag1 = mags[img1]
        mag2 = mags[img2]
        magerr1 = magerrs[img1]
        magerr2 = magerrs[img2]

        # Combined light curve of the two images and its uncertainty.
        y_input, err_input = comb_flux(mag1, mag2, magerr1, magerr2)

        qso_dict = {'t': t,
                    img1: {'y': mag1, 'err_y': magerr1},
                    img2: {'y': mag2, 'err_y': magerr2},
                    f'{img1}+{img2}': {'y': y_input, 'err_y': err_input}
                   }

        # --- Gaussian-process fit ----------------------------------------
        kernel = ConstantKernel(2, (1e-3, 1e2)) * Matern(length_scale=200.0, length_scale_bounds=(1, 300), nu=1.5)

        # alpha carries the per-point measurement variance; random_state fixes
        # the starting points of the optimizer restarts.
        gp = GaussianProcessRegressor(kernel=kernel, alpha=err_input**2, n_restarts_optimizer=10,
                                      optimizer='fmin_l_bfgs_b', normalize_y=True,
                                      random_state=gp_random_state)

        gp.fit(np.expand_dims(t, 1), y_input)

        # min()/max() instead of t[0]/t[-1]: identical for time-sorted data,
        # and still spans the full range if the table is not sorted.
        support, step = np.linspace(t.min() - dt_extension, t.max() + dt_extension, N, retstep=True)

        y_pred, cov_pred = gp.predict(np.expand_dims(support, 1), return_cov=True)
        # Round-off can make diagonal entries of the predictive covariance
        # slightly negative; clip them so the square root never yields NaN.
        sigma_pred = np.sqrt(np.clip(np.diag(cov_pred), 0.0, None))
        # NOTE(review): L and win are not used anywhere else in this cell; they
        # are kept in case later cells rely on them — confirm and drop if not.
        L = np.linalg.cholesky(cov_pred)
        win = int(dt_extension/step)

        gp_dict = {'t': support, 'y_pred': y_pred, 'sigma_pred': sigma_pred, 'cov_pred': cov_pred}

        # --- Structure function and power-law fit ------------------------
        tau, v = pmu.estimate_structure_func_from_data(support, y_pred, sigma_pred, n_bins=50)
        # Discard bins with a negative (or NaN) structure-function estimate.
        non_negative = v >= 0
        tau = tau[non_negative]
        v = v[non_negative]
        if tau.size == 0:
            raise ValueError(f'{qso_id} {img1}{img2}: structure function has no non-negative bins')

        # assumes tau is sorted ascending so tau[-1] is the largest lag — TODO confirm
        max_lag = cut_lag_frac*tau[-1]

        # The fit is done in log-log space, so only strictly positive lags and
        # values can be used (log10(0) = -inf would turn the fit into NaN).
        fit_mask = (tau <= max_lag) & (tau > 0) & (v > 0)
        tau_cut = tau[fit_mask]
        v_cut   = v[fit_mask]
        if tau_cut.size < 2:
            raise ValueError(f'{qso_id} {img1}{img2}: fewer than two structure-function '
                             f'points with lag <= {max_lag} available for the fit')

        p = stats.linregress(np.log10(tau_cut), np.log10(v_cut))

        sf_dict = {'tau_cut': tau_cut,
                   'v_cut': v_cut,
                   'slope': p[0],
                   'intercept': p[1],
                   'tau_not_cut': tau,
                   'v_not_cut': v}

        # --- Output ------------------------------------------------------
        pmu.create_qso_base_file(qso_dict=qso_dict,
                                 gp_dict=gp_dict,
                                 sf_dict=sf_dict,
                                 outfile=outdir/f'{qso_id}_{img1}{img2}_cut_{cut_lag_frac}.h5')