In [1]:
#Astropy
import astropy
from astropy.io import fits
from astropy.table import Table

# dlnpyutils
# from dlnpyutils.utils import bspline, mad
from dlnpyutils import utils as dln

### Itertools
import itertools as it

# Matplotlib
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParams.update({'font.size': 25})

#Numpy/Scipy
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline as IUS
from scipy.interpolate import interp1d, interp2d
from scipy.optimize import curve_fit
from scipy.stats import binned_statistic, binned_statistic_2d

### fitting
import statsmodels.api as sm

# tqdm
from tqdm.notebook import tqdm

In [2]:
### Age of Universe
# Adopted age of the Universe in Gyr; the quoted uncertainty is +/- 0.020 Gyr.
universe = 13.787 #+/- 0.020 Gyr

# PARSEC

In [3]:
# Load the PARSEC isochrone table, keep only the rows whose `label` equals 3.0,
# order them by logAge, and retain just the columns used further down.
# NOTE(review): absolute, user-specific path -- presumably this should come from
# a configurable data directory; confirm before sharing the notebook.
parsec_path = '/Users/joshuapovick/Desktop/Research/parsec/parsec36_DR2_EDR3.fits'
keep_cols = ('MH','Mass','logAge','logTe','logg','G_BPEDR3mag','GEDR3mag','G_RPEDR3mag',
             'Jmag','Hmag','Ksmag')

massive = fits.getdata(parsec_path)
massive = Table(massive[massive['label'] == 3.0])
massive = massive[np.argsort(massive['logAge'])][keep_cols]

In [4]:
# ### PCA
# from sklearn.decomposition import PCA, KernelPCA

# ### Setup 
# data = np.array([massive['G_BPEDR3mag'],massive['GEDR3mag'],massive['G_RPEDR3mag'],massive['Jmag'],massive['Hmag'],massive['Ksmag']]).T

# ### Determine Number of factors
# pca = PCA(n_components=6).fit(data)#_rescaled)
# plt.rcParams["figure.figsize"] = (12,6)

# fig, ax = plt.subplots()

# y = np.cumsum(pca.explained_variance_ratio_)
# xi = np.arange(1, len(pca.explained_variance_ratio_)+1, step=1)

# plt.ylim(0.0,1.1)
# plt.plot(xi, y, marker='o', linestyle='--', color='b')

# plt.xlabel('Number of Components')
# plt.xticks(np.arange(0, 7, step=1)) #change from 0-based array index to 1-based human-readable label
# plt.ylabel('Cumulative variance (%)')
# plt.title('The number of components needed to explain variance')

# plt.axhline(y=0.99, color='r', linestyle='-')
# plt.text(0.5, 0.85, '99% cut-off threshold', color = 'red', fontsize=16)

# ax.grid(axis='x')
# plt.show()

In [5]:
# new_phot = PCA(n_components=1).fit(data).transform(data)

In [6]:
# plt.figure(figsize=[10,10])
# plt.scatter(massive['GEDR3mag']-new_phot,massive['GEDR3mag'],c=massive['MH'],cmap='nipy_spectral')
# plt.gca().invert_yaxis()

In [7]:
# plt.figure(figsize=[10,10])
# plt.scatter(new_phot[:,1]-new_phot[:,0],new_phot[:,0],c=massive['MH'],cmap='nipy_spectral')
# plt.gca().invert_yaxis()

In [8]:
# new_phot[:,0]

In [9]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Assemble the regression table: the response (logAge) first, then every
# predictor followed by its square and its cube, in that column order.
df = pd.DataFrame()
df['logAge'] = massive['logAge']

predictors = ['MH', 'logTe', 'logg', 'GEDR3mag']

### Linear
for name in predictors:
    df[name] = massive[name]

# ### Linear Interactions
# df['MH_logTe'] = np.multiply(df['MH'],df['logTe'])
# df['MH_logg'] = np.multiply(df['MH'],df['logg'])
# df['MH_GEDR3mag'] = np.multiply(df['MH'],df['GEDR3mag'])
# df['logTe_logg'] = np.multiply(df['logTe'],df['logg'])
# df['logTe_GEDR3mag'] = np.multiply(df['logTe'],df['GEDR3mag'])
# df['logg_GEDR3mag'] = np.multiply(df['logg'],df['GEDR3mag'])

### Square, then Cubic ('sq' columns are all created before any 'cu' column)
for suffix, power in (('sq', 2), ('cu', 3)):
    for name in predictors:
        df[name + suffix] = df[name]**power

In [10]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Starting from a baseline score of 0.0, every round fits one OLS model per
    unused column (added to the columns selected so far) and keeps the column
    whose model has the highest adjusted R-squared. Selection stops as soon
    as no candidate strictly improves the score, or no columns remain.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared

    Raises:
    -------
    KeyError: if `response` is not one of the columns of `data`
    """
    def build_formula(predictors):
        # '1' is the explicit intercept; with no predictors this yields
        # "<response> ~ 1", the intercept-only model.
        return "{} ~ {}".format(response, ' + '.join(list(predictors) + ['1']))

    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        scored = [
            (smf.ols(build_formula(selected + [candidate]), data).fit().rsquared_adj,
             candidate)
            for candidate in remaining
        ]
        # Ties on the score are broken by the candidate name (tuple ordering).
        best_new_score, best_candidate = max(scored)
        # Stop on "no strict improvement". The previous loop condition
        # (current_score == best_new_score) never terminated when the best
        # candidate exactly tied the current score, because nothing was
        # removed from `remaining` and the two scores stayed equal.
        if not best_new_score > current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    return smf.ols(build_formula(selected), data).fit()

In [11]:
# Run forward selection over every column of df, with logAge as the response.
model = forward_selected(df, 'logAge')

In [13]:
# Show the formula string of the fitted model (terms appear in selection order).
model.model.formula

'logAge ~ GEDR3magcu + logg + GEDR3mag + logTe + logTesq + logTecu + MH + loggcu + MHsq + GEDR3magsq + MHcu + loggsq + 1'

In [12]:
# ### Split Training
# from sklearn.model_selection import train_test_split
# import pandas as pd

# df = massive.to_pandas()
# df.drop(columns=['Mass','logAge'])

# ptrain, ptest, atrain, atest = train_test_split(massive['MH','logTe','logg','G_BPEDR3mag','GEDR3mag','G_RPEDR3mag','Jmag','Hmag','Ksmag'],
#                                                 massive['logAge'],test_size=0.20)



In [13]:
# import seaborn as sns

# cov_mat = np.round(np.cov(np.array([massive['MH'],massive['logTe'],massive['logg'],massive['G_BPEDR3mag'],
#                                     massive['GEDR3mag'],massive['G_RPEDR3mag']])))

# plt.figure
# plt.imshow(cov_mat)
# plt.colorbar()
# plt.show()

# eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
# print("Eigenvector: \n",eigen_vectors,"\n")
# print("Eigenvalues: \n", eigen_values, "\n")

# variance_explained = []
# for i in eigen_values:
#      variance_explained.append((i/sum(eigen_values))*100)
        
# print(variance_explained)

# cumulative_variance_explained = np.cumsum(variance_explained)
# print(cumulative_variance_explained)

# plt.figure(figsize=[12,7])
# sns.lineplot(x = [1,2,3,4,5,6], y=cumulative_variance_explained)
# plt.xlabel("Number of components")
# plt.ylabel("Cumulative explained variance")
# plt.title("Explained variance vs Number of components")

In [None]:
### Find all models

### PCA
from sklearn.decomposition import PCA, KernelPCA
import statsmodels.api as sm 

# One row per star, one column per input quantity.
data = np.array([massive['logTe'],massive['GEDR3mag'],massive['MH'],massive['logg']]).T

#transform data to new basis
new_data = PCA(n_components=3).fit(data).transform(data)

#create new variables

# linear terms
x1 = new_data[:,0]
x2 = new_data[:,1]
x3 = new_data[:,2]

# linear int
x12 = np.multiply(x1,x2)
x13 = np.multiply(x1,x3)
x23 = np.multiply(x2,x3)

# squares
x1sq = x1**2
x2sq = x2**2
x3sq = x3**2

# cubes
x1cu = x1**3
x2cu = x2**3
x3cu = x3**3


#find all possible models
all_var_str = ['x1','x2','x3','x12','x13','x23','x1sq','x2sq','x3sq','x1cu','x2cu','x3cu']
all_var = [x1,x2,x3,x12,x13,x23,x1sq,x2sq,x3sq,x1cu,x2cu,x3cu]

# Enumerate every non-empty subset once, by index, so that models[k] and
# models_str[k] are built from the same combination and cannot drift apart.
models = []
subset_names = []
for size in range(1,len(all_var)+1):
    for combo in it.combinations(range(len(all_var)),size):
        models.append(tuple(all_var[j] for j in combo))
        subset_names.append(np.array([all_var_str[j] for j in combo]))

# `models` stays a plain list of tuples: the subsets have different lengths,
# and np.array() on such a ragged list raises ValueError on NumPy >= 1.24
# (it also tries to copy the large term arrays). Indexing and len() work the
# same on the list.
# `models_str` is kept as a NumPy array, but as an explicit 1-D object array
# filled element by element, which is valid for ragged content.
models_str = np.empty(len(subset_names),dtype=object)
for k, names in enumerate(subset_names):
    models_str[k] = names

### Fit All Models

all_params = []
summaries = []
max_resid = []
mads = []
# NOTE(review): `predict` keeps one full-length prediction vector per model;
# with every subset fitted this is presumably far more than fits in memory --
# consider dropping it or storing summary statistics only.
predict = []
ll = []
for i in tqdm(range(len(models)),desc='Done?'):
    # Design matrix: one column per term of this subset, plus a constant.
    pmodl = np.array(models[i]).T
    pmodl = sm.add_constant(pmodl)
    model = sm.OLS(massive['logAge'],pmodl).fit()
    summaries.append(model.summary())
    predictions = model.predict(pmodl)
    predict.append(predictions)
    residual = predictions - massive['logAge']
    all_params.append(np.asarray(model.params))
    max_resid.append(np.max(np.absolute(residual)))
    mads.append(dln.mad(residual))
    ll.append(model.llf)


Done?:   0%|          | 0/4095 [00:00<?, ?it/s]

In [15]:
# Number of candidate term subsets.
# NOTE(review): the recorded output (261156) does not match the 4095 models
# shown by the progress bar of the fitting cell -- presumably stale output
# from an earlier run; confirm on a fresh kernel.
len(models_str)

261156

In [16]:
# ### Fit All Models

# import statsmodels.api as sm 

# dat = np.array([]).T

# all_params = []
# max_resid = []
# mads = []
# ll = []
# for i in tqdm(range(len(models)),desc='Done?'):
#     pmodl = np.array(models[i]).T
#     pmodl = sm.add_constant(pmodl)
#     model = sm.OLS(massive['logAge'],pmodl).fit()
#     predictions = model.predict(pmodl)
#     residual = predictions - massive['logAge']
#     all_params.append(np.asarray(model.params))
#     max_resid.append(np.max(np.absolute(residual)))
#     mads.append(dln.mad(residual))
#     ll.append(model.llf)

Done?:   0%|          | 0/261156 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [29]:
# Term names of the model whose residuals have the smallest MAD.
models_str[np.argmin(mads)]

array(['x1', 'x2', 'x3', 'x0sq', 'x1sq', 'x3sq', 'x1cu', 'x2cu', 'x3cu'],
      dtype='<U4')

In [19]:
# Fit an OLS model of logAge on the linear, squared and cubed terms plus a
# constant, then display its summary.
# NOTE(review): x0, x0sq and x0cu are not defined in any cell visible in this
# notebook -- presumably left over from an earlier run with a fourth
# component; this cell will raise NameError on a fresh kernel until they are
# defined. The recorded summary also lists fewer coefficients than the 12
# columns built here, so the output looks stale -- confirm.
pmodl = np.array([x0,x1,x2,x3,x0sq,x1sq,x2sq,x3sq,x0cu,x1cu,x2cu,x3cu]).T
pmodl = sm.add_constant(pmodl)
model = sm.OLS(massive['logAge'],pmodl).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.961
Model:,OLS,Adj. R-squared:,0.961
Method:,Least Squares,F-statistic:,5968000.0
Date:,"Sat, 06 Aug 2022",Prob (F-statistic):,0.0
Time:,12:18:08,Log-Likelihood:,3861700.0
No. Observations:,2916369,AIC:,-7723000.0
Df Residuals:,2916356,BIC:,-7723000.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9612.4450,4.827,-1991.368,0.000,-9621.906,-9602.984
x1,-0.1682,0.000,-411.713,0.000,-0.169,-0.167
x2,7696.1086,3.968,1939.438,0.000,7688.331,7703.886
x3,-2.3302,0.001,-2056.261,0.000,-2.332,-2.328
x4,1.1072,0.000,5470.296,0.000,1.107,1.108
x5,-0.0375,0.000,-134.739,0.000,-0.038,-0.037
x6,-2051.4036,1.088,-1886.215,0.000,-2053.535,-2049.272
x7,0.0052,0.000,11.623,0.000,0.004,0.006
x8,0.0045,4.2e-05,107.325,0.000,0.004,0.005

0,1,2,3
Omnibus:,1051382.186,Durbin-Watson:,0.446
Prob(Omnibus):,0.0,Jarque-Bera (JB):,520043390.777
Skew:,0.2,Prob(JB):,0.0
Kurtosis:,68.418,Cond. No.,8890000.0


In [None]:
# Covariance matrix of the listed model terms (each row of the stacked array
# is one variable, which is np.cov's default layout).
# NOTE(review): x0sq is not defined in any cell visible here -- confirm where
# it comes from before running on a fresh kernel.
np.cov(np.array([x1,x2,x3,x0sq,x1sq,x3sq,x1cu,x2cu,x3cu]))

In [None]:
def str2eq(model_str,model_params):
    '''
    Create polynomial model using string and parameters included. Assuming the existence of a constant term
    and the following conventions
    
    xN: linear term
    xNsq: square term
    xNcu: cubic term
    
    Input:
    -----
        model_str:    array of length N
                      strings of model variables
                      
        model_params: array of length N+1
                      model parameters with model_params[0] as the constant term and every other as the 
                      corresponding value for each element of model_str
                      
    Output:
    ------
        eq:           callable
                      eq(variables) evaluates the polynomial, where variables maps each variable 
                      name (e.g. 'x1') to a number or array; returns
                      model_params[0] + sum_i model_params[i+1] * variables[name_i]**degree_i
                      
    Raises:
    ------
        ValueError:   if model_params does not have length N+1, or a term is not of the form
                      xN, xNsq or xNcu
    '''
    
    terms = [str(term) for term in model_str]
    coeffs = np.asarray(model_params,dtype=float)
    if coeffs.shape != (len(terms)+1,):
        raise ValueError('model_params must have length len(model_str)+1')
    
    # degree implied by the two-character suffix; anything else is a linear term
    suffix_deg = {'sq':2,'cu':3}
    
    var = []
    deg = []
    for term in terms:
        # the suffix is the LAST two characters of the individual term
        power = suffix_deg.get(term[-2:],1)
        name = term[:-2] if power > 1 else term
        if not (name.startswith('x') and name[1:].isdigit()):
            raise ValueError('unrecognized model term: {!r}'.format(term))
        var.append(name)
        deg.append(power)
    
    def eq(variables):
        total = coeffs[0]
        for coeff,name,power in zip(coeffs[1:],var,deg):
            total = total + coeff*np.asarray(variables[name],dtype=float)**power
        return total
    
    return eq

In [None]:
# Scratch check: the last two characters of a term name are its suffix.
'x1cu'[-2:]

01,02,03
12,13
23