In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import scipy.optimize as optimize
import datetime

from functools import partial

plt.rcParams['font.size'] = 14

# Project:  writing and testing an algorithm to pick initial parameters for a fit

Recall that in notebook '06_02_SDSS_Specta' we fit a spectral line in the SDSS data with a model that included both a signal and a background part.

Recall also that we picked initial values for the fit parameters by looking at the plot and estimating values by eye, and that there was a question about how you might write an algorithm to guess the initial parameters automatically.

This is your chance to do that, and this notebook has some of the pieces you will need.

Some goals for this project might include:

1. Write a function to extract initial guesses for the fit parameters based on the raw data associated to a single peak.

2. Test the function out on a few different peaks, and estimate how close you have to be to the correct values of the parameters for the fitter to succeed.

3. Have your code automatically identify peaks within the spectrum, downselect the data around the peak, and fit each individually in sequence using the techniques you developed.

4. (optional) Include a "flag" in the output of your fitting routine stating whether a particular peak has a "good fit". Since some peaks have more complex structure (like the triplet in the H-alpha line), this flag might indicate when further modeling work is required. You'll probably need to bootstrap some uncertainties as part of this objective.

In any/all of these tasks, 100% fidelity is not required, but fidelity should be commented on in some way (what limits it, how can we improve it).

### Cell to read in the data and put it into two arrays

In [None]:
# Read the first four columns of the SDSS spectrum file.
# The path is handed straight to np.loadtxt so that numpy opens *and closes* the
# file itself; wrapping it in a bare open(...) left the file handle unclosed.
data = np.loadtxt("../data/sdss_galaxy.txt", usecols=range(4))

## This is how we pull out the data from columns in the array.

## The file gives wavelengths in Angstroms; let's use nanometers instead (1 Angstrom = 0.1 nm).
wavelength = data[:,0] / 10.
flux = data[:,1]
#best_fit = data[:,2]
#sky_flux = data[:,3]

### Function to select a sub-set of the data over a smaller set of wavelengths

In [None]:
def selectData(wavelength, flux, lamb_lo, lamb_hi):
    """Cut a wavelength window out of the spectrum.

    The window edges are the indices of the samples whose wavelength is
    closest to `lamb_lo` and `lamb_hi`. The slice is half-open, so the sample
    nearest to `lamb_hi` is itself not included.

    Returns
    -------
    (wavelength_window, flux_window) : the two arrays sliced over the same
    index range.
    """
    def _nearest(target):
        # Index of the sample whose wavelength is closest to `target`.
        return np.argmin(np.abs(wavelength - target))

    window = slice(_nearest(lamb_lo), _nearest(lamb_hi))
    return wavelength[window], flux[window]

### Functions used to define the model and the cost function

In [None]:
def Gauss(x, prefact, mu, sigma):
    """Scaled normal density evaluated at `x`.

    `prefact` multiplies a unit-area normal pdf with mean `mu` and standard
    deviation `sigma`, so it is the area under the returned curve rather than
    its peak height.
    """
    line_shape = stats.norm(loc=mu, scale=sigma)
    return prefact * line_shape.pdf(x)

def poly1(x, ref_lambda, offset, slope):
    """Straight-line background: `offset` at `ref_lambda`, changing by `slope` per unit of `x`."""
    distance_from_ref = x - ref_lambda
    return offset + distance_from_ref * slope

def model_func(x, ref_lambda, prefact, mu, sigma, offset, slope):
    """Full model: a Gaussian line sitting on top of a linear background.

    `prefact`, `mu`, `sigma` are forwarded to `Gauss`; `ref_lambda`, `offset`,
    `slope` are forwarded to `poly1`.
    """
    line = Gauss(x, prefact, mu, sigma)
    background = poly1(x, ref_lambda, offset, slope)
    return line + background

def generic_chi2(params, data_vals, model, x, ref_lambda):
    """Sum of squared residuals between the data and `model(x, ref_lambda, *params)`.

    Note that no per-point uncertainties enter here: despite the name, this is
    an unweighted least-squares cost.
    """
    residuals = data_vals - model(x, ref_lambda, *params)
    return np.sum(residuals**2)

def cost_func(data_vals, model, x, ref_lambda):
    """Build the one-argument cost function handed to the minimizer.

    Everything except the parameter vector is frozen into `generic_chi2`, so
    the returned callable takes only `params`.
    """
    frozen_args = dict(data_vals=data_vals, model=model, x=x, ref_lambda=ref_lambda)
    return partial(generic_chi2, **frozen_args)

### Function to overplot the model on the data

In [None]:
def plotModels(cutout_wl, cutout_flux, ref_lambda, init_pars):
    """Overlay the model evaluated at `init_pars` on the data cutout.

    `init_pars` is ordered (prefact, mu, sigma, offset, slope); the last two
    entries alone define the background curve that is drawn separately.
    """
    full_curve = model_func(cutout_wl, ref_lambda, *init_pars)
    offset, slope = init_pars[3], init_pars[4]
    background_curve = poly1(cutout_wl, ref_lambda, offset, slope)

    fig, ax = plt.subplots(figsize=(8, 5))

    # Draw order fixes both the line colors and the legend order.
    curves = (
        (cutout_flux, "data"),
        (background_curve, "background model"),
        (full_curve, "model"),
    )
    for y_vals, curve_label in curves:
        ax.plot(cutout_wl, y_vals, label=curve_label)

    ax.set_xlabel(r'$\lambda [nm]$')
    ax.set_ylabel(r'Flux [arcane units]')

    ax.legend(fontsize=10)
    fig.tight_layout()

    plt.show()

### Function to define the cost function, fit the data, and plot the result

In [None]:
def fitAndPlotResult(cutout_wl, cutout_flux, ref_lambda, init_pars):
    """Fit the line-plus-background model to a cutout, then print and plot the result.

    Parameters
    ----------
    cutout_wl, cutout_flux : wavelength and flux values of the region to fit.
    ref_lambda : reference wavelength of the linear background; the fitted
        offset is the background level at this wavelength.
    init_pars : starting values, ordered (prefact, mu, sigma, offset, slope).

    The function returns nothing; its output is the printed summary and the figure.
    """
    our_cost_func = cost_func(cutout_flux, model_func, cutout_wl, ref_lambda=ref_lambda)
    result = optimize.minimize(our_cost_func, x0=np.array(init_pars))
    fit_pars = result['x']
    model_fit = model_func(cutout_wl, ref_lambda, *fit_pars)
    background_fit = poly1(cutout_wl, ref_lambda, fit_pars[3], fit_pars[4])

    # Do not silently present a failed minimization as a best fit: a poor
    # starting point is exactly the situation this notebook is exploring.
    if not result.success:
        print(f"WARNING: the minimizer did not report convergence: {result.message}")

    # The offset is defined at ref_lambda, so the label must follow it rather
    # than hard-coding 500 nm. Right-justifying to 21 characters reproduces the
    # original alignment when ref_lambda is 500.
    background_label = f"Background at {ref_lambda:g} nm"

    print("Best Fit ---------")
    print(f"       Line Intensity : {fit_pars[0]:0.1f} [arcane units]")
    print(f"            Line Peak : {fit_pars[1]:0.4f} [nm]")
    print(f"           Line Width : {fit_pars[2]:0.4f} [nm]")
    print(f"{background_label:>21} : {fit_pars[3]:0.2f} [arcane units]")
    print(f"     Background slope : {fit_pars[4]:0.2f} [arcane_units / nm]")

    fig, ax = plt.subplots(figsize=(8, 5))

    ax.plot(cutout_wl, cutout_flux, label="data")
    ax.plot(cutout_wl, background_fit, label="background model")
    ax.plot(cutout_wl, model_fit, label="full model")

    ax.set_xlabel(r"$\lambda [nm]$")
    ax.set_ylabel("Flux [arcane units]")

    ax.legend(fontsize=10)
    fig.tight_layout()

    plt.show()

# Let's have a look at the complete spectrum

In [None]:
# Overview figure: the whole spectrum, before any cutout is taken.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(wavelength, flux)

ax.set(
    xlabel=r"$\lambda [nm]$",
    ylabel="Flux [arcane units]",
    title="A spectrum from SDSS",
)

fig.tight_layout()
plt.show()

# Now let's select a cutout of the data and have a look at that

In [None]:
# Restrict the spectrum to the 480-515 nm window around the line of interest.
cutout_wl, cutout_flux = selectData(wavelength, flux, 480, 515)

# Plot just that window.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cutout_wl, cutout_flux)
ax.set(xlabel=r"$\lambda [nm]$", ylabel="Flux [arcane units]")
fig.tight_layout()
plt.show()

# Setting the reference wavelength (i.e., the reference point for the background line)

Because we want to be able to look at other peaks as well, and because we learned that when fitting a line it is smart to pick a reference x-axis value in the middle of the data, I've changed the model function a tiny bit to 
let you set the reference point. If you look at the cell that defines the model, you can see that the polynomial is 
`offset + (x-ref_lambda)*slope` instead of `offset + (x-500)*slope` as it was in the original notebook.



In [None]:
# Reference wavelength in nm. poly1 evaluates the background as
# offset + (x - ref_lambda)*slope, so `offset` is the background level at this wavelength.
ref_lambda = 500

# Setting the initial guess parameters

Here we're just making some manual guesses. One of the main objectives of this project is to write an algorithm that determines these initial guesses based solely on the raw data shown above for a single peak.

In [None]:
# Manual starting guesses for the five fit parameters.
prefact_0 = 235.  # factor multiplying the unit-area Gaussian pdf in Gauss()
mu_0 = 505.8  # line center [nm]
sigma_0 = 0.25  # Gaussian standard deviation [nm]
offset_0 = 60  # background level at ref_lambda
slope_0 = 0  # background slope per nm

# The order must match the arguments model_func takes after ref_lambda:
# (prefact, mu, sigma, offset, slope).
init_pars = (prefact_0, mu_0, sigma_0, offset_0, slope_0)

# Plotting the initial guess model and the data

In [None]:
# Overlay the model evaluated at the manual guesses on the cutout, before any fitting is done.
plotModels(cutout_wl, cutout_flux, ref_lambda, init_pars)

# Fitting the data and replotting the result

In [None]:
# Minimize the cost function starting from init_pars, then print and plot the fitted model.
fitAndPlotResult(cutout_wl, cutout_flux, ref_lambda, init_pars)

# A function that might be useful: `np.argmax`

In [None]:
# np.argmax gives the position of the largest entry; indexing with it gives the value there.
peak_idx = np.argmax(cutout_flux)
print("     The index of the largest bin is : ", peak_idx)
print("The flux value in the largest bin is : ", cutout_flux[peak_idx])