In [None]:
import numpy as np
import h5py
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from scipy import signal
from glob import glob
from peakfinder import detect_peaks 
from math import floor, ceil
from functions import *
from itertools import combinations
from tqdm import tqdm
# Keep NumPy's default float printing (suppress=False), which allows
# scientific notation for very small values instead of forcing fixed point.
np.set_printoptions(suppress=False)

In [None]:
# Load the raw data and helper arrays used throughout the notebook.
# NOTE(review): load_soundings / create_features are presumably provided by
# `from functions import *` — confirm, and consider importing them explicitly.
wav, data, soundings, scaled_soundings, shift_std, alt = load_soundings()
# Compute the feature arrays that the sections below visualise one by one.
std, fourier, grad, coh, polyres, optc, mv_av_res, poly_coeffs = create_features(wav, soundings)
# Feature index -> feature name.
# NOTE(review): the feature matrix is presumably indexed as
# features[sounding][feature] with columns in this order — confirm in functions.py.
# Fix: entry 10 used to be spelled "polyfi_residual_0", which broke lookups by
# the name that matches its siblings polyfit_residual_1..3.
num_name = dict(enumerate([
    "std",
    "fourier_fit_residual",
    "fourier_dominant_amplitude",
    "fourier_dominant_period",
    "grad_300_400",
    "grad_350_400",
    "grad_300_350",
    "grad_300_500",
    "shift_std",
    "coherence_max_min",
    "polyfit_residual_0",
    "polyfit_residual_1",
    "polyfit_residual_2",
    "polyfit_residual_3",
    "optimum_counter",
    "moving_average_residual",
]))
# Reverse lookup: feature name -> feature index.
name_num = {name: index for index, name in num_name.items()}

In [None]:
# Grid of 28 consecutive vertical profiles, starting at profile index j.
j = 128
# NOTE(review): 127 selects one column of each data[...] array — presumably a
# wavelength index; confirm and name it in a config cell.
profile_column = 127

fig, axes = plt.subplots(7, 4, figsize=(15, 9))
# Use a figure-level title: calling plt.title() before any subplot exists
# creates an extra full-size axes that sits behind the whole grid.
fig.suptitle("Some vertical profiles")

for offset, ax in enumerate(axes.flat):
    ax.plot(data[j + offset][:, profile_column], alt[j + offset])
    ax.grid()

# Mean

In [None]:
# Mean over axis 1 of the soundings array, kept as a column vector (n, 1).
mean = soundings.mean(axis=1, keepdims=True)
plt.hist(mean, bins=100);

# Standard Deviation

In [None]:
# Distribution of the `std` feature array returned by create_features.
plt.hist(std, bins=100);

# Fourier features
Fourier transform the sounding, extract the largest amplitude (excluding the one at zero frequency),
filter the sounding so that only n_freq frequencies remain, and calculate the MSE between the filtered and original signals.

In [None]:
# One histogram per column of `fourier`, laid out side by side.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

panel_titles = (
    "Fourier fit residual",
    "Fourier dominant amplitude",
    "Fourier dominant period",
)
for column, (axis, panel_title) in enumerate(zip((ax1, ax2, ax3), panel_titles)):
    axis.hist(fourier[:, column], bins=100)
    axis.set_title(panel_title)

# Total change

In [None]:
# Histograms of the four gradient features.
# The names grad_300_400, grad_300_350, ... are not defined anywhere in the
# visible notebook; create_features only returns the combined `grad` array,
# so the histograms are drawn from its columns instead.
# NOTE(review): the column order is assumed to follow num_name (entries 4-7:
# grad_300_400, grad_350_400, grad_300_350, grad_300_500) — confirm in functions.py.
grad_columns = {
    "grad_300_400": 0,
    "grad_300_350": 2,
    "grad_350_400": 1,
    "grad_300_500": 3,
}

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 4))
for axis, (feature_name, column) in zip((ax1, ax2, ax3, ax4), grad_columns.items()):
    axis.hist(grad[:, column], bins=100)
    # Title each panel so the four distributions can be told apart.
    axis.set_title(feature_name)

# Shift uniformity
Did a sounding increase/decrease by a uniform amount at all wavelengths relative to the previous sounding?

In [None]:
# Histogram of the shift_std array returned by load_soundings.
shift_title = (
    "Histogram of the standard deviation of the change in \n"
    "transmission at every wavelength from one sounding to the next"
)
plt.hist(shift_std, bins=100)
plt.title(shift_title);

# Phase coherence analysis
https://www.jstor.org/stable/pdf/3546310.pdf?refreqid=excelsior%3Aab23964ed57763344ab0eab4b0cd4e16

In [None]:
k = 812  # index of the sounding examined in this section
# coh_fun[s - 1] holds the standard deviation of every s-th sample of sounding k,
# for strides s = 1 .. (number of samples - 1).
n_strides = soundings[k].shape[0] - 1
coh_fun = np.array(
    [soundings[k][::stride].std() for stride in range(1, n_strides + 1)],
    dtype=float,
)

In [None]:
# coh_fun[i] is the standard deviation of the strided sub-sample
# soundings[k][::i+1], so the x axis here is (stride - 1).
plt.plot(coh_fun);

In [None]:
# Dimensions of the soundings array; its first axis is what the feature
# loops below iterate over (soundings[i] is passed as one sounding).
soundings.shape

In [None]:
# Recompute the coherence feature: one phase_coherence value per sounding,
# stored as a column vector.
coh = np.zeros((len(soundings), 1))
for row, sounding in enumerate(soundings):
    coh[row] = phase_coherence(sounding)

In [None]:
# Distribution of the phase-coherence feature over all soundings.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(coh, bins=100)
# Trailing semicolon keeps the Text(...) repr out of the cell output,
# matching the other plotting cells in this notebook.
ax.set_title("Difference between minimum and maximum of coherence function");

# Polyfit residuals

In [None]:
# Degree-2 least-squares polynomial fit to one example sounding, overlaid on
# the sounding itself.
coeffs = np.polyfit(wav, soundings[800], 2)
fit = np.poly1d(coeffs)(wav)
# Label both curves so the fit can be told apart from the data.
plt.plot(wav, fit, label="polynomial fit (degree 2)")
plt.plot(wav, soundings[800], label="sounding 800")
# Trailing semicolon suppresses the Legend repr in the cell output.
plt.legend();

In [None]:
# polyfit_residual of every sounding for polynomial degrees 2 and 3,
# each stored as a column vector.
n_soundings = soundings.shape[0]
polyres2 = np.zeros((n_soundings, 1))
polyres3 = np.zeros((n_soundings, 1))
for row, sounding in enumerate(soundings):
    polyres2[row] = polyfit_residual(sounding, 2)
    polyres3[row] = polyfit_residual(sounding, 3)

In [None]:
# Side-by-side histograms of the degree-2 and degree-3 polynomial residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (ax1, polyres2, "Residual between original and polynomial approximation of degree 2"),
    (ax2, polyres3, "Residual between original and polynomial approximation of degree 3"),
)
for axis, residuals, panel_title in panels:
    axis.hist(residuals, bins=100)
    axis.set_title(panel_title)

# Optimum counter
Compress the array to smooth out noise and take its gradient, then count the gradient's x-axis crossings (i.e. the optima of the smoothed sounding) as a measure of wavelength.

In [None]:
# Walk through the optimum-counter idea on one example sounding:
# original -> compressed -> gradient of compressed -> count zero crossings.
example = soundings[800]
# Compute the compressed signal and its gradient once and reuse them, so the
# plotted curves and the printed count are guaranteed to describe the same data.
compressed = compress(example, 32)
compressed_gradient = np.gradient(compressed)

fig, (ax_original, ax_compressed, ax_gradient) = plt.subplots(1, 3, figsize=(15, 3))

ax_original.plot(example)
ax_original.set_title("Original")

# Previously this panel showed moving_average(soundings[800], 10) under the
# title "Compressed"; it now shows the compressed array whose gradient is
# plotted in the next panel.
ax_compressed.plot(compressed)
ax_compressed.set_title("Compressed")

ax_gradient.plot(compressed_gradient)
ax_gradient.grid()
ax_gradient.set_title("Gradient of compressed");

print("Number of x axis intersections: ", intersect_counter(compressed_gradient))

In [None]:
# Recompute the optimum-counter feature for every sounding (second argument 30),
# stored as a column vector.
optc = np.zeros((len(soundings), 1))
for row in range(len(optc)):
    optc[row] = optimum_counter(soundings[row], 30)

In [None]:
# Distribution of the optimum-counter feature over all soundings.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(optc, bins=100)
ax.set_title("Number of optima after taking moving average");

# Moving average
Use a moving average to smooth noise and calculate gradient.

In [None]:
# Overlay one sounding and its moving average (second argument 4).
# NOTE(review): the [1:-2] trim drops 3 samples, presumably so the raw curve
# lines up with a moving average that is shorter than its input — confirm
# against moving_average in functions.py.
plt.plot(soundings[809][1:-2], label="sounding 809 (trimmed)")
plt.plot(moving_average(soundings[809], 4), label="moving average (4)")
# Label the curves and end with a semicolon so the cell shows only the figure.
plt.legend();

In [None]:
# Compare one sounding with its moving averages for second arguments 4, 8 and 16.
av1 = moving_average(soundings[809], 4)
av2 = moving_average(soundings[809], 8)
av3 = moving_average(soundings[809], 16)

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 4))
# Title every panel: without titles the four curves cannot be told apart.
panels = (
    (ax1, soundings[809], "Original"),
    (ax2, av1, "Moving average (4)"),
    (ax3, av2, "Moving average (8)"),
    (ax4, av3, "Moving average (16)"),
)
# Ending the cell with a loop (rather than a bare ax.plot call) also keeps the
# Line2D repr out of the cell output.
for axis, curve, panel_title in panels:
    axis.plot(curve)
    axis.set_title(panel_title)

In [None]:
# Gradient of the av3 moving average; intersect_counter is applied to the same
# gradient to report the number of optima.
av3_gradient = np.gradient(av3)
plt.plot(av3_gradient)
plt.title("gradient")
plt.grid();
print("Number of optima: ", intersect_counter(av3_gradient))

In [None]:
# Recompute the moving-average residual feature for every sounding
# (second argument 30), stored as a column vector.
n_soundings = len(soundings)
mv_av_res = np.zeros((n_soundings, 1))
for row, sounding in enumerate(soundings):
    mv_av_res[row] = moving_average_residual(sounding, 30)

In [None]:
# Distribution of the moving-average residual feature over all soundings.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(mv_av_res, bins=100)
ax.set_title("Residual between original and moving average");

# Peak finder

In [None]:
# Run detect_peaks on the first sounding after smoothing it with moving_average;
# the wavelength axis is smoothed the same way and passed as xdata.
smoothed_sounding = moving_average(soundings[0], 10)
smoothed_wav = moving_average(wav, 10)
detect_peaks(
    smoothed_sounding,
    mph=soundings[0].mean(),
    mpd=20,
    show=True,
    xdata=smoothed_wav,
)

In [None]:
# Same detect_peaks call as for the peaks, but with valley=True.
valley_input = moving_average(soundings[0], 10)
valley_options = dict(
    mph=soundings[0].mean(),
    mpd=20,
    show=True,
    xdata=moving_average(wav, 10),
    valley=True,
)
detect_peaks(valley_input, **valley_options)

# Polynomial coefficients

In [None]:
# np.polyfit accepts a 2-D y and fits every column independently; passing
# soundings.T therefore fits all soundings in one call. Transposing the result
# gives one row of 5 coefficients (degree 4, highest power first) per sounding.
np.polyfit(wav, soundings.T, 4).T.shape

In [None]:
# NOTE(review): `features` is not assigned anywhere in the visible notebook;
# unless it is exported by `from functions import *`, this cell raises
# NameError on a fresh kernel — confirm where it is meant to come from.
scatter_features(features)

In [None]:
# Shape of the selection returned by retrieve_soundings.
# NOTE(review): the argument presumably reads [feature name, lower bound,
# upper bound] — confirm against retrieve_soundings in functions.py.
retrieve_soundings(["moving_average_residual", 0.1, 0.15]).shape

In [None]:
# Display the same selection that the previous cell measured the shape of.
# NOTE(review): presumably plots the soundings whose moving_average_residual
# lies between 0.1 and 0.15 — confirm against functions.py.
view_soundings(retrieve_soundings(["moving_average_residual", 0.1, 0.15]))