## Plot Ideas
-------------

* QQ Normal Plot to see if Hailstone Sizes are normally distributed
    * If not normal, then find a distribution that does fit it (maybe something log normal?)
    * Generate plot that demonstrates this

* Histograms for some of the variables, especially Hailstone Sizes and maybe heatmaps with some other ones?

* For all duplicates, see how far apart the actual variables are; worth using three times as much information for little benefit?
    * Do this maybe with... stacked histograms / line plot / something else? 
    * Calculate mean of duplicate variables, is that a better indicator, or should use closest to mean variable?

* Correlation matrix for the data, make it real pretty like, consider whether we neeeeed all these variables or can PCA/SVM/LASSO to reduce dimensionality

* Scale data maybe? 

* Boxplots to see about spread and central tendency, maybe even two dimensional versions or facet grid

In [50]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
from math import floor
from matplotlib import colors
from fitter import Fitter, get_common_distributions, get_distributions

In [75]:
%matplotlib qt
rng = np.random.default_rng(100)

SMALL_SIZE = 14
MEDIUM_SIZE = 18
BIGGER_SIZE = 26
CHONK_SIZE = 32
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : SMALL_SIZE}
plt.rc('font', **font)
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE, facecolor="xkcd:white")
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=CHONK_SIZE, facecolor="xkcd:white", edgecolor="xkcd:black") #  powder blue

# set the font globally
# plt.rcParams.update({'font.family':'DIN Condensed', "font"})
path = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df_full = pd.read_csv(path)

df = df_full.copy(deep=True)


drop_cols = [0] + [x for x in range(11, 31)] + [47, 49]
df = df.drop(df.columns[drop_cols], axis=1)

df.columns = ['CAPE', 'CIN', 'LCL', 'LFC', 'EL', 'LI', 'HGHT0C',
            'CAP', 'B3KM', 'BRN', 'SHEAR 0-1 KM', 'SHEAR 0-6 KM',
            'EFF INFLOW', 'EBWD', 'SRH 0-1 KM', 'SRH 0-3 KM', 'EFF SRH', 'SCP',
            'STP-FIXED', 'STP-MIXED', 'SHIP', 'PWAT', 'DCAPE', 'MLMR', 'LRAT',
            'TEI', 'TLCL', 'T500', 'SWEAT', 'K-INDEX', 'CRAV', 'HAIL SIZE IN']

df

Unnamed: 0,CAPE,CIN,LCL,LFC,EL,LI,HGHT0C,CAP,B3KM,BRN,...,DCAPE,MLMR,LRAT,TEI,TLCL,T500,SWEAT,K-INDEX,CRAV,HAIL SIZE IN
1,565.886137,-2.456216,591.712340,760.740300,10016.261419,-2.475117,3057.724668,14.031755,164.527269,23.507197,...,455.116074,11.200228,6.069679,14.624760,14.650496,-15.9184,237.167161,27.148344,11784.929088,1.25
2,93.557330,-61.118000,818.659297,1485.730600,4147.988929,1.094013,2878.872717,9.819021,43.581811,2.961764,...,310.547370,9.709781,5.573791,11.207180,11.829009,-15.5371,196.419861,27.202330,1995.924762,1.00
3,416.713894,-0.701233,682.113493,751.489413,7419.731564,-2.174859,3043.083673,14.935360,145.276881,15.405312,...,625.998000,10.208610,6.338184,24.258178,13.206279,-16.9460,195.164206,9.331257,8136.811509,1.00
4,1110.622796,-12.420499,536.926037,989.547800,11364.753475,-4.154931,3532.140768,18.580863,172.484246,22.193236,...,343.228844,12.461809,6.614233,19.432405,16.262729,-15.3177,250.757864,21.326550,31959.336376,1.50
5,1107.162497,-12.514324,536.912773,1008.662600,11386.082876,-4.102513,3583.432806,18.936401,181.416062,23.363115,...,403.770627,12.579437,6.578394,21.032620,16.380479,-15.2050,264.888229,20.604840,32653.287157,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29098,0.000000,0.000000,2343.569759,1107.279086,2343.569759,12.494209,3528.145294,11.552847,0.000000,0.000000,...,43.767507,3.629146,7.248768,25.792261,-5.193819,-15.9886,296.701531,17.164210,0.000000,1.25
29099,0.000000,0.000000,2326.323051,1107.279086,2326.323051,14.986276,3497.386732,11.339111,0.000000,0.000000,...,50.793697,3.101795,7.258029,27.763669,-7.145827,-16.1519,303.557106,19.452570,0.000000,1.00
29100,0.000000,0.000000,2690.384769,2630.432840,2690.384769,14.638317,3482.939445,16.114445,0.000000,0.000000,...,61.023765,2.921537,7.280055,25.590055,-9.417577,-16.2501,232.337003,11.883790,0.000000,1.00
29101,0.000000,0.000000,2807.261593,2441.488395,2807.261593,16.123160,3451.467575,12.735316,0.000000,0.000000,...,74.905532,2.476587,7.248819,25.573522,-11.976938,-16.5189,161.945441,6.005967,0.000000,0.88


In [52]:
# Fitter to find best distribution 
hail = df_full["HAIL SIZE IN"].to_numpy()

distributions_to_check = ["gennorm", "dgamma", "dweibull", "cauchy"]
f = Fitter(hail, distributions=distributions_to_check)

f.fit()
print(f.summary())
print("\nWe will use the distribution with the lowest sum of squares error, the generalised normal distribution.")
print(f.get_best(method = "sumsquare_error"))

          sumsquare_error          aic            bic  kl_div
gennorm         22.303990  1061.334522 -208740.992995     inf
dgamma          27.932897  1090.954331 -202191.893056     inf
dweibull        38.602452   774.316469 -192777.084374     inf
cauchy          40.289411   833.865428 -191542.586505     inf

We will use the distribution with the lowest sum of squares error, the generalised normal distribution.
{'gennorm': {'beta': 0.47774409138777574, 'loc': 1.0, 'scale': 0.028076214758935215}}


In [53]:
# Plotting qq gennorm distribution for HAIL SIZE IN
# SSE = 22.30399, which is the lowest sum of squares error of all the distributions tested.


# sumsquare_error	aic	bic	kl_div
# gennorm	22.303990	1061.334522	-208740.992995	inf
# dgamma	27.932897	1090.954331	-202191.893056	inf
# dweibull	38.602452	774.316469	-192777.084374	inf
# cauchy	40.289411	833.865428	-191542.586505	inf
# foldcauchy	40.686649	778.275503	-191246.778989	


fig, ax = plt.subplots()
ss.probplot(df["HAIL SIZE IN"], sparams=(0.47774409138777574, 1.0, 0.47774409138777574), dist='gennorm', fit=True, plot=ax, rvalue=False)
plt.show()


In [54]:
### Standard and Log Histograms of HAIL SIZE IN
# plt.clf()

fig, [ax1, ax2] = plt.subplots(ncols=2)
step = 0.25
breaks = np.arange(floor(df["HAIL SIZE IN"].min() - step), df["HAIL SIZE IN"].max() + step, step)
labs = np.arange(0, df["HAIL SIZE IN"].max() + .5, .5)

sns.histplot(data=df, x="HAIL SIZE IN", discrete=False, bins=breaks, ax=ax1)
ax1.set_xticks(labs)
ax1.set_xticklabels(labs)
ax1.set_title("Histogram of Hailstone Sizes")
ax1.set_xlabel("HAIL SIZE IN")
ax1.set_ylabel("Frequency")

sns.histplot(data=df, x="HAIL SIZE IN", discrete=False, bins=breaks, ax=ax2)
ax2.set_yscale("log")
ax2.set_xticks(labs)
ax2.set_xticklabels(labs)
ax2.set_ylim(bottom=0)
ax2.set_title("Histogram of Hailstone Sizes (Log Scale)")
ax2.set_xlabel("HAIL SIZE IN")
ax2.set_ylabel("Log(Frequency)")

Invalid limit will be ignored.
  ax2.set_ylim(bottom=0)


Text(0, 0.5, 'Log(Frequency)')

In [62]:
### Corr plot for ten duplicates using pcolormesh
# plt.clf()

### group plots by variable; for each variable in the dictionary, generate and display corrplot of various calculation methods for it 
fig, ax_lst = plt.subplots(nrows=5, ncols=2, figsize=(10,14))
fig.suptitle("Pairwise Correlations of 3 Methods for Calculating Meteorological Parameters")
# fig.patch.set_facecolor("xkcd:light grey")

cnt = 0
for r, layer in enumerate(ax_lst):
    for c, ax in enumerate(layer):
        correlations = df_full.iloc[:, [cnt, cnt+10, cnt+20]].corr()
        axis_labels=correlations.columns.values.tolist()
        im = ax_lst[r, c].pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)
        ticks = [i+0.5 for i in range(len(axis_labels))]
        ax.set_xticks(ticks)
        ax.invert_yaxis()
        ax_lst[r, c].set_xticks(ticks)
        ax_lst[r, c].set_xticklabels(axis_labels)
        ax_lst[r, c].set_yticks(ticks)
        ax_lst[r, c].set_yticklabels(axis_labels)
        ax_lst[r, c].grid(which='minor', color='b', linestyle='-', linewidth=2)
        cnt+=1

shrink_amount = 1.065
fig.colorbar(im, ax=ax_lst[:, 0], shrink=shrink_amount) # options are pad, shrink, aspect
fig.colorbar(im, ax=ax_lst[:, 1], shrink=shrink_amount)

cb1, cb2 = fig.axes[-2], fig.axes[-1]


plt.subplots_adjust(
    top=0.905,
    bottom=0.085,
    left=0.14,
    right=0.825,
    hspace=0.6,
    wspace=0.62
)

plt.show()

In [72]:
### Corr plot for ten duplicates using pcolormesh vert FAILED
# plt.clf()

### group plots by variable; for each variable in the dictionary, generate and display corrplot of various calculation methods for it 
fig, ax_lst = plt.subplots(nrows=10, figsize=(6,14))
fig.suptitle("Pairwise Correlations of 3 Methods for Calculating Meteorological Parameters")
# fig.patch.set_facecolor("xkcd:light grey")

cnt = 0
for r, ax in enumerate(ax_lst):
    # for c, ax in enumerate(layer):
    correlations = df.iloc[:, [cnt, cnt+10, cnt+20]].corr()
    axis_labels=correlations.columns.values.tolist()
    im = ax_lst[r].pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)
    ticks = [i+0.5 for i in range(len(axis_labels))]
    ax.set_xticks(ticks)
    ax.invert_yaxis()
    ax_lst[r].set_xticks(ticks)
    ax_lst[r].set_xticklabels(axis_labels)
    ax_lst[r].set_yticks(ticks)
    ax_lst[r].set_yticklabels(axis_labels)
    ax_lst[r].grid(which='minor', color='b', linestyle='-', linewidth=2)
    cnt+=1

shrink_amount = 1.065
fig.colorbar(im, ax=ax_lst[:], shrink=shrink_amount) # options are pad, shrink, aspect
# fig.colorbar(im, ax=ax_lst[:, 1], shrink=shrink_amount)

cb1 = fig.axes[-1]


plt.subplots_adjust(
    top=0.88,
bottom=0.11,
left=0.155,
right=0.725,
hspace=0.415,
wspace=0.13
)

# plt.savefig("/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/reports/img/plots/corr_plots.png")

In [76]:
### CAPE vs Shear Scatter Plot

fig, ax = plt.subplots()
sns.scatterplot(data=df, x="CAPE", y="SHEAR 0-6 KM", ax=ax)
ax.set_xlabel("CAPE")
ax.set_ylabel("SHEAR 0-6 KM")

plt.show()

In [77]:
### Corr plot overall


fig, ax1 = plt.subplots()
df_corr = df.corr()

sns.heatmap(data=df_corr, vmin=-1, vmax=1, ax=ax1,  xticklabels=1, yticklabels=1)

plt.tight_layout()

In [78]:
### SHIP Plots

sep = 2

under_2in = df[df["HAIL SIZE IN"] <= sep]["HAIL SIZE IN"]
over_2in = df[df["HAIL SIZE IN"] > sep]["HAIL SIZE IN"]

fig, [ax1, ax2] = plt.subplots(ncols=2)

### ax1 ### 
[ax1.spines[x].set_visible(False) for x in ["top", "right", "left"]] # remove top, left, bottom axis border
ax1.yaxis.set_ticks_position("none") # remove y tick marks

dataset = [under_2in, over_2in]
labs = ["Under 2\"", "Over 2\""]
ax1.boxplot(dataset, labels = labs)

ax1.set_title("Using SHIP To Delineate Likely Hail Sizes")
ax1.set_ylabel("SHIP")

## ax2 ###
# plot matplotlib heatmap of HAIL SIZE IN vs SHIP on ax2
bin_size = 0.5
bins = np.arange(0, 6 + bin_size, bin_size)
freq_matrix_attrs = np.histogram2d(df["HAIL SIZE IN"], df["SHIP"], density=False, bins=bins)
freq_matrix_rot = freq_matrix_attrs[0] + 0.00001
freq_matrix = np.rot90(freq_matrix_rot).astype(float)
im = ax2.pcolormesh(freq_matrix, norm=colors.LogNorm(), cmap="magma", edgecolor="black", linewidth=0.5) # colors.Normalize(0, 1)
ax2.set_title("HAIL SIZE IN vs SHIP")
ax2.set_xlabel("HAIL SIZE IN")
ax2.set_ylabel("SHIP")
ax2.set_xticks(bins)
ax2.set_yticks(bins)
fig.colorbar(im, ax=ax2, shrink=1) # options are pad, shrink, aspect

plt.show()

In [59]:
# Testing

fig, ax = plt.subplots()
data = [[0.5, 2.5], [1.5, 1.5]]
bin_size = 1
bins = np.arange(0, 3 + bin_size, bin_size)
freq_matrix = np.histogram2d(data[0], data[1], bins=bins, range=(0, 3), density=True)[0]
freq_matrix = np.rot90(freq_matrix)
im = ax.pcolormesh(freq_matrix, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)
ax.set_title("Testing")
ax.set_xlabel("Changing Values")
ax.set_ylabel("All 1's")
ax.set_xticks(bins)
ax.set_yticks(bins)
fig.colorbar(im, ax=ax, shrink=1) # options are pad, shrink, aspect
for y in range(freq_matrix.shape[0]):
    for x in range(freq_matrix.shape[1]):
        plt.text(x + 0.5, y + 0.5, '%.4f' % freq_matrix[y, x],
                 horizontalalignment='center',
                 verticalalignment='center',
                 color='white',
                 )


plt.show()

