## Plot Ideas
-------------

* QQ Normal Plot to see if Hailstone Sizes are normally distributed
    * If not normal, then find a distribution that does fit it (perhaps log-normal?)
    * Generate plot that demonstrates this

* Histograms for some of the variables, especially Hailstone Sizes and maybe heatmaps with some other ones?

* For all duplicates, see how far apart the actual variables are; is it worth using three times as much information for little benefit?
    * Do this maybe with... stacked histograms / line plot / something else? 
    * Calculate the mean of each set of duplicate variables; is that a better indicator, or should we use the variable closest to the mean?

* Correlation matrix for the data, styled for presentation; consider whether we need all these variables or can use PCA/SVM/LASSO to reduce dimensionality

* Scale data maybe? 

* Boxplots to see about spread and central tendency, maybe even two dimensional versions or facet grid

In [94]:
import os
from math import floor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
import statsmodels.api as sm
from fitter import Fitter, get_common_distributions, get_distributions
from matplotlib import colors

In [56]:
%matplotlib qt
rng = np.random.default_rng(100)
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : "20"}

path = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df = pd.read_csv(path, index_col=0)
df.index += 1

In [93]:
# Candidate distribution for Hail Size: draw a Rayleigh pdf as a reference
# shape (part of the search for the right distribution).

# This cell draws on the current pyplot figure, so clear it first to avoid
# stacking curves when the cell is re-run.
plt.clf()

samples = df["Hailstone Size"].shape[0]

# Standard-normal draw with one value per observation. It is not used by the
# plot below; kept so the generator state and the `normal` name are unchanged.
normal = rng.standard_normal(size=samples)

# NOTE(review): scipy's rayleigh takes no shape parameter, so the value that
# was passed positionally here is interpreted as `loc`. It is spelled out as
# a keyword to make that visible; if a scale was intended, use `scale=s`.
s=1
x = np.linspace(ss.rayleigh.ppf(0.01, loc=s),
                ss.rayleigh.ppf(0.99, loc=s), 1000)
plt.plot(x, ss.rayleigh.pdf(x, loc=s),
       'r-', lw=5, alpha=0.6, label='rayleigh pdf')

# The label above is only visible once a legend is drawn.
plt.legend()
plt.title("Rayleigh PDF (reference shape)")
plt.xlabel("x")
plt.ylabel("Probability density")

plt.show()


In [102]:
# Fit a handful of candidate distributions to the hail sizes and rank them.

# Alternative candidate list; defined here but not passed to the Fitter below.
distrs = ['gamma','lognorm', "beta", "burr", "norm"]

hail = df["Hailstone Size"].to_numpy()

f = Fitter(
    hail,
    distributions=["gennorm", "dgamma", "dweibull", "cauchy"],
)
f.fit()

# NOTE(review): rows reported as `inf` in the summary may be fits that timed
# out or failed -- confirm against the Fitter log before trusting the ranking.
f.summary()

# f.get_best(method = "sumsquare_error")





Unnamed: 0,sumsquare_error,aic,bic,kl_div
cauchy,40.289411,833.865428,-191542.6,inf
dweibull,inf,inf,inf,inf
gennorm,inf,inf,inf,inf
dgamma,inf,inf,inf,inf


In [None]:

# distribution	sumsquare_error	aic	bic	kl_div
# gennorm	22.303990	1061.334522	-208740.992995	inf
# dgamma	27.932897	1090.954331	-202191.893056	inf
# dweibull	38.602452	774.316469	-192777.084374	inf
# cauchy	40.289411	833.865428	-191542.586505	inf
# foldcauchy	40.686649	778.275503	-191246.778989	

In [59]:
### Histogram and normal Q-Q plot for Hail Size

# Bin edges every `step`, from the floor of the smallest value up to (at
# least) the largest value.
step = 0.25
sizes = df["Hailstone Size"]
breaks = list(np.arange(floor(sizes.min()), sizes.max() + step, step))

# Standard-normal draw with one value per observation. It is not used by the
# plots below; kept so the generator state and the `normal` name are unchanged.
normal = rng.standard_normal(size=sizes.shape[0])

fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)
fig.patch.set_facecolor("xkcd:powder blue")

# `discrete=True` is deliberately not passed: seaborn then uses unit-width
# bins and ignores the explicit `bins` edges computed above.
sns.histplot(data=df, x="Hailstone Size", bins=breaks, ax=ax1)

# line="s" overlays statsmodels' standardized reference line.
sm.qqplot(data=sizes, line="s", ax=ax2)

ax1.legend(["Observed count"])
ax2.legend(["Sample quantiles", "Standardized reference line"])
ax1.set_title("Hailstone Size Histogram")
ax2.set_title("Normal Q-Q Plot of Hailstone Size")
plt.tight_layout()
plt.show()

In [53]:
### Corr plot for ten duplicates using pcolormesh
# One 3x3 correlation heatmap per metric, laid out on a 5x2 grid. Panel k
# correlates the columns at positions k, k+10 and k+20 of `df`.
# NOTE(review): this relies on the first 30 columns being three consecutive
# groups of the same ten metrics -- confirm against the CSV layout.

# No plt.clf() here: plt.subplots() creates its own figure, and clearing first
# would blank the previously drawn figure (or open an empty window).
plt.rc("font", **font)

fig, ax_lst = plt.subplots(nrows=5, ncols=2, figsize=(10,14))
fig.suptitle("Correlation Plots for 3 Methods of Calculating 10 Meteorological Metrics", size=20)
fig.patch.set_facecolor("xkcd:light grey")

# Row-major walk over the grid, so `cnt` is both the panel number and the
# position of the first column of that panel's triple.
for cnt, ax in enumerate(ax_lst.flat):
    correlations = df.iloc[:, [cnt, cnt+10, cnt+20]].corr()
    axis_labels = correlations.columns.values.tolist()

    # Every panel shares the same norm and colormap, so a single mesh can
    # serve as the mappable for the colorbars created below.
    im = ax.pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)

    # Cell k spans [k, k+1], so its centre is at k + 0.5.
    ticks = [i+0.5 for i in range(len(axis_labels))]
    ax.invert_yaxis()  # first variable at the top, like a printed matrix
    ax.set_xticks(ticks)
    ax.set_xticklabels(axis_labels, fontsize=14)
    ax.set_yticks(ticks)
    ax.set_yticklabels(axis_labels, fontsize=14)

# One colorbar per grid column. Styling goes through the returned Colorbar
# instead of assuming the colorbar axes are the last entries of fig.axes.
shrink_amount = 1.065
for col in range(ax_lst.shape[1]):
    cbar = fig.colorbar(im, ax=ax_lst[:, col], shrink=shrink_amount) # options are pad, shrink, aspect
    cbar.ax.tick_params(labelsize=14)

plt.subplots_adjust(
    top=0.905,
    bottom=0.085,
    left=0.14,
    right=0.825,
    hspace=0.6,
    wspace=0.62
)

plt.show()

In [58]:
### Corr plot overall

# No plt.clf() here: plt.subplots() creates its own figure, and clearing first
# would blank the previously drawn figure (or open an empty window).

# Positional indices of the columns to include in the correlation matrix.
desired = list(range(20, 46)) + list(range(48, 54))

fig, ax1 = plt.subplots()

# Select columns only. Indexing rows with the same list would restrict the
# correlation to those 32 row positions instead of using every observation.
df_corr = df.iloc[:, desired].corr()

# xticklabels=1 / yticklabels=1 label every variable rather than letting
# seaborn thin the labels out.
sns.heatmap(data=df_corr, vmin=-1, vmax=1, ax=ax1, xticklabels=1, yticklabels=1)
ax1.set_title("Correlation Matrix of Selected Variables")

plt.tight_layout()