## Plot Ideas
-------------

* QQ Normal Plot to see if Hailstone Sizes are normally distributed
    * If not normal, then find a distribution that does fit it (maybe something log normal?)
    * Generate plot that demonstrates this

* Histograms for some of the variables, especially Hailstone Sizes and maybe heatmaps with some other ones?

* For all duplicates, see how far apart the actual variables are; worth using three times as much information for little benefit?
    * Do this maybe with... stacked histograms / line plot / something else? 
    * Calculate mean of duplicate variables, is that a better indicator, or should use closest to mean variable?

* Correlation matrix for the data, make it real pretty like, consider whether we neeeeed all these variables or can PCA/SVM/LASSO to reduce dimensionality

* Scale data maybe? 

* Boxplots to see about spread and central tendency, maybe even two dimensional versions or facet grid

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
from math import floor
from matplotlib import colors
from fitter import Fitter, get_common_distributions, get_distributions

In [107]:
%matplotlib qt
rng = np.random.default_rng(100)

SMALL_SIZE = 14
MEDIUM_SIZE = 18
BIGGER_SIZE = 26
CHONK_SIZE = 32
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : SMALL_SIZE}
plt.rc('font', **font)
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE, facecolor="xkcd:light grey")
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=CHONK_SIZE, facecolor="xkcd:ice blue", edgecolor="xkcd:black") #  powder blue

# set the font globally
# plt.rcParams.update({'font.family':'DIN Condensed', "font"})
path = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df = pd.read_csv(path, index_col=0)
df.index += 1

In [66]:
# Fitter to find best distribution 
hail = df["Hailstone Size"].to_numpy()

distributions_to_check = ["gennorm", "dgamma", "dweibull", "cauchy"]
f = Fitter(hail, distributions=distributions_to_check)

f.fit()
print(f.summary())
print("\nWe will use the distribution with the lowest sum of squares error, the generalised normal distribution.")
print(f.get_best(method = "sumsquare_error"))

          sumsquare_error          aic            bic  kl_div
gennorm         22.303990  1061.334522 -208740.992995     inf
dgamma          27.932897  1090.954331 -202191.893056     inf
dweibull        38.602452   774.316469 -192777.084374     inf
cauchy          40.289411   833.865428 -191542.586505     inf

We will use the distribution with the lowest sum of squares error, the generalised normal distribution.
{'gennorm': {'beta': 0.47774409138777574, 'loc': 1.0, 'scale': 0.028076214758935215}}


In [67]:
# Plotting qq gennorm distribution for hailstone size
# SSE = 22.30399, which is the lowest sum of squares error of all the distributions tested.


# sumsquare_error	aic	bic	kl_div
# gennorm	22.303990	1061.334522	-208740.992995	inf
# dgamma	27.932897	1090.954331	-202191.893056	inf
# dweibull	38.602452	774.316469	-192777.084374	inf
# cauchy	40.289411	833.865428	-191542.586505	inf
# foldcauchy	40.686649	778.275503	-191246.778989	


plt.clf()

fig, ax = plt.subplots()
ss.probplot(df["Hailstone Size"], sparams=(0.47774409138777574, 1.0, 0.47774409138777574), dist='gennorm', fit=True, plot=ax, rvalue=False)
plt.show()


In [106]:
### Standard and Log Histograms of Hailstone Size
plt.clf()


fig, [ax1, ax2] = plt.subplots(ncols=2)
step = 0.25
breaks = np.arange(floor(df["Hailstone Size"].min() - step), df["Hailstone Size"].max() + step, step)
labs = np.arange(0, df["Hailstone Size"].max() + .5, .5)
sns.histplot(data=df, x="Hailstone Size", discrete=False, bins=breaks, ax=ax1)
ax1.set_xticks(labs)
ax1.set_xticklabels(labs)
ax1.set_title("Histogram of Hailstone Sizes")
ax1.set_xlabel("Hailstone Size in Inches")
ax1.set_ylabel("Frequency")
sns.histplot(data=df, x="Hailstone Size", discrete=False, bins=breaks, ax=ax2)
ax2.set_yscale("log")
ax2.set_xticks(labs)
ax2.set_xticklabels(labs)
ax2.set_ylim(bottom=0)
ax2.set_title("Histogram of Hailstone Sizes (Log Scale)")
ax2.set_xlabel("Hailstone Size in Inches")
ax2.set_ylabel("Log(Frequency)")



# sns.displot(data=df, x="Hailstone Size", kind="kde")
# ax = plt.gca()
# fig = plt.gcf()
# fig.patch.set_facecolor("xkcd:powder blue")
# ax.legend(["Flirst"])
# ax.set_title("First")
# plt.tight_layout()
# plt.show()

Invalid limit will be ignored.
  ax2.set_ylim(bottom=0)


Text(0, 0.5, 'Log(Frequency)')

In [104]:
### Corr plot for ten duplicates using pcolormesh
plt.clf()

### group plots by variable; for each variable in the dictionary, generate and display corrplot of various calculation methods for it 
fig, ax_lst = plt.subplots(nrows=5, ncols=2, figsize=(10,14))
fig.suptitle("Correlation Plots for 3 Methods of Calculating 10 Meteorological Metrics")
# fig.patch.set_facecolor("xkcd:light grey")

cnt = 0
for r, layer in enumerate(ax_lst):
    for c, ax in enumerate(layer):
        correlations = df.iloc[:, [cnt, cnt+10, cnt+20]].corr()
        axis_labels=correlations.columns.values.tolist()
        im = ax_lst[r, c].pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)
        ticks = [i+0.5 for i in range(len(axis_labels))]
        ax.set_xticks(ticks)
        ax.invert_yaxis()
        ax_lst[r, c].set_xticks(ticks)
        ax_lst[r, c].set_xticklabels(axis_labels)
        ax_lst[r, c].set_yticks(ticks)
        ax_lst[r, c].set_yticklabels(axis_labels)
        ax_lst[r, c].grid(which='minor', color='b', linestyle='-', linewidth=2)
        cnt+=1

shrink_amount = 1.065
fig.colorbar(im, ax=ax_lst[:, 0], shrink=shrink_amount) # options are pad, shrink, aspect
fig.colorbar(im, ax=ax_lst[:, 1], shrink=shrink_amount)

cb1, cb2 = fig.axes[-2], fig.axes[-1]


plt.subplots_adjust(
    top=0.905,
    bottom=0.085,
    left=0.14,
    right=0.825,
    hspace=0.6,
    wspace=0.62
)

plt.show()



In [58]:
### Corr plot overall

plt.clf()

desired = [i for i in range(20,46)] + [i for i in range(48, 54)]

fig, ax1 = plt.subplots()
df_corr = df.iloc[desired, desired].corr()

sns.heatmap(data=df_corr, vmin=-1, vmax=1, ax=ax1,  xticklabels=1, yticklabels=1)

plt.tight_layout()