## Plot Ideas
-------------

* QQ Normal Plot to see if Hailstone Sizes are normally distributed
    * If not normal, then find a distribution that does fit it (maybe something log normal?)
    * Generate plot that demonstrates this

* Histograms for some of the variables, especially Hailstone Sizes and maybe heatmaps with some other ones?

* For each set of duplicated variables, see how far apart the values actually are; is it worth using three times as much information for little benefit?
    * Do this maybe with... stacked histograms / line plot / something else? 
    * Calculate the mean of the duplicate variables: is that a better indicator, or should we use the variable closest to the mean?

* Correlation matrix for the data, make it look really nice, and consider whether we need all these variables or can use PCA/SVM/LASSO to reduce dimensionality

* Scale data maybe? 

* Boxplots to see about spread and central tendency, maybe even two dimensional versions or facet grid

In [1]:
import os
from math import floor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib import colors



In [2]:
%matplotlib qt
rng = np.random.default_rng(100)
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : "20"}

In [None]:
# Location of the dataset. Set the HAIL_DATA_CSV environment variable to point
# at a local copy of the CSV; the original absolute path is kept as the
# fallback so existing setups keep working unchanged.
path = os.environ.get(
    "HAIL_DATA_CSV",
    "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv",
)

# The first CSV column is used as the row index.
df = pd.read_csv(path, index_col=0)
# Shift every index label up by one (presumably 0-based -> 1-based row labels;
# confirm against the CSV).
df.index += 1

# Quick look: pairwise correlations between the columns at positions 0, 10 and 20.
df.iloc[:, [0, 10, 20]].corr()

In [None]:
### Histogram and normal Q-Q plot for Hail Size

hail = df["Hailstone Size"]

# Histogram bin width, in the same units as the "Hailstone Size" column.
step = 0.25
# np.arange excludes its stop value, so pad the maximum by one step to make
# sure the largest observation falls inside the last bin.
breaks = np.arange(floor(hail.min()), hail.max() + step, step)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)
fig.patch.set_facecolor("xkcd:powder blue")

# Left panel: histogram on the explicit bin edges. `discrete=True` is NOT
# passed here: it makes seaborn use unit-width bins centred on the data points
# and ignore `bins`, which would defeat the 0.25-wide bins defined above.
sns.histplot(data=df, x="Hailstone Size", bins=breaks, ax=ax1)
ax1.legend(["Hailstone Size"])
ax1.set_title("Distribution of Hailstone Size")

# Right panel: sample quantiles against standard-normal theoretical quantiles.
# The sample is not standardized, so a 45-degree line is not a meaningful
# reference; line="s" draws the line scaled by the sample standard deviation
# and shifted by the sample mean instead.
sm.qqplot(data=hail, line="s", ax=ax2)
ax2.legend(["Sample quantiles", "Standardized reference line"])
ax2.set_title("Normal Q-Q Plot of Hailstone Size")

plt.tight_layout()
plt.show()

In [34]:
### Corr plot for ten duplicates using imshow / matshow

# Close figures left over from earlier cells. plt.clf() would only blank the
# current figure (or create an empty one on a fresh kernel), leaving a stray
# empty window next to the new plot.
plt.close("all")
plt.rc("font", **font)

### One panel per metric: panel k shows the correlations between the columns at
### positions k, k + 10 and k + 20 (presumably the three calculation methods
### for the same metric; confirm against the column layout of the CSV).
fig, ax_lst = plt.subplots(nrows=5, ncols=2, figsize=(10,14))
fig.suptitle("Correlation Plots for 3 Methods of Calculating 10 Meteorological Metrics", size=20)
fig.patch.set_facecolor("xkcd:light grey")

method_offset = 10   # column distance between two versions of the same metric
tick_fontsize = 14

# ax_lst.flat walks the grid row by row, the same order as a nested
# row/column loop, so panel k sits at row k // 2, column k % 2.
for cnt, ax in enumerate(ax_lst.flat):
    correlations = df.iloc[:, [cnt, cnt + method_offset, cnt + 2 * method_offset]].corr()
    axis_labels = correlations.columns.tolist()
    # Every panel uses the same colormap and the same fixed 0..1 normalization,
    # so any one of the returned images can drive the shared colorbars below.
    im = ax.matshow(correlations, cmap="gist_heat", norm=colors.Normalize(0, 1), aspect=0.3)
    # matshow moves the x tick labels to the top; put them back at the bottom.
    ax.xaxis.tick_bottom()
    positions = range(len(axis_labels))
    ax.set_xticks(positions)
    ax.set_xticklabels(axis_labels, fontsize=tick_fontsize)
    ax.set_yticks(positions)
    ax.set_yticklabels(axis_labels, fontsize=tick_fontsize)

# One colorbar per column of panels. Keep the returned Colorbar object instead
# of looking its axes up by position in fig.axes.
shrink_amount = .98
for col in range(ax_lst.shape[1]):
    cbar = fig.colorbar(im, ax=ax_lst[:, col], shrink=shrink_amount) # options are pad, shrink, aspect
    cbar.ax.tick_params(labelsize=tick_fontsize)

plt.show()

In [53]:
### Corr plot for ten duplicates using pcolormesh

# Close figures left over from earlier cells. plt.clf() would only blank the
# current figure (or create an empty one on a fresh kernel), leaving a stray
# empty window next to the new plot.
plt.close("all")
plt.rc("font", **font)

### One panel per metric: panel k shows the correlations between the columns at
### positions k, k + 10 and k + 20 (presumably the three calculation methods
### for the same metric; confirm against the column layout of the CSV).
fig, ax_lst = plt.subplots(nrows=5, ncols=2, figsize=(10,14))
fig.suptitle("Correlation Plots for 3 Methods of Calculating 10 Meteorological Metrics", size=20)
fig.patch.set_facecolor("xkcd:light grey")

method_offset = 10   # column distance between two versions of the same metric
tick_fontsize = 14

# ax_lst.flat walks the grid row by row, the same order as a nested
# row/column loop, so panel k sits at row k // 2, column k % 2.
for cnt, ax in enumerate(ax_lst.flat):
    correlations = df.iloc[:, [cnt, cnt + method_offset, cnt + 2 * method_offset]].corr()
    axis_labels = correlations.columns.tolist()
    # Every panel uses the same colormap and the same fixed 0..1 normalization,
    # so any one of the returned meshes can drive the shared colorbars below.
    im = ax.pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)
    # pcolormesh cells span [i, i + 1], so i + 0.5 puts each label at a cell centre.
    ticks = [i + 0.5 for i in range(len(axis_labels))]
    # Flip the y axis so the first variable is on the top row, matrix-style.
    ax.invert_yaxis()
    ax.set_xticks(ticks)
    ax.set_xticklabels(axis_labels, fontsize=tick_fontsize)
    ax.set_yticks(ticks)
    ax.set_yticklabels(axis_labels, fontsize=tick_fontsize)
    # Cell outlines come from edgecolor/linewidth above; the former
    # grid(which='minor') call drew nothing because no minor ticks are set.

# One colorbar per column of panels. Keep the returned Colorbar object instead
# of looking its axes up by position in fig.axes.
shrink_amount = 1.065
for col in range(ax_lst.shape[1]):
    cbar = fig.colorbar(im, ax=ax_lst[:, col], shrink=shrink_amount) # options are pad, shrink, aspect
    cbar.ax.tick_params(labelsize=tick_fontsize)

# Hand-tuned margins and spacing for this 5x2 layout.
plt.subplots_adjust(
    top=0.905,
    bottom=0.085,
    left=0.14,
    right=0.825,
    hspace=0.6,
    wspace=0.62
)

plt.show()

In [30]:
# Minimal single-panel reprex: pcolormesh correlation plot with the colorbar
# drawn into its own dedicated axes.

# Close figures left over from earlier cells. plt.clf() would only blank the
# current figure (or create an empty one on a fresh kernel), leaving a stray
# empty window next to the new plot.
plt.close("all")
plt.rc("font", **font)

### Two side-by-side axes: the plot itself and a narrow axes (one tenth of the
### plot's width) reserved for the colorbar.
fig, (ax, cax) = plt.subplots(ncols=2, figsize=(6, 10), gridspec_kw={"width_ratios":[1, 0.1]})

# Correlations between the columns at positions 0, 10 and 20.
correlations = df.iloc[:, [0, 10, 20]].corr()
axis_labels = correlations.columns.tolist()
im = ax.pcolormesh(correlations, norm=colors.Normalize(0, 1), cmap="magma", edgecolor="black", linewidth=0.5)

# pcolormesh cells span [i, i + 1], so i + 0.5 puts each label at a cell centre.
ticks = [i + 0.5 for i in range(len(axis_labels))]
ax.set_xticks(ticks)
ax.set_xticklabels(axis_labels)
ax.set_yticks(ticks)
ax.set_yticklabels(axis_labels)
# Flip the y axis so the first variable is on the top row, matrix-style.
ax.invert_yaxis()

# pad / shrink / aspect only apply when matplotlib creates the colorbar axes
# itself; with an explicit cax they are ignored, so they are not passed here.
# The colorbar size is controlled by `width_ratios` above.
fig.colorbar(im, cax=cax)

fig.suptitle("Correlation Plots for 3 Methods of Calculating 10 Meteorological Metrics", size=22)
fig.patch.set_facecolor("xkcd:light grey")
plt.show()

In [5]:
### Corr plot overall

# Close figures left over from earlier cells. plt.clf() would only blank the
# current figure (or create an empty one on a fresh kernel), leaving a stray
# empty window next to the new plot.
plt.close("all")

# Positions (within the full correlation matrix) of the variables to display:
# 20..45 and 48..53 inclusive.
desired = [*range(20, 46), *range(48, 54)]

fig, ax1 = plt.subplots()
# NOTE(review): on pandas >= 2.0, df.corr() raises if df contains non-numeric
# columns; confirm that every column of df is numeric.
df_corr = df.corr().iloc[desired, desired]

# Fixed -1..1 colour scale; xticklabels=1 / yticklabels=1 label every variable.
sns.heatmap(data=df_corr, vmin=-1, vmax=1, ax=ax1,  xticklabels=1, yticklabels=1)
ax1.set_title("Correlation Matrix for Selected Variables")

# The heatmap is the first collection seaborn added to the axes (a QuadMesh);
# print its repr as a quick sanity check.
im = ax1.collections[0]

print(im)

plt.tight_layout()
plt.show()

<matplotlib.collections.QuadMesh object at 0x12c0f91e0>
