In [113]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
from math import floor
from matplotlib import colors
from fitter import Fitter, get_common_distributions, get_distributions

In [114]:
%matplotlib qt
rng = np.random.default_rng(100)

SMALL_SIZE = 20
MEDIUM_SIZE = 24
BIGGER_SIZE = 32
CHONK_SIZE = 38
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : SMALL_SIZE}
plt.rc('font', **font)
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE, facecolor="xkcd:white")
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=CHONK_SIZE, facecolor="xkcd:white", edgecolor="xkcd:black") #  powder blue

drop_lst = ["MU CAPE", "MU CIN", "MU LCL", "MU LFC", "MU EL", "MU LI", "MU hght0c", "MU cap", "MU b3km", "MU brn", "SB CAPE", "SB CIN", "SB LCL", "SB LFC", "SB EL", "SB LI", "SB hght0c",
"SB cap", "SB b3km", "SB brn", "sb_tlcl", "mu_tlcl"]

col_names = ['CAPE', 'CIN', 'LCL', 'LFC', 'EL', 'LI', 'HGHT0C',
            'CAP', 'B3KM', 'BRN', 'SHEAR 0-1 KM', 'SHEAR 0-6 KM',
            'EFF INFLOW', 'EBWD', 'SRH 0-1 KM', 'SRH 0-3 KM', 'EFF SRH', 'SCP',
            'STP-FIXED', 'STP-MIXED', 'SHIP', 'PWAT', 'DCAPE', 'MLMR', 'LRAT',
            'TEI', 'TLCL', 'T500', 'SWEAT', 'K-INDEX', 'CRAV', 'HAIL SIZE IN']

path_missing = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/tidy_data.csv"
dfm_full = pd.read_csv(path_missing, index_col=None, sep=",")
dfm_full.index += 1
dfm = dfm_full.copy(deep=True)
dfm = dfm.drop(columns = drop_lst)
dfm.columns = col_names

path_complete = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df_full = pd.read_csv(path_complete, index_col=0)
df_full.index += 1
df = df_full.copy(deep=True)
df = df.drop(columns = drop_lst)
df.columns = col_names

dfm
df

Unnamed: 0,CAPE,CIN,LCL,LFC,EL,LI,HGHT0C,CAP,B3KM,BRN,...,DCAPE,MLMR,LRAT,TEI,TLCL,T500,SWEAT,K-INDEX,CRAV,HAIL SIZE IN
1,565.886137,-2.456216,591.712340,760.740300,10016.261419,-2.475117,3057.724668,14.031755,164.527269,23.507197,...,455.116074,11.200228,6.069679,14.624760,14.650496,-15.9184,237.167161,27.148344,11784.929088,1.25
2,93.557330,-61.118000,818.659297,1485.730600,4147.988929,1.094013,2878.872717,9.819021,43.581811,2.961764,...,310.547370,9.709781,5.573791,11.207180,11.829009,-15.5371,196.419861,27.202330,1995.924762,1.00
3,416.713894,-0.701233,682.113493,751.489413,7419.731564,-2.174859,3043.083673,14.935360,145.276881,15.405312,...,625.998000,10.208610,6.338184,24.258178,13.206279,-16.9460,195.164206,9.331257,8136.811509,1.00
4,1110.622796,-12.420499,536.926037,989.547800,11364.753475,-4.154931,3532.140768,18.580863,172.484246,22.193236,...,343.228844,12.461809,6.614233,19.432405,16.262729,-15.3177,250.757864,21.326550,31959.336376,1.50
5,1107.162497,-12.514324,536.912773,1008.662600,11386.082876,-4.102513,3583.432806,18.936401,181.416062,23.363115,...,403.770627,12.579437,6.578394,21.032620,16.380479,-15.2050,264.888229,20.604840,32653.287157,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29098,0.000000,0.000000,2343.569759,1107.279086,2343.569759,12.494209,3528.145294,11.552847,0.000000,0.000000,...,43.767507,3.629146,7.248768,25.792261,-5.193819,-15.9886,296.701531,17.164210,0.000000,1.25
29099,0.000000,0.000000,2326.323051,1107.279086,2326.323051,14.986276,3497.386732,11.339111,0.000000,0.000000,...,50.793697,3.101795,7.258029,27.763669,-7.145827,-16.1519,303.557106,19.452570,0.000000,1.00
29100,0.000000,0.000000,2690.384769,2630.432840,2690.384769,14.638317,3482.939445,16.114445,0.000000,0.000000,...,61.023765,2.921537,7.280055,25.590055,-9.417577,-16.2501,232.337003,11.883790,0.000000,1.00
29101,0.000000,0.000000,2807.261593,2441.488395,2807.261593,16.123160,3451.467575,12.735316,0.000000,0.000000,...,74.905532,2.476587,7.248819,25.573522,-11.976938,-16.5189,161.945441,6.005967,0.000000,0.88


In [126]:
# Analysis of Missing Data

print(dfm.corr().iloc[:, -1] - df.corr().iloc[:, -1])

# find any entry with missing values in dfm
(dfm.isna().sum())


CAPE            0.000000
CIN             0.000000
LCL             0.000000
LFC            -0.023023
EL              0.000000
LI              0.000000
HGHT0C          0.000000
CAP            -0.014235
B3KM            0.000000
BRN             0.000000
SHEAR 0-1 KM    0.000000
SHEAR 0-6 KM    0.000000
EFF INFLOW     -0.019890
EBWD            0.000000
SRH 0-1 KM      0.000000
SRH 0-3 KM      0.000000
EFF SRH         0.000000
SCP             0.000000
STP-FIXED       0.000000
STP-MIXED       0.000000
SHIP            0.000000
PWAT            0.000000
DCAPE           0.000000
MLMR            0.000000
LRAT            0.000000
TEI             0.000000
TLCL            0.000000
T500            0.000000
SWEAT           0.000000
K-INDEX         0.000000
CRAV            0.000000
HAIL SIZE IN    0.000000
Name: HAIL SIZE IN, dtype: float64


CAPE                0
CIN                 0
LCL                 0
LFC              2708
EL                  0
LI                  0
HGHT0C              0
CAP             16143
B3KM                0
BRN                 0
SHEAR 0-1 KM        0
SHEAR 0-6 KM        0
EFF INFLOW       2069
EBWD                0
SRH 0-1 KM          0
SRH 0-3 KM          0
EFF SRH             0
SCP                 0
STP-FIXED           0
STP-MIXED           0
SHIP                0
PWAT                0
DCAPE               0
MLMR                0
LRAT                0
TEI                 0
TLCL                0
T500                0
SWEAT               0
K-INDEX             0
CRAV                0
HAIL SIZE IN        0
dtype: int64

In [116]:
# Check out 0-1 km shear and t500

# dfm["SHEAR 0-1 KM"][dfm['SHEAR 0-1 KM'] != df["SHEAR 0-1 KM"]]

a = dfm.iloc[:1000, 11].to_list()
b = df.iloc[:1000, 11].to_list()
for i in range(len(a)):
    print(a[i], b[i])

40.4817997653 40.4817997653
41.469446593 41.469446593
37.9558016256 37.9558016256
55.9362133586 55.9362133586
57.3294050573 57.3294050573
47.7262686591 47.7262686591
42.7417461089 42.7417461089
47.8914251895 47.8914251895
45.3341932698 45.3341932698
47.5725146852 47.5725146852
53.0831471484 53.0831471484
52.1602109021 52.1602109021
57.2357747239 57.2357747239
66.9055653824 66.9055653824
64.0649983753 64.0649983753
65.1454018118 65.1454018118
63.6724818379 63.6724818379
59.0310575361 59.0310575361
56.9011814253 56.9011814253
57.5124810233 57.5124810233
55.760846951 55.760846951
54.0112559162 54.0112559162
60.9397891292 60.9397891292
57.8661902523 57.8661902523
56.8507785599 56.8507785599
56.4111172088 56.4111172088
58.3869549228 58.3869549228
54.1940096218 54.1940096218
56.0815626712 56.0815626712
54.3834852383 54.3834852383
57.6999505536 57.6999505536
60.0272877393 60.0272877393
62.9503387899 62.9503387899
60.011592658 60.011592658
61.9096447008 61.9096447008
62.0007389103 62.000738910

In [189]:
# Plot the mean and standard deviation of each column for both datasets

missing_cols = ["LFC", "CAP", "EFF INFLOW"]

fig, [ax1, ax2] = plt.subplots(nrows=2)
sns.scatterplot(data=df, x=np.sqrt(dfm[missing_cols].mean()), y = np.sqrt(df[missing_cols].mean()), ax=ax1, color="xkcd:red", s=50)
sns.scatterplot(data=df, x=np.sqrt(dfm[missing_cols].std()), y = np.sqrt(df[missing_cols].std()), ax=ax2, color="xkcd:red", s=50)
ax1_xlim = ax1.get_xlim()
ax1_ylim = ax1.get_ylim()
ax2_xlim = ax2.get_xlim()
ax2_ylim = ax2.get_ylim()

# set axis ticks to be the same
ax1.set_xticks(np.arange(0, ax1_xlim[1], 10))
ax2.set_xticks(np.arange(0, ax2_xlim[1], 10))
ax1.set_yticks(np.arange(0, ax1_ylim[1], 10))
ax2.set_yticks(np.arange(0, ax2_ylim[1], 10))

# draw a line with slope of 1 through ax1 and ax2
sns.lineplot(x = np.linspace(ax1_xlim[0], ax1_xlim[1], 100), y = np.linspace(ax1_ylim[0], ax1_ylim[1], 100), ax=ax1)
sns.lineplot(x = np.linspace(ax2_xlim[0], ax2_xlim[1], 100), y = np.linspace(ax2_ylim[0], ax2_ylim[1], 100), ax=ax2)

ax1.set_xlabel("Original Data")
ax1.set_ylabel("KNN Imputed Data")
ax2.set_xlabel("Original Data")
ax2.set_ylabel("KNN Imputed Data")
ax1.set_title("Root Transformed Means")
ax2.set_title("Root Transformed SDs")

plt.subplots_adjust(
top=0.915,
bottom=0.115,
left=0.19,
right=0.81,
hspace=0.6,
wspace=0.2)

# ax1.text(np.log(dfm.iloc[-5].mean()), np.log(df.iloc[-5].mean()),  s=df.columns[-5], fontsize=20)
opts = {"horizontalalignment":'left', "size":'medium', "color":'black', "weight":'semibold'}
mean = lambda dfx, col: np.sqrt(dfx[col].mean())
std = lambda dfx, col: np.sqrt(dfx[col].std())

ax1.text(mean(dfm, "CAP")-2.2, mean(df, "CAP")+3,  s="CAP", **opts)
ax2.text(std(dfm, "CAP")-1.4, std(df, "CAP")+2.3,  s="CAP", **opts)

ax1.text(mean(dfm, "EFF INFLOW")-9, mean(df, "EFF INFLOW")-7,  s="EFF INFLOW", **opts)
ax2.text(std(dfm, "EFF INFLOW")+1.5, std(df, "EFF INFLOW")-1.5,  s="EFF INFLOW", **opts)

ax1.text(mean(dfm, "LFC")-2.5, mean(df, "LFC")-7.5,  s="LFC", **opts)
ax2.text(std(dfm, "LFC")-1.8, std(df, "LFC")-5,  s="LFC", **opts)



plt.show()
