In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as ss
from math import floor
from matplotlib import colors
from fitter import Fitter, get_common_distributions, get_distributions

In [61]:
%matplotlib qt
rng = np.random.default_rng(100)

SMALL_SIZE = 20
MEDIUM_SIZE = 24
BIGGER_SIZE = 32
CHONK_SIZE = 38
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : SMALL_SIZE}
plt.rc('font', **font)
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE, facecolor="xkcd:white")
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=CHONK_SIZE, facecolor="xkcd:white", edgecolor="xkcd:black") #  powder blue


path_missing = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/tidy_data.csv"
dfm_full = pd.read_csv(path_missing, index_col=None, sep=",")
dfm_full.index += 1
dfm = dfm_full.copy(deep=True)
drop_cols = [x for x in range(10, 30)] + [46, 48]
dfm = dfm.drop(dfm.columns[drop_cols].values.tolist(), axis=1)
dfm.columns = ['CAPE', 'CIN', 'LCL', 'LFC', 'EL', 'LI', 'HGHT0C',
            'CAP', 'B3KM', 'BRN', 'SHEAR 0-1 KM', 'SHEAR 0-6 KM',
            'EFF INFLOW', 'EBWD', 'SRH 0-1 KM', 'SRH 0-3 KM', 'EFF SRH', 'SCP',
            'STP-FIXED', 'STP-MIXED', 'SHIP', 'PWAT', 'DCAPE', 'MLMR', 'LRAT',
            'TEI', 'TLCL', 'T500', 'SWEAT', 'K-INDEX', 'CRAV', 'HAIL SIZE IN']


path_complete = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df_full = pd.read_csv(path_complete, index_col=0)
df_full.index += 1
df = df_full.copy(deep=True)
drop_cols = [x for x in range(11, 31)] + [47, 49]
df = df.drop(df.columns[drop_cols].values.tolist(), axis=1)
df.columns = ['CAPE', 'CIN', 'LCL', 'LFC', 'EL', 'LI', 'HGHT0C',
            'CAP', 'B3KM', 'BRN', 'SHEAR 0-1 KM', 'SHEAR 0-6 KM',
            'EFF INFLOW', 'EBWD', 'SRH 0-1 KM', 'SRH 0-3 KM', 'EFF SRH', 'SCP',
            'STP-FIXED', 'STP-MIXED', 'SHIP', 'PWAT', 'DCAPE', 'MLMR', 'LRAT',
            'TEI', 'TLCL', 'T500', 'SWEAT', 'K-INDEX', 'CRAV', 'HAIL SIZE IN']

dfm
df

Unnamed: 0,CAPE,CIN,LCL,LFC,EL,LI,HGHT0C,CAP,B3KM,BRN,...,DCAPE,MLMR,LRAT,TEI,TLCL,T500,SWEAT,K-INDEX,CRAV,HAIL SIZE IN
1,565.886137,-2.456216,591.712340,760.740300,10016.261419,-2.475117,3057.724668,14.031755,164.527269,23.507197,...,455.116074,11.200228,6.069679,14.624760,16.369450,16.369450,237.167161,27.148344,11784.929088,1.25
2,93.557330,-61.118000,818.659297,1485.730600,4147.988929,1.094013,2878.872717,9.819021,43.581811,2.961764,...,310.547370,9.709781,5.573791,11.207180,14.814369,14.814369,196.419861,27.202330,1995.924762,1.00
3,416.713894,-0.701233,682.113493,751.489413,7419.731564,-2.174859,3043.083673,14.935360,145.276881,15.405312,...,625.998000,10.208610,6.338184,24.258178,15.830423,15.830423,195.164206,9.331257,8136.811509,1.00
4,1110.622796,-12.420499,536.926037,989.547800,11364.753475,-4.154931,3532.140768,18.580863,172.484246,22.193236,...,343.228844,12.461809,6.614233,19.432405,18.284842,17.999692,250.757864,21.326550,31959.336376,1.50
5,1107.162497,-12.514324,536.912773,1008.662600,11386.082876,-4.102513,3583.432806,18.936401,181.416062,23.363115,...,403.770627,12.579437,6.578394,21.032620,18.199308,18.092846,264.888229,20.604840,32653.287157,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29098,0.000000,0.000000,2343.569759,1107.279086,2343.569759,12.494209,3528.145294,11.552847,0.000000,0.000000,...,43.767507,3.629146,7.248768,25.792261,1.717536,7.683926,296.701531,17.164210,0.000000,1.25
29099,0.000000,0.000000,2326.323051,1107.279086,2326.323051,14.986276,3497.386732,11.339111,0.000000,0.000000,...,50.793697,3.101795,7.258029,27.763669,0.607784,7.955932,303.557106,19.452570,0.000000,1.00
29100,0.000000,0.000000,2690.384769,2630.432840,2690.384769,14.638317,3482.939445,16.114445,0.000000,0.000000,...,61.023765,2.921537,7.280055,25.590055,-0.462974,4.940292,232.337003,11.883790,0.000000,1.00
29101,0.000000,0.000000,2807.261593,2441.488395,2807.261593,16.123160,3451.467575,12.735316,0.000000,0.000000,...,74.905532,2.476587,7.248819,25.573522,-0.424774,1.702041,161.945441,6.005967,0.000000,0.88


In [75]:
# Plot the mean and standard deviation of each column for both datasets

fig, [ax1, ax2] = plt.subplots(nrows=2)
sns.scatterplot(data=df, x=np.sqrt(dfm.mean()), y = np.sqrt(df.mean()), ax=ax1, color="xkcd:red", s=50)
sns.scatterplot(data=df, x=np.sqrt(dfm.std()), y = np.sqrt(df.std()), ax=ax2, color="xkcd:red", s=50)
ax1_xlim = ax1.get_xlim()
ax1_ylim = ax1.get_ylim()
ax2_xlim = ax2.get_xlim()
ax2_ylim = ax2.get_ylim()

# set axis ticks to be the same
ax1.set_xticks(np.arange(0, ax1_xlim[1], 20))
ax2.set_xticks(np.arange(0, ax2_xlim[1], 20))


# draw a line with slope of 1 through ax1 and ax2
sns.lineplot(x = np.linspace(ax1_xlim[0], ax1_xlim[1], 100), y = np.linspace(ax1_ylim[0], ax1_ylim[1], 100), ax=ax1)
sns.lineplot(x = np.linspace(ax2_xlim[0], ax2_xlim[1], 100), y = np.linspace(ax2_ylim[0], ax2_ylim[1], 100), ax=ax2)

ax1.set_xlabel("Missing Data")
ax1.set_ylabel("KNN Imputed Data")
ax2.set_xlabel("Missing Data")
ax2.set_ylabel("KNN Imputed Data")
ax1.set_title("Root Transformed Mean")
ax2.set_title("Root Transformed Standard Deviation")
ax1.text(-1.5, 48, s="T500", fontsize=20)
ax2.text(-2, 40, s="T500", fontsize=20)
# for line in range(0,df.shape[1]):
#      ax1.text(dfm.mean()[line]+0.2, df.mean()[line], s=df.columns[line], horizontalalignment='left', size='medium', color='black', weight='semibold')



plt.show()


  result = getattr(ufunc, method)(*inputs, **kwargs)
