In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas
import numpy as np
import datetime
import seaborn as sns
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
def pred_bin_out(row, nBins, binTimeRes):
    """
    Expand the predicted label into per-bin binary outputs and
    classify the prediction against the actual label.

    The predicted label is written in binary, zero padded to
    nBins digits (for ex, with nBins=3 label 2 becomes 010 and
    5 becomes 101). Each digit is stored in its own "pbin_<n>"
    entry, left-most digit first.

    Parameters
    ----------
    row : mapping (for ex, one DataFrame row passed by apply)
        Must provide "label" and "pred_label". It is modified
        in place and also returned.
    nBins : int
        Number of bins, i.e., the minimum width of the binary
        string.
    binTimeRes :
        Not used by this function; kept so existing callers
        do not need to change.

    Returns
    -------
    The same row with the "pbin_<n>" entries and "pred_type"
    added. "pred_type" is "TN"/"FP" when the label is 0,
    "TP"/"FN" when the label is 1 and None for any other label.
    """
    # Note we need the binary format to be consistent
    # with the actual labels, i.e., it depends on the
    # number of bins. For example, 2 could be 10 or 010.
    binFormtStr = '{0:0' + str(nBins) + 'b}'
    predBinStr = binFormtStr.format(row["pred_label"])
    # Now add these into different pred bins
    for _n, _pb in enumerate(predBinStr):
        row["pbin_" + str(_n)] = int(_pb)
    # Default for labels other than 0/1, so the assignment
    # below never hits an unbound variable.
    predType = None
    if row["label"] == 0:
        predType = "TN" if row["pred_label"] == 0 else "FP"
    elif row["label"] == 1:
        predType = "TP" if row["pred_label"] == 1 else "FN"
    row["pred_type"] = predType
    return row

In [None]:
# Settings describing the layout of the prediction file
nBins = 1
binRes = 90
# Column layout: the date, one "bin_<n>" column per bin, the
# label columns and finally two prob columns per bin
# (one zero prob and one 1 prob).
binCols = [ "bin_" + str(_nb) for _nb in range(nBins) ]
probCols = []
for _nb in range(nBins):
    probCols.extend( [ "prob_type_0_b_" + str(_nb),
                       "prob_type_1_b_" + str(_nb) ] )
colNames = ["date"] + binCols + ["label", "del_minutes","pred_label"] + probCols
# header=0 replaces the header row of the file with colNames
predDF = pandas.read_csv( "../data/resnet_test_data_pred.csv",
                          names=colNames, header=0,
                          parse_dates=["date"] )
# add the per-bin predicted outputs and the TP/FP/TN/FN tag row by row
predDF = predDF.apply( pred_bin_out, axis=1, args=(nBins, binRes) )
predDF.head()

In [None]:
# Time range to request from the database: from two hours before
# the earliest prediction date up to the latest prediction date.
# NOTE(review): the profile cell further down reads up to 40 minutes
# past each prediction date, so the latest dates get a shortened
# window - confirm whether end_date should be padded as well.
start_date = predDF["date"].min() - datetime.timedelta(hours=2)
end_date = predDF["date"].max()
# A single pre-formatted string prints the same text under the
# Python 2 print statement and the Python 3 print function.
print("{0} {1}".format(start_date, end_date))

In [None]:
# Table to query, and the sqlite file (name and directory) holding it
omn_table_name = "imf_sw"
omn_db_name = "omni_sw_imf.sqlite"
omn_dbdir = "../data/sqlite3/"
# NOTE(review): timeDelHours is not referenced by any of the visible cells
timeDelHours = 0.5

In [None]:
# read omni data
conn = sqlite3.connect(omn_dbdir + omn_db_name,
                       detect_types = sqlite3.PARSE_DECLTYPES)
# The table name cannot be bound as a query parameter, so it is
# formatted in; the two time limits are bound with "?" placeholders
# instead of being pasted into the SQL text.
command = "SELECT datetime, Bz, Vx, By FROM {tb} WHERE datetime BETWEEN ? and ?"
command = command.format(tb=omn_table_name)
try:
    # load data to a dataframe
    # str() gives the same text the limits had when they were
    # formatted directly into the query string.
    omnDF = pandas.read_sql(command, conn,
                            params=(str(start_date), str(end_date)))
finally:
    # release the database handle even when the query fails
    conn.close()
# drop nan's
omnDF = omnDF.dropna()
# Change the index to datetime. The "datetime" column itself is
# kept as well, because a Series (not a column name) is passed.
omnDF = omnDF.set_index(omnDF["datetime"])
omnDF.head()

In [None]:
# For every prediction date, cut out the solar wind / IMF time
# history around it, tag the rows with the prediction type of that
# date and collect all the pieces into one DataFrame.
# Hours of history to take before each prediction date.
delTimeList = [ 2 ]
predOmnPrfDFList = []
for _pt in predDF["pred_type"].unique():
    selDFDates = predDF[ predDF["pred_type"] == _pt ]["date"].tolist()
    # Now we need the time history at each of these dates
    for _dtl in delTimeList:
        for _cd in selDFDates:
            # window: _dtl hours before to _dtl*20 minutes after the date
            _ed =_cd - datetime.timedelta(minutes=int(_dtl*60))
            _nd = _cd + datetime.timedelta(minutes=int(_dtl*20))
            # Explicit copy: columns are added below and must not be
            # written through a slice of omnDF.
            _resOmnDF = omnDF[ _ed : _nd ].copy()
            # Whole minutes relative to the prediction date. Dividing
            # by one minute and flooring gives the same floats as the
            # former astype('timedelta64[m]'), which current pandas
            # versions no longer accept.
            _resOmnDF["delTimeOnset"] = np.floor(
                        (_resOmnDF["datetime"] - _cd) / np.timedelta64(1, 'm') )
            _resOmnDF["pred_type"] = _pt
            _resOmnDF["pred_date"] = _cd
            _resOmnDF = _resOmnDF[["Bz", "Vx", "By",\
                            "delTimeOnset", "pred_date", "pred_type"]]

            # clock angle from By and Bz, wrapped into [0, 2*pi)
            _resOmnDF["theta_c"] = np.round(np.arctan2(_resOmnDF["By"],\
                                            _resOmnDF["Bz"]), 2) % (2*np.pi)
            # magnitude of the (By, Bz) component
            _resOmnDF["B_T"] = np.sqrt(np.square(_resOmnDF["By"]) + np.square(_resOmnDF["Bz"]))
            # Coupling function v^(4/3) * B_T^(2/3) * sin(theta_c/2)^(8/3).
            # The speed enters as a magnitude: a negative Vx raised to
            # a fractional power would give NaN for the whole column.
            _resOmnDF["newell"] = (np.abs(_resOmnDF["Vx"])**(4./3)) \
                            * (_resOmnDF["B_T"] ** (2./3)) \
                            * (np.sin(_resOmnDF["theta_c"] / 2.))**(8./3)

            # the datetime index is not needed any more
            _resOmnDF.reset_index(inplace=True, drop=True)
            predOmnPrfDFList.append( _resOmnDF )

# concat keeps each piece's own 0..k index, so index labels repeat
predOmnPrflDF = pandas.concat(predOmnPrfDFList)
predOmnPrflDF.head()

In [None]:
# Save the collected profiles to disk. No index argument is given,
# so the (repeating) row index is written as the first CSV column.
predOmnPrflDF.to_csv("../data/omn_cplng_profile_cat.csv")

In [None]:
# Mean and standard deviation of Bz, Vx and By for every
# (time relative to prediction date, prediction type) pair.
_grpKeys = ["delTimeOnset", "pred_type"]
_paramGrps = predOmnPrflDF[ ["Bz", "Vx", "By", "delTimeOnset", "pred_type"] ]\
                    .groupby( _grpKeys )
# value columns become mean_<param> / std_<param>, the group keys
# are turned back into ordinary columns
meanParamDF = _paramGrps.mean().add_prefix("mean_").reset_index()
stdParamDF = _paramGrps.std().add_prefix("std_").reset_index()
# one row per group with both statistics side by side
meanParamDF = pandas.merge( meanParamDF, stdParamDF, on=_grpKeys )
meanParamDF.head()

In [None]:
# Mean Bz against time relative to the prediction date, one series
# per prediction type, with the standard deviation as error bars.
# NOTE(review): the output files are named "median" although the
# plotted values are means.
plt.style.use("fivethirtyeight")
predTypeList = [ "TP", "FP", "FN", "TN" ]
f, ax = plt.subplots(figsize=(12, 8))

for _pd in predTypeList:
    selDF = meanParamDF.loc[ meanParamDF["pred_type"] == _pd ]
    _relTime = selDF["delTimeOnset"].values
    _meanBz = selDF["mean_Bz"].values
    ax.scatter( _relTime, _meanBz, label=_pd )
    # empty label keeps the error bars out of the legend
    ax.errorbar( _relTime, _meanBz, yerr=selDF["std_Bz"].values,
                 fmt='o', capsize=5., capthick=2., label='' )
ax.legend()
for _outFile in ( "../plots/epoch_pred_types_Bz_median.pdf",
                  "../plots/epoch_pred_types_Bz_median.eps" ):
    f.savefig( _outFile )

In [None]:
# Mean Vx against time relative to the prediction date, one series
# per prediction type (error bars from std_Vx are not drawn).
# NOTE(review): the output files are named "median" although the
# plotted values are means.
plt.style.use("fivethirtyeight")
predTypeList = [ "TP", "FP", "FN", "TN" ]
f, ax = plt.subplots(figsize=(12, 8))

for _pd in predTypeList:
    selDF = meanParamDF.loc[ meanParamDF["pred_type"] == _pd ]
    _relTime = selDF["delTimeOnset"].values
    _meanVx = selDF["mean_Vx"].values
    ax.scatter( _relTime, _meanVx, label=_pd )
ax.legend()
for _outFile in ( "../plots/epoch_pred_types_Vx_median.pdf",
                  "../plots/epoch_pred_types_Vx_median.eps" ):
    f.savefig( _outFile )

In [None]:
# Mean By against time relative to the prediction date, one series
# per prediction type (error bars from std_By are not drawn).
# NOTE(review): the output files are named "median" although the
# plotted values are means.
plt.style.use("fivethirtyeight")
predTypeList = [ "TP", "FP", "FN", "TN" ]
f, ax = plt.subplots(figsize=(12, 8))

for _pd in predTypeList:
    selDF = meanParamDF.loc[ meanParamDF["pred_type"] == _pd ]
    _relTime = selDF["delTimeOnset"].values
    _meanBy = selDF["mean_By"].values
    ax.scatter( _relTime, _meanBy, label=_pd )
ax.legend()
for _outFile in ( "../plots/epoch_pred_types_By_median.pdf",
                  "../plots/epoch_pred_types_By_median.eps" ):
    f.savefig( _outFile )

In [None]:
# Group the profile rows into 5 minute wide bins of delTimeOnset.
# The bin edges run from -120 to 5; rows falling outside of the
# edges get no bin (NaN) from pandas.cut.
delTBins = range(-120,10,5)
_delTBinCol = pandas.cut( predOmnPrflDF["delTimeOnset"], bins=delTBins )
# attach the bin as an extra column named "delT_bin"
oldColNames = predOmnPrflDF.columns.tolist()
predDF2 = pandas.concat( [ predOmnPrflDF, _delTBinCol ], axis=1 )
predDF2.columns = oldColNames + ["delT_bin"]
# keep only the rows tagged with one of the four prediction types
_keepTypes = [ "TP", "FP", "FN", "TN"]
predDF2 = predDF2[ predDF2["pred_type"].isin(_keepTypes) ]

In [None]:
# Notched box plots of Bz in every delTimeOnset bin, side by side
# for the four prediction types (outliers are not drawn).
plt.style.use("fivethirtyeight")
predTypeList = [ "TP", "FP", "FN", "TN" ]
f, ax = plt.subplots(figsize=(12, 8))

_boxOpts = dict( showfliers=False, notch=True, width=0.5, linewidth=0. )
sns.boxplot( data=predDF2, x="delT_bin", y="Bz", hue="pred_type",
             hue_order=predTypeList, ax=ax, **_boxOpts )

# fixed Bz range for the y axis, tilted bin labels on the x axis
ax.set_ylim([-5,2])
plt.xticks(rotation=45)
for _outFile in ( "../plots/epoch_pred_types_Bz_boxplot.pdf",
                  "../plots/epoch_pred_types_Bz_boxplot.eps" ):
    f.savefig( _outFile )

In [None]:
# omn_dbdir = "../data/sqlite3/"
# omn_db_name = "smu_sml_sme.sqlite"
# omn_table_name = "smusmlsme"
# conn = sqlite3.connect(omn_dbdir + omn_db_name,
#                        detect_types = sqlite3.PARSE_DECLTYPES)
# # load data to a dataframe
# command = "SELECT datetime, al, ae, au FROM {tb} WHERE datetime BETWEEN '{stm}' and '{etm}'"
# command = command.format(tb=omn_table_name,\
#                          stm=start_date, etm=end_date)
# smlDF = pandas.read_sql(command, conn)
# # drop nan's
# smlDF.dropna(inplace=True)
# smlDF.set_index(smlDF["datetime"], inplace=True)
# smlDF.head()

In [None]:
# def get_sml_vars(row):
#     """
#     Get mean, median, std, min and max of sml 
#     during various substorms over the next interval range.
#     """
#     delTimeList = [30, 60]#[ 15, 30, 60, 120 ]
#     for _dtl in delTimeList:
#         _pd = row["pred_date"] - datetime.timedelta(minutes=10)
#         _cd = row["pred_date"] + datetime.timedelta(minutes=1)
#         _ed = row["pred_date"] + datetime.timedelta(minutes=_dtl)
#         _resDF = smlDF[ _cd : _ed ]
#         _baselineAl = smlDF[ _pd : _cd ]["al"].median()
#         _baselineAe = smlDF[ _pd : _cd ]["ae"].median()
#         row["mean_al_" + str(_dtl)] = _resDF["al"].mean()
#         row["median_al_" + str(_dtl)] = _resDF["al"].median()
#         row["min_al_" + str(_dtl)] = _resDF["al"].min()
#         row["max_al_" + str(_dtl)] = _resDF["al"].max()
#         # difference between current AL and minimum in the next bin
#         # note this is defined to be negative, for easy binning etc
#         row["al_dip" + str(_dtl)] = _resDF["al"].min() - _baselineAl
#         row["ae_dip" + str(_dtl)] = _resDF["ae"].max() - _baselineAe
#     return row

# predDF2 = predDF2.apply( get_sml_vars, axis=1 )
# predDF2.head()

In [None]:
# predOmnPrflDF.to_csv("../data/omn_sml_profile_cat.csv")