# Training Data Validation

In [1]:
import pandas as pd
import utils

from constants import *

In [2]:
# Resolve the training CSV location first, then load it with the date
# column parsed and used as the index.
train_csv_path = utils.resolve_path(DATA_DIR, TRAIN_FILE)
df_train = pd.read_csv(train_csv_path, index_col=DATE_COL, parse_dates=True)
df_train.head()

Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 07:42:00,6630.097,5739.308,-15373.126,-35.407,15.722,4.999,0.85,0.94,0.386,18120.328019,...,5817703.0,-49.256716,-26.708257,2.337419,56.080465,48820450.0,0.633532,2,2,0
2011-03-24 07:42:01,6630.372,5739.723,-15372.852,-35.976,14.327,3.401,0.606,0.811,0.729,18120.320315,...,5817703.0,-49.256716,-26.708257,2.337456,56.080467,48820450.0,0.633532,0,2,0
2011-03-24 07:42:02,6630.645,5740.138,-15372.58,-34.115,17.308,3.078,0.434,0.666,0.367,18120.313647,...,5817703.0,-49.256716,-26.708257,2.337478,56.080468,48820450.0,0.633532,0,2,0
2011-03-24 07:42:03,6630.92,5740.553,-15372.306,-34.83,17.01,4.846,1.054,1.083,0.505,18120.305979,...,5817703.0,-49.256716,-26.708257,2.337449,56.080466,48820450.0,0.633532,0,2,0
2011-03-24 07:42:04,6631.195,5740.968,-15372.032,-37.102,14.965,0.64,0.478,0.981,2.181,18120.298328,...,5817703.0,-49.256716,-26.708257,2.33742,56.080465,48820450.0,0.633532,0,2,0


In [3]:
# Summarise the index range, column names and dtypes of the loaded frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 99126933 entries, 2011-03-24 07:42:00 to 2015-04-30 06:45:32
Data columns (total 30 columns):
 #   Column        Dtype  
---  ------        -----  
 0   X_MSO         float64
 1   Y_MSO         float64
 2   Z_MSO         float64
 3   BX_MSO        float64
 4   BY_MSO        float64
 5   BZ_MSO        float64
 6   DBX_MSO       float64
 7   DBY_MSO       float64
 8   DBZ_MSO       float64
 9   RHO_DIPOLE    float64
 10  PHI_DIPOLE    float64
 11  THETA_DIPOLE  float64
 12  BABS_DIPOLE   float64
 13  BX_DIPOLE     float64
 14  BY_DIPOLE     float64
 15  BZ_DIPOLE     float64
 16  RHO           float64
 17  RXY           float64
 18  X             float64
 19  Y             float64
 20  Z             float64
 21  VX            float64
 22  VY            float64
 23  VZ            float64
 24  VABS          float64
 25  D             float64
 26  COSALPHA      float64
 27  EXTREMA       int64  
 28  ORBIT         int64  
 29  LABEL      

In [4]:
# Distinct orbit identifiers present in the training data.
orbit_ids = df_train[ORBIT_COL].unique()
orbit_count = len(orbit_ids)
print(f"#orbits: {orbit_count}")

#orbits: 3153


At first glance, the orbit length (the number of rows per orbit) appears to vary widely.

In [5]:
# Number of rows recorded for each orbit, indexed by orbit id.
rows_by_orbit = df_train.groupby(ORBIT_COL)
sizes = rows_by_orbit.size()
sizes.describe()

count     3153.000000
mean     31438.925785
std       5314.769120
min      27937.000000
25%      28806.000000
50%      28809.000000
75%      29572.000000
max      43466.000000
dtype: float64

The reason for this is that MESSENGER changed from a 12-hour orbit to an 8-hour orbit in April 2012.

In [6]:
# The orbit with the largest absolute size change relative to its
# predecessor is taken as the point where the orbit length changed.
size_jumps = sizes.diff().abs()
drop_idx = size_jumps.idxmax()

# Date of the first row that belongs to that orbit.
first_stamp = df_train.index[df_train[ORBIT_COL].eq(drop_idx)][0]
drop_day = first_stamp.strftime("%Y-%m-%d")

print(f"orbit lengths dropped starting with orbit #{drop_idx} on {drop_day}")

# Show the sizes of the orbits surrounding the change.
sizes.loc[drop_idx - 5:drop_idx + 5]

orbit lengths dropped starting with orbit #790 on 2012-04-16


ORBIT
785    41778
786    41777
787    41778
788    41777
790    32687
791    32685
792    32686
793    32686
794    32686
795    32686
dtype: int64

When the orbits before and after this change are considered separately, most of the initial deviation disappears.

In [7]:
# Size statistics for the orbits that precede `drop_idx`.
sizes_before_drop = sizes.loc[:drop_idx - 1]
sizes_before_drop.describe()

count      566.000000
mean     42762.409894
std        558.423890
min      41777.000000
25%      42432.000000
50%      42486.000000
75%      43204.000000
max      43466.000000
dtype: float64

In [8]:
# Size statistics for the orbits from `drop_idx` onwards.
sizes_from_drop = sizes.loc[drop_idx:]
sizes_from_drop.describe()

count     2587.000000
mean     28961.503286
std        396.403636
min      27937.000000
25%      28805.000000
50%      28808.000000
75%      28813.000000
max      32687.000000
dtype: float64

Verify that no NaN values exist.

In [9]:
# Count missing values across every column of the training frame.
nan_count = df_train.isnull().sum().sum()
print(f"There are {nan_count} NaN values.")

# Fail loudly on a full re-run instead of relying on someone reading the
# printed count: this cell is a validation step, not just a report.
assert nan_count == 0, f"training data contains {nan_count} NaN values"

There are 0 NaN values.
