In [1]:
import pandas as pd
from os.path import join
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

## Determine an effective way to split the data into training and testing sets.

In [2]:
def _filter_run_months(frame, months=(4, 5, 6)):
    """
    Keep only the rows of `frame` whose 'Run Date' falls in one of `months`.

    The 'Run Date' column is converted to str in place first, so that later
    membership checks against the (string) split dates compare like with like.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'Run Date' column parseable by `pd.to_datetime`.
    months : iterable of int
        Calendar month numbers to keep (default: April, May, June).

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index retained).
    """
    frame['Run Date'] = frame['Run Date'].apply(str)
    # Compare month numbers rather than strftime('%B') names: month names
    # depend on the process locale, month numbers do not.
    run_month = pd.to_datetime(frame['Run Date']).dt.month
    return frame[run_month.isin(months)]


def _select_dates(frame, dates):
    """
    Return the rows of `frame` whose 'Run Date' is in `dates`, with a fresh
    0..n-1 index (the frames are written with `to_feather` afterwards).
    """
    return frame[frame['Run Date'].isin(dates)].reset_index(drop=True)


def _train_test_split(base_path='/work/mflora/ML_DATA/DATA',
                      out_path='/work/mflora/ML_DATA/MLDATA',
                      test_size=0.3,
                      random_state=42):
    """
    Randomly split the full dataset into training and testing
    based on the date.

    For each of the 'first_hour' and 'second_hour' datasets, the main and
    baseline feather files are read from `base_path`, restricted to
    April-June run dates, and split by unique 'Run Date' so that every row
    from a given date lands entirely in either the training or the testing
    set. The four resulting frames are written to `out_path`.

    Parameters
    ----------
    base_path : str
        Directory containing the `wofs_ml_severe__{time}__data.feather` and
        `wofs_ml_severe__{time}__baseline_data.feather` inputs.
    out_path : str
        Directory the train/test feather files are written to.
    test_size : float
        Fraction of the unique dates assigned to the testing set.
    random_state : int or None
        Seed passed to `train_test_split` so the split is reproducible.
        Pass None to get a different random split on every call.

    Notes
    -----
    The split dates come from the main dataset only; baseline rows whose
    date does not appear in the main dataset end up in neither output.
    """
    for time in ['first_hour', 'second_hour']:

        df = pd.read_feather(join(base_path, f'wofs_ml_severe__{time}__data.feather'))
        print(f'{df.shape=}')

        baseline_df = pd.read_feather(
            join(base_path, f'wofs_ml_severe__{time}__baseline_data.feather'))

        # Get the dates from April, May, and June.
        df = _filter_run_months(df)
        baseline_df = _filter_run_months(baseline_df)

        # Sort so the seeded split does not depend on row order in the file;
        # train_test_split does its own shuffling, so no extra shuffle is needed.
        all_dates = sorted(df['Run Date'].unique())
        train_dates, test_dates = train_test_split(
            all_dates, test_size=test_size, random_state=random_state)

        train_df = _select_dates(df, train_dates)
        test_df = _select_dates(df, test_dates)

        train_base_df = _select_dates(baseline_df, train_dates)
        test_base_df = _select_dates(baseline_df, test_dates)

        print(f'{train_df.shape=}')
        print(f'{test_df.shape=}')

        train_df.to_feather(join(out_path, f'wofs_ml_severe__{time}__train_data.feather'))
        test_df.to_feather(join(out_path, f'wofs_ml_severe__{time}__test_data.feather'))

        train_base_df.to_feather(
            join(out_path, f'wofs_ml_severe__{time}__train_baseline_data.feather'))
        test_base_df.to_feather(
            join(out_path, f'wofs_ml_severe__{time}__test_baseline_data.feather'))

In [3]:
# Run the date-based split for both the first_hour and second_hour datasets:
# reads the source feather files and writes the train/test feather files.
_train_test_split()

df.shape=(1013381, 190)
train_df.shape=(572792, 190)
test_df.shape=(251749, 190)
df.shape=(1430181, 190)
train_df.shape=(794449, 190)
test_df.shape=(370577, 190)


In [4]:
def get_features(df):
    """
    Get the feature columns from the DataFrame.

    Feature columns are every column positioned before 'hail_severe_3km',
    excluding the bookkeeping columns listed in `info`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns include 'hail_severe_3km'.

    Returns
    -------
    list
        Feature column names, in their original column order.

    Raises
    ------
    ValueError
        If 'hail_severe_3km' is not a column of `df`.
    """
    columns = list(df.columns)
    ind = columns.index('hail_severe_3km')
    info = ['forecast_time_index', 'obj_centroid_x', 'obj_centroid_y', 'Run Date', 'label']
    # Slice the columns of the frame that was passed in. The previous version
    # read a `train_df` name that is not defined in this function's scope.
    non_target_vars = columns[:ind]
    features = [f for f in non_target_vars if f not in info]
    return features