In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Test Train Split
## Bridge Split
An 80:20 train/test split is applied to the data. This is achieved by splitting the `bridge` table using `train_test_split` from `sklearn`. The data is shuffled because the raw records are sorted. The other tables are then split by filtering them on the unique IDs present in each `bridge` split.

In [2]:
bridge_df = pd.read_csv('../data/raw/bridge.csv')

# Shuffled 80:20 split with a fixed seed; each part is then put back into
# index order so the later filtering checks are easier to confirm.
bridge_train, bridge_test = (
    part.sort_index()
    for part in train_test_split(bridge_df, test_size=0.2, shuffle=True, random_state=42)
)

It is confirmed that the split was successful by comparing the combination of test and train data with the original pre-split data.

In [3]:
# the two splits are recombined with pd.concat (DataFrame.append was removed in pandas 2.0)
# the combined data index needs to be sorted because the dataframes must be aligned to use compare
# empty DataFrame result indicates identical DataFrames
pd.concat([bridge_train, bridge_test]).sort_index().compare(bridge_df).shape

(0, 0)

In [4]:
# Each split is written without its index and with missing values spelled
# 'NA', to maintain the same structure as the raw data.
bridge_exports = {
    '../data/processed/bridge_train.csv': bridge_train,
    '../data/processed/bridge_test.csv': bridge_test,
}
for bridge_path, bridge_split in bridge_exports.items():
    bridge_split.fillna('NA').to_csv(bridge_path, index=False)

## Claims Split
`dim_claim_id` is used to separate the `claims` test and train data.

In [5]:
claims_df = pd.read_csv('../data/raw/dim_claims.csv')

# Boolean masks marking which claims rows have their dim_claim_id in each bridge split.
claims_in_train = claims_df['dim_claim_id'].isin(bridge_train['dim_claim_id'])
claims_in_test = claims_df['dim_claim_id'].isin(bridge_test['dim_claim_id'])

claims_train = claims_df.loc[claims_in_train]
claims_test = claims_df.loc[claims_in_test]

The unique IDs present in the `bridge` test and train splits are identical to those in the `claims` test and train splits.

In [6]:
# compare the dim_claim_id values of the bridge and claims dataframes
# np.array_equal is used rather than np.all(a == b) because it checks the
# shapes first: a length mismatch is reported as False instead of erroring
# inside the elementwise comparison
print(np.array_equal(bridge_test['dim_claim_id'].values, claims_test['dim_claim_id'].values))
print(np.array_equal(bridge_train['dim_claim_id'].values, claims_train['dim_claim_id'].values))

True
True


In [7]:
# index=False and fillna('NA') are used to maintain the same structure as the raw data
claims_train.fillna('NA').to_csv('../data/processed/dim_claims_train.csv', index=False)
claims_test.fillna('NA').to_csv('../data/processed/dim_claims_test.csv', index=False)

## PA Split
`dim_pa_id` is used to separate the `pa` test and train data.

In [8]:
pa_df = pd.read_csv('../data/raw/dim_pa.csv')
# keep the pa rows whose dim_pa_id appears in the corresponding bridge split
pa_train = pa_df.loc[pa_df['dim_pa_id'].isin(bridge_train['dim_pa_id'])]
pa_test = pa_df.loc[pa_df['dim_pa_id'].isin(bridge_test['dim_pa_id'])]

# compare the dim_pa_id values of the bridge and pa dataframes
# dropna is required for the bridge dataframes because nan is
# provided if the claim does not have a pa
# np.array_equal is used rather than np.all(a == b) because it checks the
# shapes first: a length mismatch is reported as False instead of erroring
# inside the elementwise comparison
print(np.array_equal(bridge_test['dim_pa_id'].dropna().values, pa_test['dim_pa_id'].values))
print(np.array_equal(bridge_train['dim_pa_id'].dropna().values, pa_train['dim_pa_id'].values))

True
True


In [9]:
# Each split is written without its index and with missing values spelled
# 'NA', to maintain the same structure as the raw data.
pa_exports = {
    '../data/processed/dim_pa_train.csv': pa_train,
    '../data/processed/dim_pa_test.csv': pa_test,
}
for pa_path, pa_split in pa_exports.items():
    pa_split.fillna('NA').to_csv(pa_path, index=False)

## Date Split
`dim_date_id` is used to separate the `date` test and train data.

In [10]:
date_df = pd.read_csv('../data/raw/dim_date.csv')
# keep the date rows whose dim_date_id appears in the corresponding bridge split
date_train = date_df.loc[date_df['dim_date_id'].isin(bridge_train['dim_date_id'])]
date_test = date_df.loc[date_df['dim_date_id'].isin(bridge_test['dim_date_id'])]

# compare the dim_date_id values of the bridge and date dataframes
# unique is required for the bridge dataframes because multiple claims
# contain the same date id
# np.array_equal is used rather than np.all(a == b) because it checks the
# shapes first: a length mismatch is reported as False instead of erroring
# inside the elementwise comparison
print(np.array_equal(bridge_test['dim_date_id'].unique(), date_test['dim_date_id'].values))
print(np.array_equal(bridge_train['dim_date_id'].unique(), date_train['dim_date_id'].values))

True
True


In [11]:
# Each split is written without its index and with missing values spelled
# 'NA', to maintain the same structure as the raw data.
date_exports = {
    '../data/processed/dim_date_train.csv': date_train,
    '../data/processed/dim_date_test.csv': date_test,
}
for date_path, date_split in date_exports.items():
    date_split.fillna('NA').to_csv(date_path, index=False)