In [5]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from utils import read_yaml

# Location of the YAML settings file that the `__main__` guard hands to read_yaml.
# The path is relative, so it is resolved against the current working directory.
LOAD_SPLIT_CONFIG_PATH = "../config/load_split_config.yaml"

def get_stratify_col(y, stratify_col):
    """Return the values to stratify a split on, or None to disable it.

    Parameters
    ----------
    y : subscriptable
        Container that is indexed with ``stratify_col``.
    stratify_col : hashable or None
        Key looked up in ``y``. ``None`` means "do not stratify".

    Returns
    -------
    ``None`` when ``stratify_col`` is ``None``; otherwise ``y[stratify_col]``.
    """
    # Guard clause: no key requested, so there is nothing to look up.
    if stratify_col is None:
        return None

    return y[stratify_col]

def split_in_out_put(df,
                    target_column,
                    set_index = None):
    """Separate a raw frame into its target (output) and its features (input).

    The column ``'default.payment.next.month'`` is first renamed to
    ``'TARGET'`` (on a copy; the caller's frame is not modified), so
    ``target_column`` must refer to the post-rename column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing both the features and the target.
    target_column : str
        Name of the target column after the rename step.
    set_index : str or None, default None
        Column to move into the index before splitting, so that an
        identifier column does not end up among the features. ``None``
        leaves the frame's index and columns as they are.

    Returns
    -------
    (output_df, input_df)
        ``output_df`` is ``df[target_column]`` with a fresh positional index;
        ``input_df`` is the frame without the target column (indexed by
        ``set_index`` when one was given).

    Raises
    ------
    KeyError
        If ``target_column`` or ``set_index`` is not a column of the frame.
    """
    #rename the target name
    df = df.rename(columns={'default.payment.next.month': 'TARGET'})

    # Previously `set_index` was accepted but never used, so the identifier
    # column stayed in the feature matrix. Moving it into the index removes it
    # from the columns while keeping it available for look-ups.
    if set_index is not None:
        df = df.set_index(set_index)

    #create input(all of x) and output(all of y)
    output_df = df[target_column].reset_index(drop=True)
    input_df = df.drop(columns=[target_column])

    return output_df, input_df

def run_split_data(x, y,
                    stratify_col=None,
                    TEST_SIZE=0.2,
                    random_state=42
                    ):
    """Split features and target into train, validation and test partitions.

    The split is done in two stages: a hold-out of ``TEST_SIZE * 2`` is carved
    off first, then that hold-out is cut in half. Validation and test therefore
    each receive a ``TEST_SIZE`` share and train keeps the remainder.

    Parameters
    ----------
    x : array-like
        Feature data.
    y : array-like
        Target data, row-aligned with ``x``.
    stratify_col : hashable or None, default None
        Key looked up in ``y`` (via ``get_stratify_col``) to obtain the labels
        to stratify on; ``None`` disables stratification.
        NOTE(review): this requires ``y[stratify_col]`` to resolve - confirm
        that the ``y`` passed by callers supports that lookup.
    TEST_SIZE : float, default 0.2
        Share of the data assigned to the test partition (and, equally, to the
        validation partition).
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls so the partitions
        are reproducible.

    Returns
    -------
    (x_train, y_train, x_valid, y_valid, x_test, y_test)
    """
    # Stage 1: train vs. hold-out (hold-out = validation + test).
    strat_full = get_stratify_col(y, stratify_col)
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y,
        stratify=strat_full,
        test_size=TEST_SIZE * 2,
        random_state=random_state)

    # Stage 2: halve the hold-out. The stratification labels are recomputed
    # from the hold-out target so they line up with the rows being split.
    strat_holdout = get_stratify_col(y_holdout, stratify_col)
    x_valid, x_test, y_valid, y_test = train_test_split(
        x_holdout, y_holdout,
        stratify=strat_holdout,
        test_size=0.5,
        random_state=random_state)

    return x_train, y_train, x_valid, y_valid, x_test, y_test

def main_read(params):
    """Load the raw CSV, split it, persist every partition and return them.

    Parameters
    ----------
    params : mapping
        Settings read by key:
        ``'file_loc'`` (CSV to load), ``'target'`` (target column name),
        ``'stratify'`` and ``'test_size'`` (forwarded to ``run_split_data``),
        ``'out_path'`` (prefix the pickle file names are appended to by plain
        concatenation, so it must already end with a path separator for the
        files to land inside a directory).

    Returns
    -------
    (x_train, y_train, x_valid, y_valid, x_test, y_test)
    """
    raw_df = pd.read_csv(params['file_loc'])
    output_df, input_df = split_in_out_put(raw_df, target_column=params['target'], set_index='ID')

    x_train, y_train, x_valid, y_valid, x_test, y_test = run_split_data(
        input_df,
        output_df,
        params['stratify'],
        params['test_size'],
    )

    # File name -> partition, listed in the order the files are written.
    artifacts = {
        "x_train.pkl": x_train,
        "y_train.pkl": y_train,
        "x_valid.pkl": x_valid,
        "y_valid.pkl": y_valid,
        "x_test.pkl": x_test,
        "y_test.pkl": y_test,
    }
    for file_name, partition in artifacts.items():
        joblib.dump(partition, params["out_path"]+file_name)

    return x_train, y_train, x_valid, y_valid, x_test, y_test

# Entry point: read the settings at LOAD_SPLIT_CONFIG_PATH and run the full
# load / split / persist pipeline. read_yaml comes from the project-local
# `utils` module - presumably it returns the parsed mapping that main_read
# indexes by key (confirm). The six partitions are bound at module level.
if __name__ == "__main__":
    params = read_yaml(LOAD_SPLIT_CONFIG_PATH)
    x_train, y_train, x_valid, y_valid, x_test, y_test = main_read(params)

ModuleNotFoundError: No module named 'utils'

In [5]:
# Ad-hoc run straight from the raw CSV, using run_split_data's defaults
# (no stratification, TEST_SIZE=0.2).
# NOTE(review): `read_data` is not defined in any cell visible here - it looks
# like an earlier name for the load/split step; confirm it still exists, since
# this cell would raise NameError on a fresh kernel otherwise.
PATH = 'data/UCI_Credit_Card.csv'
output_df, input_df = read_data(PATH, target_column='TARGET', set_index='ID')
x_train, y_train, x_valid, y_valid, x_test, y_test = run_split_data(input_df, output_df)

In [4]:
# Show the shape of every partition, one per line, in the order
# train -> valid -> test with the features listed before the target.
print(
    *(part.shape for part in (x_train, y_train, x_valid, y_valid, x_test, y_test)),
    sep="\n",
)

(18000, 23)
(18000,)
(6000, 23)
(6000,)
(6000, 23)
(6000,)


In [22]:
from utils import read_yaml

ModuleNotFoundError: No module named 'utils'

In [8]:
# IPython help lookup for the name `utils`; the recorded output shows it is not
# defined in the kernel. Presumably left over from debugging the failed import.
utils?

Object `utils` not found.
