## Data Preparation Notebook

## Imports

In [2]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from botocore.client import Config
from ibm_watson_studio_pipelines import WSPipelines
import matplotlib.pyplot as plt

# import itc_utils.flight_service as itcfs

import heapq
import pickle
import os, types
import pandas as pd
import ibm_boto3

### Load the validated training and test data from IBM Filesystem (alternatively Db2) 

```

```

## Pipeline Params

In [3]:
TOKEN = os.getenv("USER_ACCESS_TOKEN")

In [4]:
train_data_path = os.getenv("train_data_path")
test_data_path = os.getenv("test_data_path")

In [5]:
pipeline_path = os.getenv("pipeline_path")
if pipeline_path is None:
    pipeline_path = "feature_encode.pickle"

In [None]:
train_data_path = "mlops-dir/train_gcr.csv"
test_data_path = "mlops-dir/test_gcr.csv"

###  Read and Write Utility

In [6]:
def save_data_in_filesystem(df,filename):
    """
    Save Data in Filesystem

    Passed filename should involve path

    """
    try:
        if filename[-3:] == "csv":
            df.to_csv(filename,index=False)
            print(f"File {filename} persisted successfully as csv")
        else:
            with open(filename, 'wb') as f:
                pickle.dump(df, f)
            print(f"File {filename} pickled successfully")
    except Exception as e:
        print(e)
        print(f"File serialization for {filename} failed")

def check_for_file_in_filesystem(path):
    """
    Check existence of path in filesystem
    """
    if os.path.exists(path):
        return True
    else:
        return False
    
def read_data_from_db2(data_request):
    read_client = itcfs.get_flight_client()
    DB2_DATA_data_request = {
        'connection_name': """DB2_DATA""",
        'interaction_properties': {
            'select_statement': 'SELECT * FROM "CUSTOMER_DATA"."GERMAN_CREDIT_RISK_TRAINING" FETCH FIRST 5000 ROWS ONLY'
        }
    }

    flightInfo = itcfs.get_flight_info(read_client, nb_data_request=data_request)

    df = itcfs.read_pandas_and_concat(read_client, flightInfo, timeout=240)
    return df
    
def load_data_from_filesystem(path):
    """
    Check existence of path in filesystem.
    If it does exist, loads csv via path
    If it does NOT exist, try to load data from Db2
    """
    body = check_for_file_in_filesystem(path)
    if body:
        suffix = path[-3:]
        # Check whether path ends on csv
        if suffix == "csv":
            gcf_df = pd.read_csv(path)
        else:
            with open(path) as f:
                gcf_df = pickle.load(f)

        return gcf_df
    else:
        print("\n")
        print(f"{path} file/path is probably not in project. Loading File from MLOps COS Bucket.")

        data_request = {
                'connection_name': """DB2_DATA""",
                'interaction_properties': {
                    'select_statement': 'SELECT * FROM "CUSTOMER_DATA"."GERMAN_CREDIT_RISK_TRAINING" FETCH FIRST 5000 ROWS ONLY'
                }
            }

        gcf_df = read_data_from_db2(data_request)
        return gcf_df

## Train_Data 

In [7]:
train_data = load_data_from_filesystem(train_data_path)
train_data.head()

TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
object_df = train_data.select_dtypes('O')
object_df.head()

In [None]:
object_cols = list(set(object_df.columns.tolist()) - set(['Risk']))
object_cols

In [None]:
numerical_columns = [col for col in train_data.columns.tolist() if col not in object_cols and col!='Risk']

## Load Test Data 

In [None]:
test_data = load_data_from_filesystem(test_data_path)
test_data.head()

## Split X and Y 

In [None]:
y_train = train_data['Risk']
X_train = train_data.drop("Risk",axis=1)


y_test = test_data['Risk']
X_test = test_data.drop("Risk",axis=1)

## Categorcial Feature Analysis 

In [None]:
def prepare_input_data(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc


def prepare_output_data(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc


def select_best_chi2_features(X_train, y_train, X_test,score_func=chi2):
    featureselector = SelectKBest(score_func=chi2, k='all')
    featureselector.fit(X_train, y_train)
    X_train_best_feat = featureselector.transform(X_train)
    X_test_best_feat= featureselector.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector


def select_best_mutualinf_features(X_train, y_train, X_test,k=5):
    featureselector = SelectKBest(score_func=mutual_info_classif, k=k)
    featureselector.fit(X_train, y_train)
    X_train_best_feat = fs.transform(X_train)
    X_test_best_feat= fs.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector

# def plot_scores():
#     plt.figure(figsize=(14, 12))
#     plt.subplot(221)

#     ax1 = sns.barplot([i for i in range(len(fs.scores_))], fs.scores_)
#     ax1.set_title("Chi2 Importance Scores", fontsize=20)
#     ax1.set_xlabel("Features",fontsize=15)
#     ax1.set_ylabel("Chi2 Scores",fontsize=15)
    
    
def get_top_k_catgeorical(fs,train_cat,k=10):
    fs_score_map = {}
    for i in range(len(fs.scores_)):
        #print(f"Feature {train_cat.columns.tolist()[i]} {fs.scores_[i]}")
        fs_score_map[train_cat.columns.tolist()[i]] = fs.scores_[i]
        
    k_keys_sorted_by_values = heapq.nlargest(k, fs_score_map, key=fs_score_map.get)
    
    return k_keys_sorted_by_values

## Encode and shape the Variables 

In [None]:
X_train_enc, X_test_enc = prepare_input_data(X_train[object_cols], X_test[object_cols])

y_train_enc, y_test_enc = prepare_output_data(y_train, y_test)

X_train_fs, X_test_fs, fs = select_best_chi2_features(X_train_enc, y_train_enc, X_test_enc)

# plot_scores()


## Top K Categorical Features  based on Chi2

In [None]:
top_k_cat = get_top_k_catgeorical(fs,X_train[object_cols])
top_k_cat

## Top K Categorical Features  based on Mutual Information Feature Selection

In [None]:
X_train_enc_mf, X_test_enc_mf = prepare_input_data(X_train[object_cols], X_test[object_cols])

y_train_enc_mf, y_test_enc_mf = prepare_output_data(y_train, y_test)

X_train_fs_mf, X_test_fs_mf, fs_mf = select_best_chi2_features(X_train_enc_mf, y_train_enc_mf, X_test_enc_mf)

# plot_scores()

In [None]:
top_k_cat_mf = get_top_k_catgeorical(fs_mf,X_train[object_cols])
top_k_cat_mf

In [None]:
union_features = list(set(top_k_cat+top_k_cat_mf))
if "Sex" not in union_features:
    union_features.append("Sex")
union_features

## Filter the Top K Categorical features and Merge to Original Train and Test Dataframes

In [None]:
X_train_object_filtered = X_train[union_features]
X_test_object_filtered = X_test[union_features]

X_train_final = pd.concat([X_train[numerical_columns],X_train_object_filtered],axis=1)

X_test_final = pd.concat([X_test[numerical_columns],X_test_object_filtered],axis=1)

## Use Column Transformer and Pipelines to encode the Input and Output Variables . Scale the Numerical columns using MinMaxScaler.

In [None]:
numerical_ix = X_train_final.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X_train_final.select_dtypes(include=['object', 'bool']).columns

In [None]:
encoding_steps = [('cat', OrdinalEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=encoding_steps)

In [None]:
pipeline = Pipeline(steps=[('prep',col_transform)])

In [None]:
train_final = pd.concat([X_train_final,y_train],axis=1)

In [None]:
test_final = pd.concat([X_test_final,y_test],axis=1)

In [None]:
#encoded_train = pd.DataFrame(pipeline.fit_transform(X_train_final),columns=X_train_final.columns)

In [None]:
#encoded_test = pd.DataFrame(pipeline.transform(X_test_final),columns=X_test_final.columns)

## Save the Prepared Data to IBM COS

In [None]:
save_data_in_filesystem(df=train_final, filename=train_data_path)

In [None]:
save_data_in_filesystem(df=test_final, filename=test_data_path)

In [None]:
save_data_in_filesystem(df=pipeline, filename=pipeline_path)

## Check if files have been copied 

In [None]:
data_prep_done = check_for_file_in_filesystem(train_data_path) and check_for_file_in_filesystem(test_data_path) and check_for_file_in_filesystem(pipeline_path)
data_prep_done

## Store Params in WS Pipelines

In [None]:
preparation_params = {}
preparation_params['data_prep_done'] = data_prep_done
preparation_params['pipeline_path'] = pipeline_path

pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(preparation_params)