![Alt text](images/banner.png)

## Data Preparation Notebook

### Initial Setup

Some initial setup specific to running this notebook as part of the pipeline.

In [1]:
import os
#This environment variable is automatically set in WS Pipelines and are needed to access various services.
TOKEN = os.getenv("USER_ACCESS_TOKEN")

In [2]:
if os.getenv("running_in_production_pipeline"):
    running_in_production_pipeline = True
    # If you want to run additional steps when deploying to production like reporting to external services, you can use this variable to trigger that
    # It can also be used to skip steps that are only needed in development like plotting
    print("notebook is running in a production pipeline!")
else:
    running_in_production_pipeline = False

## Imports

In [3]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from botocore.client import Config
from ibm_watson_studio_pipelines import WSPipelines
import matplotlib.pyplot as plt
import heapq
import os
import pandas as pd

# Loading Variables and Utils from common python file
import vars_and_utils as vars_and_utils


## Preparing the Train Data 

In [4]:
train_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.train_data_path)
train_data.head()

Unnamed: 0,LoanDuration,LoanAmount,InstallmentPercent,CurrentResidenceDuration,Age,ExistingCreditsCount,Dependents,OthersOnLoan,EmploymentDuration,Housing,OwnsProperty,Job,CheckingStatus,LoanPurpose,CreditHistory,ExistingSavings,Telephone,Sex,Risk
0,18,462,2,2,37,2,1,none,1_to_4,own,savings_insurance,skilled,less_0,car_new,credits_paid_to_date,less_100,none,female,0
1,15,250,2,3,28,2,1,none,1_to_4,own,real_estate,skilled,less_0,furniture,prior_payments_delayed,less_100,yes,male,0
2,28,6235,3,3,57,2,1,none,greater_7,own,unknown,skilled,no_checking,education,prior_payments_delayed,500_to_1000,none,male,1
3,9,1032,3,4,41,1,1,none,4_to_7,own,savings_insurance,management_self-employed,no_checking,car_new,prior_payments_delayed,100_to_500,none,male,0
4,11,4553,3,3,22,1,1,none,less_1,own,savings_insurance,management_self-employed,0_to_200,car_new,credits_paid_to_date,less_100,none,female,0


In [5]:
object_df = train_data.select_dtypes('O')
object_df.head()

Unnamed: 0,OthersOnLoan,EmploymentDuration,Housing,OwnsProperty,Job,CheckingStatus,LoanPurpose,CreditHistory,ExistingSavings,Telephone,Sex
0,none,1_to_4,own,savings_insurance,skilled,less_0,car_new,credits_paid_to_date,less_100,none,female
1,none,1_to_4,own,real_estate,skilled,less_0,furniture,prior_payments_delayed,less_100,yes,male
2,none,greater_7,own,unknown,skilled,no_checking,education,prior_payments_delayed,500_to_1000,none,male
3,none,4_to_7,own,savings_insurance,management_self-employed,no_checking,car_new,prior_payments_delayed,100_to_500,none,male
4,none,less_1,own,savings_insurance,management_self-employed,0_to_200,car_new,credits_paid_to_date,less_100,none,female


In [6]:
object_cols = list(set(object_df.columns.tolist()) - set(['Risk']))
object_cols

['CheckingStatus',
 'ExistingSavings',
 'Telephone',
 'OthersOnLoan',
 'OwnsProperty',
 'Housing',
 'Sex',
 'Job',
 'LoanPurpose',
 'EmploymentDuration',
 'CreditHistory']

In [7]:
numerical_columns = [col for col in train_data.columns.tolist() if col not in object_cols and col!='Risk']

## Preparing the Test Data 

In [11]:
test_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.test_data_path)
test_data.head()

Unnamed: 0,LoanDuration,LoanAmount,InstallmentPercent,CurrentResidenceDuration,Age,ExistingCreditsCount,Dependents,OthersOnLoan,EmploymentDuration,Housing,OwnsProperty,Job,CheckingStatus,LoanPurpose,CreditHistory,ExistingSavings,Telephone,Sex,Risk
0,31,1889,3,3,32,1,1,none,less_1,own,savings_insurance,skilled,0_to_200,other,credits_paid_to_date,100_to_500,none,female,0
1,28,3693,3,2,32,1,1,none,greater_7,own,savings_insurance,skilled,0_to_200,retraining,credits_paid_to_date,less_100,none,male,0
2,32,9604,6,5,57,2,2,co-applicant,greater_7,free,unknown,skilled,no_checking,vacation,outstanding_credit,500_to_1000,yes,male,1
3,16,3109,3,1,36,2,1,none,4_to_7,own,car_other,skilled,less_0,vacation,credits_paid_to_date,less_100,none,female,0
4,15,250,3,2,24,2,2,none,4_to_7,own,savings_insurance,skilled,0_to_200,furniture,prior_payments_delayed,500_to_1000,yes,male,0


## Split the data sets  

In [12]:
y_train = train_data['Risk']
X_train = train_data.drop("Risk",axis=1)


y_test = test_data['Risk']
X_test = test_data.drop("Risk",axis=1)

## Categorcial Feature Analysis 

In [13]:
def prepare_input_data(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc


def prepare_output_data(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc


def select_best_chi2_features(X_train, y_train, X_test,score_func=chi2):
    featureselector = SelectKBest(score_func=chi2, k='all')
    featureselector.fit(X_train, y_train)
    X_train_best_feat = featureselector.transform(X_train)
    X_test_best_feat= featureselector.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector


def select_best_mutualinf_features(X_train, y_train, X_test,k=5):
    featureselector = SelectKBest(score_func=mutual_info_classif, k=k)
    featureselector.fit(X_train, y_train)
    X_train_best_feat = fs.transform(X_train)
    X_test_best_feat= fs.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector
    
    
def get_top_k_catgeorical(fs,train_cat,k=10):
    fs_score_map = {}
    for i in range(len(fs.scores_)):
        #print(f"Feature {train_cat.columns.tolist()[i]} {fs.scores_[i]}")
        fs_score_map[train_cat.columns.tolist()[i]] = fs.scores_[i]
        
    k_keys_sorted_by_values = heapq.nlargest(k, fs_score_map, key=fs_score_map.get)
    
    return k_keys_sorted_by_values

## Encode and shape the Variables 

In [14]:
X_train_enc, X_test_enc = prepare_input_data(X_train[object_cols], X_test[object_cols])

y_train_enc, y_test_enc = prepare_output_data(y_train, y_test)

X_train_fs, X_test_fs, fs = select_best_chi2_features(X_train_enc, y_train_enc, X_test_enc)



## Top K Categorical Features  based on Chi2

In [15]:
top_k_cat = get_top_k_catgeorical(fs,X_train[object_cols])
top_k_cat

['CreditHistory',
 'Telephone',
 'CheckingStatus',
 'Housing',
 'OthersOnLoan',
 'OwnsProperty',
 'LoanPurpose',
 'ExistingSavings',
 'EmploymentDuration',
 'Job']

## Top K Categorical Features  based on Mutual Information Feature Selection

In [16]:
X_train_enc_mf, X_test_enc_mf = prepare_input_data(X_train[object_cols], X_test[object_cols])

y_train_enc_mf, y_test_enc_mf = prepare_output_data(y_train, y_test)

X_train_fs_mf, X_test_fs_mf, fs_mf = select_best_chi2_features(X_train_enc_mf, y_train_enc_mf, X_test_enc_mf)

In [17]:
top_k_cat_mf = get_top_k_catgeorical(fs_mf,X_train[object_cols])
top_k_cat_mf

['CreditHistory',
 'Telephone',
 'CheckingStatus',
 'Housing',
 'OthersOnLoan',
 'OwnsProperty',
 'LoanPurpose',
 'ExistingSavings',
 'EmploymentDuration',
 'Job']

In [18]:
union_features = list(set(top_k_cat+top_k_cat_mf))
if "Sex" not in union_features:
    union_features.append("Sex")
union_features

['CheckingStatus',
 'ExistingSavings',
 'Telephone',
 'OthersOnLoan',
 'OwnsProperty',
 'Housing',
 'Job',
 'LoanPurpose',
 'EmploymentDuration',
 'CreditHistory',
 'Sex']

## Filter the Top K Categorical features and Merge to Original Train and Test Dataframes

In [19]:
X_train_object_filtered = X_train[union_features]
X_test_object_filtered = X_test[union_features]

X_train_final = pd.concat([X_train[numerical_columns],X_train_object_filtered],axis=1)

X_test_final = pd.concat([X_test[numerical_columns],X_test_object_filtered],axis=1)

## Use Column Transformer and Pipelines to encode the Input and Output Variables . Scale the Numerical columns using MinMaxScaler.

In [20]:
numerical_ix = X_train_final.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X_train_final.select_dtypes(include=['object', 'bool']).columns

In [21]:
encoding_steps = [('cat', OrdinalEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=encoding_steps)

In [22]:
pipeline = Pipeline(steps=[('prep',col_transform)])

In [23]:
train_final = pd.concat([X_train_final,y_train],axis=1)

In [24]:
test_final = pd.concat([X_test_final,y_test],axis=1)

## Save the Prepared Data to the project filesystem

In [25]:
vars_and_utils.save_data_in_filesystem(df=train_final, filename=vars_and_utils.train_data_path)

File data/train_gcr.csv persisted successfully as csv


In [26]:
vars_and_utils.save_data_in_filesystem(df=test_final, filename=vars_and_utils.test_data_path)

File data/test_gcr.csv persisted successfully as csv


In [27]:
vars_and_utils.save_data_in_filesystem(df=pipeline, filename=vars_and_utils.pipeline_path)

File data/feature_encode.pkl pickled successfully


## Custom succes check: Check if files have been succesfully created 

In [28]:
data_prep_done = os.path.exists(vars_and_utils.train_data_path) and os.path.exists(vars_and_utils.test_data_path) and os.path.exists(vars_and_utils.pipeline_path)
data_prep_done

True

## Register the output variables for the next pipeine stage
every notebook outputs a "was_successful" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.
If needed additional variables can be created here but they also need to registered as output variables in the Watson Pipelines UI.

In [29]:
preparation_params = {}
preparation_params['was_succesfull'] = data_prep_done

pipelines_client = WSPipelines.from_token(TOKEN)
pipelines_client.store_results(preparation_params)

Running outside of Watson Studio Pipeline - storing results in the local filesystem for testing purposes...

  output paths:
    - "was_succesfull": .ibm_watson_studio_pipelines/results/was_succesfull




<ibm_cloud_sdk_core.detailed_response.DetailedResponse at 0x7fd79944c070>