## Data Preparation Notebook

## Imports

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from botocore.client import Config
from ibm_watson_studio_pipelines import WSPipelines
import matplotlib.pyplot as plt
import seaborn as sns
import heapq
import pickle
import os, types
import pandas as pd
import ibm_boto3

### Load the validated training and test data from IBM COS 

```
## PROJECT COS 
AUTH_ENDPOINT = "https://iam.cloud.ibm.com/oidc/token"
ENDPOINT_URL = "https://s3.private.us.cloud-object-storage.appdomain.cloud"
API_KEY_COS = "xxx"
BUCKET_PROJECT_COS = "mlops-donotdelete-pr-qxxcecxi1dtw94"


##MLOPS COS
ENDPOINT_URL_MLOPS = "https://s3.jp-tok.cloud-object-storage.appdomain.cloud"
API_KEY_MLOPS = "xxx"
CRN_MLOPS = "xxx"
BUCKET_MLOPS  = "mlops-asset"
```

In [None]:
# The code was removed by Watson Studio for sharing.

## Pipeline Params

In [None]:
# IBM Cloud API key read from the "cloud_api_key" environment variable
# (None when the variable is unset) — presumably injected by the pipeline run.
CLOUD_API_KEY = os.environ.get("cloud_api_key")

In [None]:
# Connection settings for the project bucket; no CRN is supplied for it.
project_cos_credentials = dict(
    API_KEY=API_KEY_COS,
    CRN=None,
    AUTH_ENDPOINT=AUTH_ENDPOINT,
    ENDPOINT_URL=ENDPOINT_URL,
    BUCKET=BUCKET_PROJECT_COS,
)

# Connection settings for the MLOps bucket, passed to save_data_in_cos below.
mlops_cos_credentials = dict(
    API_KEY=API_KEY_MLOPS,
    CRN=CRN_MLOPS,
    AUTH_ENDPOINT=AUTH_ENDPOINT,
    ENDPOINT_URL=ENDPOINT_URL_MLOPS,
    BUCKET=BUCKET_MLOPS,
)


###  Read and Write Utility

In [None]:
def read_data_from_mlops_cos(key):
    """
    Fetch an object from the MLOps COS bucket and parse it as CSV.

    key : object key inside BUCKET_MLOPS.

    Returns the pandas DataFrame produced by pd.read_csv on the object body.
    Connection settings come from the notebook-level MLOps constants.
    """
    cos_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    stream = cos_client.get_object(Bucket=BUCKET_MLOPS, Key=key)['Body']

    # pandas only accepts the body as file-like if it exposes __iter__, so a
    # placeholder is attached when the returned stream does not have one.
    if not hasattr(stream, "__iter__"):
        stream.__iter__ = types.MethodType(lambda self: 0, stream)

    return pd.read_csv(stream)

def save_data_in_cos(df,filename,key,credentials,pipe=False):
    """
    Save data in IBM Cloud Object Storage.

    The payload is first written to a local file and that file is then
    uploaded to the bucket named in ``credentials``.

    df          : the payload. Written as CSV (index omitted) when ``pipe`` is
                  False; pickled as-is when ``pipe`` is True (e.g. a pipeline).
    filename    : local path the payload is written to before the upload.
    key         : object key to store the file under in the bucket.
    credentials : dict providing 'API_KEY', 'AUTH_ENDPOINT', 'ENDPOINT_URL'
                  and 'BUCKET'.
    pipe        : True to pickle ``df`` instead of writing CSV.

    Returns None. Any failure is reported on stdout instead of being raised,
    so callers must verify the upload separately.
    """
    try:
        if pipe:
            # Pickle the object that was passed in rather than relying on a
            # notebook-level global with a particular name.
            with open(filename, 'wb') as f:
                pickle.dump(df, f)
        else:
            df.to_csv(filename, index=False)

        mlops_res = ibm_boto3.resource(
            service_name='s3',
            ibm_api_key_id=credentials['API_KEY'],
            ibm_auth_endpoint=credentials['AUTH_ENDPOINT'],
            config=Config(signature_version='oauth'),
            endpoint_url=credentials['ENDPOINT_URL'])

        mlops_res.Bucket(credentials['BUCKET']).upload_file(filename, key)
        print(f"File {filename} uploaded successfully")
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(e)
        print(f"File upload for {filename} failed")

        
def check_if_file_exists(filename):
    """
    Report whether an object with exactly this key exists in the MLOps bucket.

    filename : object key to look for in BUCKET_MLOPS.

    Returns True when a listed key equals ``filename``, otherwise False
    (including when the listing comes back without any contents).
    """
    mlops_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    # Filtering by prefix keeps the exact key on the first page of results
    # even in a large bucket; 'Contents' is absent when nothing matches.
    response = mlops_client.list_objects(Bucket=BUCKET_MLOPS, Prefix=filename)
    return any(entry['Key'] == filename for entry in response.get('Contents', []))


## Train_Data 

In [None]:
# Load 'train_gcr.csv' from the MLOps bucket and preview the first rows.
train_data = read_data_from_mlops_cos('train_gcr.csv')
train_data.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Summary statistics of the training frame.
train_data.describe()

In [None]:
# Keep only the object-dtype (string-valued) columns of the training frame.
object_df = train_data.select_dtypes(include='object')
object_df.head()

In [None]:
# Categorical feature names: every object-dtype column except the 'Risk' target.
# Built in frame order so the column order is identical on every run
# (a set difference would yield an arbitrary, session-dependent order).
object_cols = [col for col in object_df.columns if col != 'Risk']
object_cols

In [None]:
# Every remaining column that is neither the 'Risk' target nor categorical.
numerical_columns = [
    name
    for name in train_data.columns.tolist()
    if name != 'Risk' and name not in object_cols
]

## Load Test Data 

In [None]:
# Load 'test_gcr.csv' from the MLOps bucket and preview the first rows.
test_data = read_data_from_mlops_cos('test_gcr.csv')
test_data.head()

## Split X and Y 

In [None]:
# Separate the 'Risk' target from the feature columns in both frames.
X_train, y_train = train_data.drop(columns="Risk"), train_data["Risk"]

X_test, y_test = test_data.drop(columns="Risk"), test_data["Risk"]



## Categorical Feature Analysis

In [None]:
def prepare_input_data(X_train, X_test):
    """
    Ordinal-encode two feature matrices.

    The encoder is fitted on X_train only and then applied to both inputs.
    Returns (encoded X_train, encoded X_test).
    """
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)


def prepare_output_data(y_train, y_test):
    """
    Label-encode two target vectors.

    The encoder is fitted on y_train only and then applied to both inputs.
    Returns (encoded y_train, encoded y_test).
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)


def select_best_chi2_features(X_train, y_train, X_test,score_func=chi2):
    """
    Score every feature with SelectKBest and keep all of them (k='all').

    X_train, y_train : encoded training features and target used for fitting.
    X_test           : encoded test features, transformed with the same selector.
    score_func       : scoring function handed to SelectKBest (chi2 by default).

    Returns (transformed X_train, transformed X_test, fitted selector); the
    per-feature scores are available on the selector's ``scores_`` attribute.
    """
    # Honour the caller's scoring function instead of always using chi2.
    featureselector = SelectKBest(score_func=score_func, k='all')
    featureselector.fit(X_train, y_train)
    X_train_best_feat = featureselector.transform(X_train)
    X_test_best_feat = featureselector.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector


def select_best_mutualinf_features(X_train, y_train, X_test,k=5):
    """
    Select the k best features by mutual information with the target.

    X_train, y_train : encoded training features and target used for fitting.
    X_test           : encoded test features, transformed with the same selector.
    k                : number of features SelectKBest keeps.

    Returns (transformed X_train, transformed X_test, fitted selector).
    """
    featureselector = SelectKBest(score_func=mutual_info_classif, k=k)
    featureselector.fit(X_train, y_train)
    # Transform with the selector fitted here, not with a notebook-level one.
    X_train_best_feat = featureselector.transform(X_train)
    X_test_best_feat = featureselector.transform(X_test)
    return X_train_best_feat, X_test_best_feat, featureselector

def plot_scores(selector=None, score_name="Chi2"):
    """
    Draw a bar chart of per-feature importance scores.

    selector   : fitted selector exposing ``scores_``. When omitted, the
                 notebook-level ``fs`` is used, which keeps bare
                 ``plot_scores()`` calls working.
    score_name : label used in the title and on the y axis.

    Returns None; the chart is drawn on a new matplotlib figure.
    """
    if selector is None:
        selector = fs  # notebook-level selector

    scores = selector.scores_

    plt.figure(figsize=(14, 12))
    plt.subplot(221)

    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    ax1 = sns.barplot(x=list(range(len(scores))), y=scores)
    ax1.set_title(f"{score_name} Importance Scores", fontsize=20)
    ax1.set_xlabel("Features",fontsize=15)
    ax1.set_ylabel(f"{score_name} Scores",fontsize=15)
    
    
def get_top_k_catgeorical(fs,train_cat,k=10):
    """
    Return the names of the k highest-scoring columns.

    fs        : fitted selector; ``fs.scores_[i]`` is the score of column i.
    train_cat : frame whose column order matches the selector's feature order.
    k         : how many names to return.

    Returns a list of column names ordered from highest to lowest score.
    """
    column_names = train_cat.columns.tolist()
    score_by_column = {
        column_names[position]: score
        for position, score in enumerate(fs.scores_)
    }
    return heapq.nlargest(k, score_by_column, key=score_by_column.get)
    
    


## Encode and shape the Variables 

In [None]:
# Ordinal-encode the categorical features (encoder fitted on the training frame).
X_train_enc, X_test_enc = prepare_input_data(X_train[object_cols], X_test[object_cols])

# Label-encode the target.
y_train_enc, y_test_enc = prepare_output_data(y_train, y_test)

# Score every categorical feature with chi2; `fs` is the fitted selector.
X_train_fs, X_test_fs, fs = select_best_chi2_features(X_train_enc, y_train_enc, X_test_enc)

# plot_scores() reads the notebook-level `fs` assigned just above.
plot_scores()


## Top K Categorical Features  based on Chi2

In [None]:
# Names of the highest-scoring categorical columns according to `fs`
# (get_top_k_catgeorical returns up to k=10 by default).
top_k_cat = get_top_k_catgeorical(fs,X_train[object_cols])
top_k_cat

## Top K Categorical Features  based on Mutual Information Feature Selection

In [None]:
# Encode the categorical features and the target again for this section.
X_train_enc_mf, X_test_enc_mf = prepare_input_data(X_train[object_cols], X_test[object_cols])

y_train_enc_mf, y_test_enc_mf = prepare_output_data(y_train, y_test)

# Score with mutual information (not chi2) so this section contributes an
# independent ranking; `fs_mf` is the fitted mutual-information selector.
X_train_fs_mf, X_test_fs_mf, fs_mf = select_best_mutualinf_features(X_train_enc_mf, y_train_enc_mf, X_test_enc_mf)

# Plot the scores of `fs_mf` directly: a bare plot_scores() call would draw
# the chi2 selector `fs` instead.
fig_mf, ax_mf = plt.subplots(figsize=(7, 6))
sns.barplot(x=list(range(len(fs_mf.scores_))), y=fs_mf.scores_, ax=ax_mf)
ax_mf.set_title("Mutual Information Importance Scores", fontsize=20)
ax_mf.set_xlabel("Features", fontsize=15)
ax_mf.set_ylabel("Mutual Information Scores", fontsize=15);

In [None]:
# Names of the highest-scoring categorical columns according to `fs_mf`.
top_k_cat_mf = get_top_k_catgeorical(fs_mf,X_train[object_cols])
top_k_cat_mf

In [None]:
# Union of both rankings, de-duplicated while keeping first-seen order so the
# resulting column order is the same on every run.
union_features = list(dict.fromkeys(top_k_cat + top_k_cat_mf))
# 'Sex' is always retained, whatever its score.
if "Sex" not in union_features:
    union_features.append("Sex")
union_features

## Filter the Top K Categorical features and Merge to Original Train and Test Dataframes

In [None]:
# Keep only the selected categorical columns in each frame.
X_train_object_filtered, X_test_object_filtered = X_train[union_features], X_test[union_features]

# Final feature frames: numerical columns first, then the selected categoricals.
X_train_final = pd.concat((X_train[numerical_columns], X_train_object_filtered), axis="columns")

X_test_final = pd.concat((X_test[numerical_columns], X_test_object_filtered), axis="columns")

## Use Column Transformer and Pipelines to encode the Input and Output Variables. Scale the Numerical columns using MinMaxScaler.

In [None]:
# Column labels grouped by dtype: int64/float64 are treated as numerical,
# object/bool as categorical. Columns of any other dtype end up in neither group.
numerical_ix = X_train_final.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X_train_final.select_dtypes(include=['object', 'bool']).columns

In [None]:
# Ordinal-encode the categorical columns ('cat') and min-max scale the
# numerical ones ('num'); both steps are combined in one ColumnTransformer.
encoding_steps = [('cat', OrdinalEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=encoding_steps)

In [None]:
# Single-step pipeline wrapping the column transformer.
# NOTE(review): no active cell in this notebook fits it (the fit_transform
# cells are commented out), so it appears to be saved unfitted — confirm the
# consumer fits it before transforming.
pipeline = Pipeline(steps=[('prep',col_transform)])

In [None]:
# Re-attach the target column to the selected training features.
train_final = pd.concat([X_train_final,y_train],axis=1)

In [None]:
# Re-attach the target column to the selected test features.
test_final = pd.concat([X_test_final,y_test],axis=1)

In [None]:
#encoded_train = pd.DataFrame(pipeline.fit_transform(X_train_final),columns=X_train_final.columns)

In [None]:
#encoded_test = pd.DataFrame(pipeline.transform(X_test_final),columns=X_test_final.columns)

## Save the Prepared Data to IBM COS

In [None]:
# Write the prepared training frame to CSV and upload it to the MLOps bucket.
save_data_in_cos(train_final,"train_tfr.csv","train_tfr.csv",mlops_cos_credentials)

In [None]:
# Write the prepared test frame to CSV and upload it to the MLOps bucket.
save_data_in_cos(test_final,"test_tfr.csv","test_tfr.csv",mlops_cos_credentials)

In [None]:
# Upload the encoding pipeline; pipe=True selects the pickle branch instead of CSV.
save_data_in_cos(pipeline,"feature_encode.pickle","feature_encode.pickle",mlops_cos_credentials,pipe=True)

## Check if files have been copied 

In [None]:
# Preparation counts as done only when all three artifacts are found in the
# MLOps bucket; checking stops at the first missing one.
expected_artifacts = ("train_tfr.csv", "test_tfr.csv", "feature_encode.pickle")
data_prep_done = all(check_if_file_exists(name) for name in expected_artifacts)
data_prep_done

## Store Params in WS Pipelines

In [None]:
# Publish the completion flag as a result of this Watson Studio pipeline step.
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)

preparation_params = {'data_prep_done': data_prep_done}
pipelines_client.store_results(preparation_params)