## Connection and Data Validation Notebook

## Table of Contents
* [Check for Training Data in Project Space](#DataCheck)
    * [Load the Training Data from Db2 if it does not exist in the project space](#section_1_1)
    
    * [Check the connection and data loading](#section_1_2)
  
* [Data Validation](#chapter2)
    * [Split the Data](#Optional)

    * [Generate Training Stats on both Splits](#section_2_2)
    * [Infer Schema on both Splits](#section_2_3) 
    * [Check for anomalies](#section_2_4) 
    * [Return a boolean to validate the tests](#section_3_1) 


## Imports

In [None]:
!pip install tensorflow-data-validation

In [None]:
# Print the version of the `python3` executable found on the shell PATH.
# NOTE(review): this may not be the interpreter the kernel itself runs on - confirm if it matters.
!python3 --version

In [None]:
# this is a workaround as long as there are issues with custom python enviroments in multiple projects
!pip install ibm_watson_studio_pipelines

In [32]:
from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import tensorflow_data_validation as tfdv
import numpy as np
import pandas as pd
from ibm_watson_studio_pipelines import WSPipelines
import logging
import os, types
import warnings
import requests

# Suppress every Python warning for the rest of the session (presumably to keep cell output readable).
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings("ignore")

### Load the Credentials


The `USER_ACCESS_TOKEN` environment variable is set by WS Pipelines.

In [33]:
# Access token read from the environment; None when the variable is not set.
TOKEN = os.environ.get("USER_ACCESS_TOKEN")

In [34]:
# Local path of the training CSV: read by load_data() and used as the download target when missing.
training_file_path = "german_credit_data_biased_training.csv"

## Check for Training Data in Project Space

In [39]:
def download_data_to_filesystem(training_file_path, url=None, timeout=60):
    """
    Download the training CSV over HTTP and write it to the local filesystem.

    Args:
        training_file_path: Destination path the downloaded bytes are written to.
        url: Source URL. Defaults to the German credit risk training CSV in the
            IBM ``monitor-wml-model-with-watson-openscale`` GitHub repository.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the request could block indefinitely.

    Returns:
        None. On a non-200 response a message is printed and no file is written.
    """
    if url is None:
        url = "https://raw.githubusercontent.com/IBM/monitor-wml-model-with-watson-openscale/master/data/german_credit_data_biased_training.csv"
    response = requests.get(url, timeout=timeout)

    # Only persist the payload when the request was successful
    if response.status_code == 200:
        with open(training_file_path, "wb") as file:
            file.write(response.content)
        print("Downloaded and saved as "+training_file_path)
    else:
        print("Failed to download the CSV file. Status code:", response.status_code)
    

def load_data_from_db2():
    '''
    Placeholder for loading the training data from Db2.

    Currently not implemented due to issues with the flight service, so this
    function always raises.

    Raises:
        NotImplementedError: Always, to signal that this data source is not
            available. It is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    '''
    # Intended implementation, kept for reference until the flight service works:
    #
    # data_request = {
    #         'connection_name': """DB2_DATA""",
    #         'interaction_properties': {
    #             'select_statement': 'SELECT * FROM "CUSTOMER_DATA"."GERMAN_CREDIT_RISK_TRAINING" FETCH FIRST 5000 ROWS ONLY'
    #         }
    #     }
    #
    # read_client = itcfs.get_flight_client()
    # flightInfo = itcfs.get_flight_info(read_client, nb_data_request=data_request)
    # df = itcfs.read_pandas_and_concat(read_client, flightInfo, timeout=240)

    # Signal explicitly that this functionality is not available
    raise NotImplementedError("Data not available")


In [40]:
def load_data():
    """
    Load the training data as a pandas DataFrame.

    Tries Db2 first. If that fails for any reason, falls back to the CSV at
    ``training_file_path``, downloading it first when it is not already on
    the local filesystem.

    Returns:
        pd.DataFrame: The training data.

    Raises:
        FileNotFoundError: Raised by ``pd.read_csv`` if the CSV is still
            missing after the download attempt.
    """
    try:
        return load_data_from_db2()
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        print("Error while loading data from db2. downloading csv file to filesystem instead")

    if os.path.isfile(training_file_path):
        print("File already exists")
    else:
        download_data_to_filesystem(training_file_path)
    return pd.read_csv(training_file_path)

## Load the Training Data from Db2 if the file doesn't exist

In [42]:

gcr_df = load_data()

# Encode the 'Risk' label as 1 / 0 for ease of use with OpenScale
risk_encoding = {'Risk': 1, 'No Risk': 0}
gcr_df['Risk'] = gcr_df['Risk'].map(risk_encoding)
gcr_df.head()

Error while loading data from db2. downloading csv file to filesystem instead
File already exists


Unnamed: 0,CheckingStatus,LoanDuration,CreditHistory,LoanPurpose,LoanAmount,ExistingSavings,EmploymentDuration,InstallmentPercent,Sex,OthersOnLoan,...,OwnsProperty,Age,InstallmentPlans,Housing,ExistingCreditsCount,Job,Dependents,Telephone,ForeignWorker,Risk
0,0_to_200,31,credits_paid_to_date,other,1889,100_to_500,less_1,3,female,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,0
1,less_0,18,credits_paid_to_date,car_new,462,less_100,1_to_4,2,female,none,...,savings_insurance,37,stores,own,2,skilled,1,none,yes,0
2,less_0,15,prior_payments_delayed,furniture,250,less_100,1_to_4,2,male,none,...,real_estate,28,none,own,2,skilled,1,yes,no,0
3,0_to_200,28,credits_paid_to_date,retraining,3693,less_100,greater_7,3,male,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,0
4,no_checking,28,prior_payments_delayed,education,6235,500_to_1000,greater_7,3,male,none,...,unknown,57,none,own,2,skilled,1,none,yes,1


## Data Validation 

In [43]:
@dataclass
class Datavalidation:
    """
    Split a dataframe and validate it with TensorFlow Data Validation (tfdv).

    Attributes:
        dataframe: The full dataset that ``split_data`` divides.
        mask_per: Probability (0.0 - 1.0) with which each row is assigned to
            the training split; the remaining rows form the test split.
    """
    dataframe : pd.DataFrame
    mask_per : float


    def split_data(self,seed=32):
        """
        Randomly split ``self.dataframe`` into train and test splits.

        Args:
            seed: Seed passed to ``np.random.seed`` so the split is reproducible.

        Returns:
            tuple: ``(training_data, testing_data)`` dataframes that together
            contain every row of ``self.dataframe`` exactly once.
        """
        np.random.seed(seed)
        mask = np.random.rand(len(self.dataframe)) <= self.mask_per
        # Slice the instance's own dataframe, not a notebook-level global, so
        # the class works for whatever dataframe it was constructed with.
        training_data = self.dataframe[mask]
        testing_data = self.dataframe[~mask]

        print(f"No. of training examples: {training_data.shape[0]}")
        print(f"No. of testing examples: {testing_data.shape[0]}")

        return training_data, testing_data

    # TODO: Replace with Db2/filesystem
    def save_data_in_filesystem(self,df,filename):
        """
        Write a dataframe to the filesystem as CSV (without the index).

        Failures are reported via ``print`` and not re-raised (best effort).

        Args:
            df: Dataframe to persist.
            filename: Target file name; should include the path.
        """
        try:
            df.to_csv(filename,index=False)
            print(f"File {filename} persisted successfully")
        except Exception as e:
            print(e)
            print(f"File serialization for {filename} failed")

    def generate_statistics(self,df):
        """
        Generate and visualize tfdv statistics for a dataframe.

        Args:
            df: Dataframe to compute statistics on.

        Returns:
            The statistics object produced by
            ``tfdv.generate_statistics_from_dataframe``.
        """
        train_stats = tfdv.generate_statistics_from_dataframe(df)
        tfdv.visualize_statistics(train_stats)
        return train_stats

    def inferSchema(self,stats):
        """
        Infer and display a tfdv schema from previously generated statistics.

        Args:
            stats: Statistics as returned by ``generate_statistics``.

        Returns:
            The schema produced by ``tfdv.infer_schema``.
        """
        schema = tfdv.infer_schema(statistics=stats)
        tfdv.display_schema(schema=schema)
        return schema

    def compare_statistics(self,lhs,rhs):
        """
        Visualize two sets of statistics side by side.

        Args:
            lhs: Statistics shown under the name 'TEST_DATASET'.
            rhs: Statistics shown under the name 'TRAIN_DATASET'.
        """
        # Compare evaluation data with training data
        tfdv.visualize_statistics(lhs_statistics=lhs, rhs_statistics=rhs,
                                  lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')


    def check_for_anomalies(self,testable_stats,ref_schema):
        """
        Validate statistics against a reference schema and report anomalies.

        Args:
            testable_stats: Statistics of the dataset under test.
            ref_schema: Schema the statistics are validated against.

        Returns:
            bool: True if tfdv reported at least one anomaly, False otherwise.
        """
        anomalies = tfdv.validate_statistics(statistics=testable_stats, schema=ref_schema)
        tfdv.display_anomalies(anomalies)
        anomaly_items = anomalies.anomaly_info.items()
        if len(anomaly_items) > 0:
            # Obtain the logger here: no module-level `logger` is defined in
            # the notebook, and the anomalies live in the local variable, not
            # on `self`.
            logger = logging.getLogger(__name__)
            logger.error("Anomalies found in dataset...")
            logger.error(str(anomaly_items))
            return True
        else:
            return False

###  Split Data into Train and Eval Splits to Check for Consistency

In [44]:
# Each row goes to the training split with probability 0.8; the rest form the test split
classvalidate = Datavalidation(gcr_df, 0.8)

training_data, testing_data = classvalidate.split_data(seed=32)

No. of training examples: 3995
No. of testing examples: 1005


## Generate Training Stats on both Splits

In [45]:
# Statistics (and their visualization) for the training split
train_stats = classvalidate.generate_statistics(df=training_data)

In [46]:
# Statistics (and their visualization) for the test split
test_stats = classvalidate.generate_statistics(df=testing_data)

## Infer Training Data Schema

In [47]:
# Schema inferred from the training statistics; used later as the reference schema
train_schema = classvalidate.inferSchema(stats=train_stats)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CheckingStatus',STRING,required,,'CheckingStatus'
'LoanDuration',INT,required,,-
'CreditHistory',STRING,required,,'CreditHistory'
'LoanPurpose',STRING,required,,'LoanPurpose'
'LoanAmount',INT,required,,-
'ExistingSavings',STRING,required,,'ExistingSavings'
'EmploymentDuration',STRING,required,,'EmploymentDuration'
'InstallmentPercent',INT,required,,-
'Sex',STRING,required,,'Sex'
'OthersOnLoan',STRING,required,,'OthersOnLoan'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'CheckingStatus',"'0_to_200', 'greater_200', 'less_0', 'no_checking'"
'CreditHistory',"'all_credits_paid_back', 'credits_paid_to_date', 'no_credits', 'outstanding_credit', 'prior_payments_delayed'"
'LoanPurpose',"'appliances', 'business', 'car_new', 'car_used', 'education', 'furniture', 'other', 'radio_tv', 'repairs', 'retraining', 'vacation'"
'ExistingSavings',"'100_to_500', '500_to_1000', 'greater_1000', 'less_100', 'unknown'"
'EmploymentDuration',"'1_to_4', '4_to_7', 'greater_7', 'less_1', 'unemployed'"
'Sex',"'female', 'male'"
'OthersOnLoan',"'co-applicant', 'guarantor', 'none'"
'OwnsProperty',"'car_other', 'real_estate', 'savings_insurance', 'unknown'"
'InstallmentPlans',"'bank', 'none', 'stores'"
'Housing',"'free', 'own', 'rent'"


## Infer Test Data Schema

In [48]:
# Schema inferred from the test statistics
test_schema = classvalidate.inferSchema(stats=test_stats)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CheckingStatus',STRING,required,,'CheckingStatus'
'LoanDuration',INT,required,,-
'CreditHistory',STRING,required,,'CreditHistory'
'LoanPurpose',STRING,required,,'LoanPurpose'
'LoanAmount',INT,required,,-
'ExistingSavings',STRING,required,,'ExistingSavings'
'EmploymentDuration',STRING,required,,'EmploymentDuration'
'InstallmentPercent',INT,required,,-
'Sex',STRING,required,,'Sex'
'OthersOnLoan',STRING,required,,'OthersOnLoan'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'CheckingStatus',"'0_to_200', 'greater_200', 'less_0', 'no_checking'"
'CreditHistory',"'all_credits_paid_back', 'credits_paid_to_date', 'no_credits', 'outstanding_credit', 'prior_payments_delayed'"
'LoanPurpose',"'appliances', 'business', 'car_new', 'car_used', 'education', 'furniture', 'other', 'radio_tv', 'repairs', 'retraining', 'vacation'"
'ExistingSavings',"'100_to_500', '500_to_1000', 'greater_1000', 'less_100', 'unknown'"
'EmploymentDuration',"'1_to_4', '4_to_7', 'greater_7', 'less_1', 'unemployed'"
'Sex',"'female', 'male'"
'OthersOnLoan',"'co-applicant', 'guarantor', 'none'"
'OwnsProperty',"'car_other', 'real_estate', 'savings_insurance', 'unknown'"
'InstallmentPlans',"'bank', 'none', 'stores'"
'Housing',"'free', 'own', 'rent'"


## Compare Eval and Train Data 

In [49]:
# Left-hand side: test split statistics; right-hand side: training split statistics
classvalidate.compare_statistics(test_stats, train_stats)

## Check For Data Anomalies 

### Check eval data for errors by validating the eval data stats using the previously inferred schema.

In [54]:
# True when the test statistics violate the schema inferred from the training split
anomaly_status = classvalidate.check_for_anomalies(
    testable_stats=test_stats,
    ref_schema=train_schema,
)
anomaly_status

False

## Save Train and Test Data for Data Preparation Stage

In [53]:
# Output file names for the two splits
train_data_path, test_data_path = "train_gcr.csv", "test_gcr.csv"

In [55]:
# TODO: Replace with Db2/filesystem
# Persist both splits only when no anomalies were detected
if not anomaly_status:
    for split_df, split_path in ((training_data, train_data_path), (testing_data, test_data_path)):
        classvalidate.save_data_in_filesystem(df=split_df, filename=split_path)

File train_gcr.csv persisted successfully
File test_gcr.csv persisted successfully


## Check if Files Exist in COS

In [59]:
# TODO: Replace with Db2/filesystem
# `check_for_file_in_filesystem` is not defined in any cell of this notebook, so a fresh
# kernel would fail here; os.path.isfile performs the existence check directly.
files_copied_in_cos = os.path.isfile(train_data_path) and os.path.isfile(test_data_path)
files_copied_in_cos

True

## Register a Boolean Variable in WS Pipeline

In [57]:
# Results handed over to the pipeline. The file name entries use the path variables
# defined earlier in this notebook (`train_data_filename` / `test_data_filename` are
# never defined, so a fresh kernel would raise NameError).
validation_params = {
    'anomaly_status': anomaly_status,
    'files_copied_in_cos': files_copied_in_cos,
    'train_data_filename': train_data_path,
    'test_data_filename': test_data_path,
}

In [58]:
# Create the WS Pipelines client from the access token and store the validation results.
# NOTE(review): an earlier variant passed the token as a keyword (`token=TOKEN`); the
# positional form is the one in use - confirm before changing it.
# Per the recorded output, outside of a Watson Studio Pipeline the results are written
# to the local filesystem for testing purposes.
pipelines_client = WSPipelines.from_token(TOKEN)
pipelines_client.store_results(validation_params)

Running outside of Watson Studio Pipeline - storing results in the local filesystem for testing purposes...

  output paths:
    - "anomaly_status": .ibm_watson_studio_pipelines/results/anomaly_status
    - "files_copied_in_cos": .ibm_watson_studio_pipelines/results/files_copied_in_cos
    - "train_data_filename": .ibm_watson_studio_pipelines/results/train_data_filename
    - "test_data_filename": .ibm_watson_studio_pipelines/results/test_data_filename


<ibm_cloud_sdk_core.detailed_response.DetailedResponse at 0x7f101ed3ab90>