## Connection and Data Validation Notebook

## Table of Contents
* [Check for Training Data in Project Space](#DataCheck)
    * [Load the Training Data from COS if the file does not exist in the project space](#section_1_1)
    
    * [Check the connection and data loading](#section_1_2)
  
* [Data Validation](#chapter2)
    * [Split the Data](#Optional)

    * [Generate Training Stats on both Splits](#section_2_2)
    * [Infer Schema on both Splits](#section_2_3) 
    * [Check for anomalies](#section_2_4) 
    * [Return a boolean to validate the tests](#section_3_1) 


## Imports

In [None]:
from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import tensorflow_data_validation as tfdv
import numpy as np
import pandas as pd

from ibm_watson_studio_pipelines import WSPipelines
import ibm_boto3

import logging
import os, types
import warnings

# Suppress all warnings (no category or module filter is given) for the rest of the session.
warnings.filterwarnings("ignore")

These environment variables are set in WS Pipelines

In [None]:
# Configuration read from the process environment; each value is None when the variable is unset.
CLOUD_API_KEY = os.environ.get("cloud_api_key")
training_file_name = os.environ.get("training_file_name")

### Load the Credentials


### The following cell contains the credentials for the project COS and the MLOps COS. Please fill them in before running the cell.
```
## PROJECT COS 
AUTH_ENDPOINT = "https://iam.cloud.ibm.com/oidc/token"
ENDPOINT_URL = "https://s3.private.us.cloud-object-storage.appdomain.cloud"
API_KEY_COS = "xxx"
BUCKET_PROJECT_COS = "mlops-donotdelete-pr-qxxcecxi1dtw94"

##MLOPS COS
ENDPOINT_URL_MLOPS = "https://s3.jp-tok.cloud-object-storage.appdomain.cloud"
API_KEY_MLOPS = "xxx"
CRN_MLOPS = "xxx"
BUCKET_MLOPS  = "mlops-asset"

```

In [None]:
# The code was removed by Watson Studio for sharing.

## Check for Training Data in Project Space

In [None]:
def check_for_file_in_project_cos(key):
    """Try to fetch the object `key` from the project COS bucket.

    Returns the object's streaming body on success. Any exception raised
    while creating the client or fetching the object is printed and the
    function returns False instead, so callers can fall back to another
    source.
    """
    try:
        def __iter__(self):
            return 0

        project_cos = ibm_boto3.client(
            service_name='s3',
            ibm_api_key_id=API_KEY_COS,
            ibm_auth_endpoint=AUTH_ENDPOINT,
            config=Config(signature_version='oauth'),
            endpoint_url=ENDPOINT_URL,
        )

        response = project_cos.get_object(Bucket=BUCKET_PROJECT_COS, Key=key)
        body = response['Body']

        # pandas only accepts file-like objects that expose __iter__;
        # attach a placeholder when the body lacks one.
        if not hasattr(body, "__iter__"):
            body.__iter__ = types.MethodType(__iter__, body)
        return body
    except Exception as e:
        print(e)
        return False
    
    
def read_data_from_mlops_cos(key):
    """Download the CSV object `key` from the MLOps COS bucket as a DataFrame.

    Unlike the project-COS lookup, errors are not caught here: any client,
    download or parsing exception propagates to the caller.
    """
    def __iter__(self):
        return 0

    mlops_cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS,
    )

    response = mlops_cos.get_object(Bucket=BUCKET_MLOPS, Key=key)
    body = response['Body']

    # pandas only accepts file-like objects that expose __iter__;
    # attach a placeholder when the body lacks one.
    if not hasattr(body, "__iter__"):
        body.__iter__ = types.MethodType(__iter__, body)

    return pd.read_csv(body)
    
    
def load_data_from_project(key):
    """Load the CSV `key` as a DataFrame, preferring the project COS bucket.

    When the project lookup yields nothing usable, a notice is printed and
    the file is read from the MLOps COS bucket instead.
    """
    project_body = check_for_file_in_project_cos(key)

    if not project_body:
        print("\n")
        print(f"{key} file is probably not in project. Loading File from MLOps COS Bucket.")
        return read_data_from_mlops_cos(key)

    return pd.read_csv(project_body)

## Load the Training Data from COS if the file does not exist in the project space

In [None]:
# Load the training file: the project COS bucket is tried first, with the MLOps COS bucket as fallback.
gcr_df = load_data_from_project(training_file_name)

## Encode the target for ease of use with OpenScale: 'Risk' -> 1, 'No Risk' -> 0.
## Labels missing from the mapping become NaN.
gcr_df['Risk'] = gcr_df['Risk'].map({'Risk':1,'No Risk':0})
# Last expression of the cell: previews the first rows.
gcr_df.head()

## Data Validation 

In [None]:
@dataclass
class Datavalidation:
    """Split a dataframe and validate the splits with TensorFlow Data Validation.

    The class bundles the helpers used by the notebook: a random train/test
    split, upload of a split to the MLOps COS bucket, and thin wrappers
    around the TFDV statistics / schema / anomaly APIs.
    """

    # Dataset that `split_data` partitions.
    dataframe: pd.DataFrame
    # Threshold compared against uniform draws in [0, 1): rows whose draw is
    # <= mask_per go to the training split (e.g. 0.8 -> roughly 80% train).
    mask_per: float

    def split_data(self, seed=32):
        """Randomly split ``self.dataframe`` into train and test frames.

        Args:
            seed: Value passed to ``np.random.seed`` so the split is
                reproducible. Note that this reseeds NumPy's global RNG.

        Returns:
            A ``(training_data, testing_data)`` tuple of DataFrames.
        """
        np.random.seed(seed)
        mask = np.random.rand(len(self.dataframe)) <= self.mask_per
        # Slice the instance's own dataframe, not a notebook-level global.
        training_data = self.dataframe[mask]
        testing_data = self.dataframe[~mask]

        print(f"No. of training examples: {training_data.shape[0]}")
        print(f"No. of testing examples: {testing_data.shape[0]}")

        return training_data, testing_data

    def save_data_in_cos(self, df, filename, key):
        """Write ``df`` to a local CSV and upload it to the MLOps COS bucket.

        Args:
            df: DataFrame to persist.
            filename: Local path the CSV is written to before the upload.
            key: Object key used inside the bucket.

        Failures are best-effort: the exception is printed together with a
        failure message, and nothing is raised.
        """
        try:
            df.to_csv(filename, index=False)
            mlops_res = ibm_boto3.resource(
                service_name='s3',
                ibm_api_key_id=API_KEY_MLOPS,
                ibm_service_instance_id=CRN_MLOPS,
                ibm_auth_endpoint=AUTH_ENDPOINT,
                config=Config(signature_version='oauth'),
                endpoint_url=ENDPOINT_URL_MLOPS)

            mlops_res.Bucket(BUCKET_MLOPS).upload_file(filename, key)
            print(f"File {filename} uploaded successfully")
        except Exception as e:
            print(e)
            print(f"File upload for {filename} failed")

    def generate_statistics(self, df):
        """Compute TFDV statistics for ``df``, visualize them and return them."""
        train_stats = tfdv.generate_statistics_from_dataframe(df)
        tfdv.visualize_statistics(train_stats)
        return train_stats

    def inferSchema(self, stats):
        """Infer a TFDV schema from ``stats``, display it and return it."""
        schema = tfdv.infer_schema(statistics=stats)
        tfdv.display_schema(schema=schema)
        return schema

    def compare_statistics(self, lhs, rhs):
        """Visualize two statistics objects side by side.

        ``lhs`` is labelled 'TEST_DATASET' and ``rhs`` 'TRAIN_DATASET' in the
        rendered comparison. Nothing is returned.
        """
        tfdv.visualize_statistics(lhs_statistics=lhs, rhs_statistics=rhs,
                                  lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')

    def check_for_anomalies(self, testable_stats, ref_schema):
        """Validate ``testable_stats`` against ``ref_schema``.

        The anomalies are displayed and, when any exist, logged at ERROR
        level.

        Returns:
            True when at least one anomaly was reported, False otherwise.
        """
        anomalies = tfdv.validate_statistics(statistics=testable_stats, schema=ref_schema)
        tfdv.display_anomalies(anomalies)

        anomaly_items = list(anomalies.anomaly_info.items())
        if not anomaly_items:
            return False

        # Resolve the logger here so the method works without a module-level
        # `logger` being defined elsewhere in the notebook.
        logger = logging.getLogger(__name__)
        logger.error("Anomalies found in dataset...")
        logger.error("%s", anomaly_items)
        return True

def check_if_file_exists(filename):
    """Return True when an object keyed exactly `filename` exists in the MLOps bucket.

    Args:
        filename: Object key to look for.

    Returns:
        True if the key is present, False otherwise (including when the
        listing comes back empty).
    """
    mlops_client = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=API_KEY_MLOPS,
        ibm_service_instance_id=CRN_MLOPS,
        ibm_auth_endpoint=AUTH_ENDPOINT,
        config=Config(signature_version='oauth'),
        endpoint_url=ENDPOINT_URL_MLOPS)

    # Restrict the listing to keys starting with `filename`: listings are
    # paginated, so an unfiltered call can miss the key in a large bucket.
    # An exact match is the shortest key with this prefix and therefore
    # sorts first in the response.
    listing = mlops_client.list_objects(Bucket=BUCKET_MLOPS, Prefix=filename)

    # 'Contents' is absent when nothing matches; treat that as "not found".
    return any(entry['Key'] == filename for entry in listing.get('Contents', []))

###  Split Data into Train and Eval Splits to Check for Consistency

In [None]:
# Wrap the loaded dataframe; mask_per=0.8 is the threshold split_data compares
# against uniform random draws, so roughly 80% of rows land in the training split.
classvalidate = Datavalidation(dataframe=gcr_df,mask_per=0.8) 

training_data, testing_data = classvalidate.split_data()


## Generate Training Stats on both Splits

In [None]:
# Compute and visualize TFDV statistics for the training split.
train_stats = classvalidate.generate_statistics(training_data)

In [None]:
# Compute and visualize TFDV statistics for the test split.
test_stats = classvalidate.generate_statistics(testing_data)

## Infer Training Data Schema

In [None]:
# Infer and display a TFDV schema from the training-split statistics.
train_schema = classvalidate.inferSchema(train_stats)

## Infer Test Data Schema

In [None]:
# Infer and display a TFDV schema from the test-split statistics.
test_schema = classvalidate.inferSchema(test_stats)

## Compare Eval and Train Data 

In [None]:
# Render a side-by-side comparison: test statistics as the lhs ('TEST_DATASET'),
# training statistics as the rhs ('TRAIN_DATASET').
classvalidate.compare_statistics(lhs=test_stats,rhs=train_stats)

## Check For Data Anomalies 

### Check eval data for errors by validating the eval data stats using the previously inferred schema.

In [None]:
# Validate the test-split statistics against the schema inferred from the training split.
# A False result means TFDV reported no anomalies.
anomaly_status = classvalidate.check_for_anomalies(test_stats,train_schema)
anomaly_status

## Save Train and Test Data for Data Preparation Stage

In [None]:
# Upload both splits to the MLOps bucket only when validation reported no anomalies.
# Each local CSV file name doubles as the object key.
if not anomaly_status:
    classvalidate.save_data_in_cos(df=training_data,filename="train_gcr.csv",key="train_gcr.csv")
    classvalidate.save_data_in_cos(df=testing_data,filename="test_gcr.csv",key="test_gcr.csv")

## Check if Files Exist in COS

In [None]:
# True only when both split files are present in the MLOps bucket; the second
# lookup is skipped if the first file is already missing.
files_copied_in_cos = all(
    check_if_file_exists(csv_name) for csv_name in ("train_gcr.csv", "test_gcr.csv")
)
files_copied_in_cos

## Register a Boolean Variable in WS Pipeline

In [None]:
# Flags handed to the pipeline: the anomaly verdict and the upload check.
validation_params = {
    'anomaly_status': anomaly_status,
    'files_copied_in_cos': files_copied_in_cos,
}

In [None]:
# Build a Watson Studio Pipelines client from the API key read from the environment,
# then pass the validation flags to it via store_results.
pipelines_client = WSPipelines.from_apikey(apikey=CLOUD_API_KEY)
pipelines_client.store_results(validation_params)