# Standardize and Normalize Dashlink Telemetry file

This is a code to standardize and normalize the time series data by time stamp into z-scores.

## S3 Data read logic
This is standard functions to read bucket object keys for retrieving the data

In [1]:
from scipy import io
import pandas as pd
import numpy as np
import csv
import matplotlib as mp
import matplotlib.pyplot as plt
from numpy.lib import recfunctions
import datetime
import numpy.lib.recfunctions as recfn
import boto3
import sys
from IPython.display import clear_output
s3 = boto3.resource('s3')

In [2]:
def s3_bucket_object_keys(bucket_name= 'iiaweb-s3-io-practice-bucket'):
    bucket = s3.Bucket(bucket_name)
    key_list=[]
    for key in bucket.objects.filter(Prefix='652/1'):
        key_list.append(key)
    return(key_list)

In [3]:
from io import BytesIO
def load_file(s3_key, bucketname= 'iiaweb-s3-io-practice-bucket'):
    s3 = boto3.resource('s3')
    obj = s3.Object(bucketname, s3_key.key)
    inFile = obj.get()['Body'].read()
    gfile = BytesIO(inFile)
    return(gfile)

In [7]:
start = datetime.datetime.now()
key_list = s3_bucket_object_keys('iiaweb-s3-io-practice-bucket')
end = datetime.datetime.now()
print("Time Taken to run the funtion ",str(end-start))

Time Taken to run the funtion  0:00:00.291515


In [27]:
from scipy import stats
def standardize_normalize(key):
    '''
    Argument:
    key: S3 Bucket key for csv file
    Transformations:
    1. Read Data Frame
    2. Selecting intergers and Floats
    3. Filling Null Values
    4. Z-Score
    5. Drop Invalid values e.g. for near constant invariant fields
    Output: Valid Z score of Pandas Datafrane column
    '''
    gzfile= load_file(key)
    df = pd.read_csv(gzfile, compression='gzip')
    df1= df.select_dtypes(exclude=['object'])
    df1.fillna(0, inplace= True)
    df1= (df1 - df1.mean())/df1.std()
    df1.dropna(axis=1, inplace= True)
    return(df1)
    

## Example

In [28]:
key= key_list[0]

In [25]:
df2=standardize_normalize(key)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [26]:
df2.describe()

Unnamed: 0,EGT_1,EGT_2,EGT_3,EGT_4,MW,ALT,RALT,ALTR,FQTY_1,OIT_1,...,FLAP,PTRM,BPGR_2,BPYR_1,BPYR_2,OIPL,FIRE_2,FIRE_3,FIRE_4,FRMC
count,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,...,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0,3943.0
mean,2.740221e-16,4.994117e-15,-7.781177e-15,-8.381381e-15,1.651499e-15,1.286423e-14,8.390631e-15,7.5235e-17,-1.263565e-15,1.396184e-15,...,-8.926328e-15,-2.474491e-16,-1.624071e-15,3.821907e-15,-6.879606e-16,5.49452e-16,-2.596322e-16,5.6894e-16,5.590394e-16,-1.000693e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.003396,-0.878013,-0.5591412,-0.3026665,-0.05524386,-1.984432,-1.547819,-6.465333,-1.441202,-15.33766,...,-4.811735,-22.17479,-7.810003,-8.416814,-0.02252458,-0.9984102,-0.02252458,-0.03186265,-0.03186265,-1.728319
25%,-1.043959,-0.8729192,-0.5557035,-0.2931127,-0.05524386,-0.7857473,0.2498394,-0.8503007,-0.8786365,-0.4270525,...,-0.7620719,0.03862661,-0.1377619,-0.7992431,-0.02252458,-0.5525743,-0.02252458,-0.03186265,-0.03186265,-0.8639405
50%,0.8504046,-0.8703723,-0.5557035,-0.2931127,-0.05524386,-0.3861856,0.2498394,0.2727057,-0.03478877,0.02071346,...,-0.7620719,0.04954778,-0.1377619,-0.2268244,-0.02252458,-0.5525743,-0.02252458,-0.03186265,-0.03186265,0.0004384372
75%,0.8714141,1.246101,0.4309058,-0.2883359,-0.05524386,0.8124996,0.2498394,0.8342089,0.9028198,0.5580322,...,1.262759,0.04954778,-0.1377619,1.006078,-0.02252458,-0.1067383,-0.02252458,-0.03186265,-0.03186265,0.8648174
max,1.788832,1.607761,2.77883,4.880232,18.09697,2.011185,2.047498,3.080222,1.746667,0.8266916,...,3.287591,0.1805998,1.396687,1.020755,44.38468,2.568278,44.38468,31.37675,31.37675,1.729196
