In [101]:
from scipy import io
import pandas as pd
import numpy as np
import csv
import matplotlib as mp
import matplotlib.pyplot as plt
from numpy.lib import recfunctions
import datetime
import numpy.lib.recfunctions as recfn
import boto3
import sys
from IPython.display import clear_output
s3 = boto3.resource('s3')

In [110]:
# Presumably these aircraft have 4 engines
eng_no = [1, 2, 3, 4]
# These are all the engine variables I could find; ESN is the engine serial number
eng_param = ['ESN', 'EGT', 'OIT', 'FF', 'N1', 'VIB', 'FQTY', 'N2', 'OIP', 'ECYC', 'FIRE']
# My hypothesis is that instead of ingesting a whole time series denoting the flight, some key metrics
# may make representative features. Ideally we should seek to add metrics on trend, stationarity and
# heteroscedasticity
win_metrics = ['mean', 'nunique', 'var', 'max', 'min']

# Output column names: 'ESN' first, then one '<param>_<metric>' name for every
# (parameter, metric) pair in parameter-major order (all EGT metrics, then all OIT metrics, ...).
feature_name = ['ESN'] + [
    param + '_' + metric
    for param in eng_param[1:]
    for metric in win_metrics
]

# Source column names per engine number, e.g. eng_var[1] == ['ESN_1', 'EGT_1', ...].
# The serial-number column comes first because eng_param starts with 'ESN'.
eng_var = {
    engine: [param + '_' + str(engine) for param in eng_param]
    for engine in eng_no
}
        
# Final objective: generate a feature set with the following features:
# ESN as index, the S3 key with flight info and such, and 10 parameters x 5 window metrics = 50 named features for each engine.
# So every flight should result in 4 rows, one per ESN.


In [111]:
def s3_bucket_object_keys(bucket_name= 'iiaweb-s3-io-practice-bucket', prefix='652/1'):
    """List the objects stored under a key prefix of an S3 bucket.

    Uses the module-level ``s3`` boto3 resource.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to list.
    prefix : str
        Only objects whose key starts with this prefix are returned.
        Defaults to '652/1', the prefix this notebook was written against.

    Returns
    -------
    list
        The items yielded by ``bucket.objects.filter(Prefix=prefix)``, in the
        order S3 returns them. Each item exposes the object key as ``.key``.
    """
    bucket = s3.Bucket(bucket_name)
    # Materialise the lazy collection so callers can index it and iterate repeatedly.
    return list(bucket.objects.filter(Prefix=prefix))

In [112]:
from io import BytesIO
def load_file(s3_key, bucketname= 'iiaweb-s3-io-practice-bucket'):
    """Download one S3 object into memory and return it as a file-like buffer.

    Parameters
    ----------
    s3_key : object with a ``.key`` attribute, or str
        Either an item returned by ``s3_bucket_object_keys`` (its ``.key``
        attribute is used) or the object key itself as a plain string.
    bucketname : str
        Bucket that holds the object.

    Returns
    -------
    io.BytesIO
        Buffer positioned at the start of the object's raw bytes.
    """
    # Reuse the module-level boto3 resource instead of constructing a new one
    # for every file; this function is called once per object in the main loop.
    object_key = getattr(s3_key, 'key', s3_key)
    payload = s3.Object(bucketname, object_key).get()['Body'].read()
    return BytesIO(payload)

In [113]:
# List the S3 objects to process, using the default bucket and key prefix of s3_bucket_object_keys
key_list = s3_bucket_object_keys()

In [114]:
# strptime format used to parse the values of each flight file's `timestamp` column
DATETIMEFORMAT= '%Y-%m-%d %H:%M:%S'
# Haven't changed the date time into pandas datetime
# Minimum flight duration, in hours, for a flight to be selected.
# This constant serves 2 purposes
# 1. If you have a flight of higher duration you can assume that the engines and the aircraft ran smoothly
# 2. A higher duration of flight reduces the number of records. But 3 is too high.
LENOFFLIGHT= 1

In [121]:
# The only variable that is understood well is LGDN (landing gear). For a normal flight there should be
# an up/down sequence, so the variable must take exactly 2 values to be normal.
# The code below looks for normal flights lasting at least LENOFFLIGHT hours,
# loops through the engines and builds the per-engine records for each selected flight,
# and finally stacks all selected flights into one df with ESN as the index.
flght_engines=[]
for ind, key in enumerate(key_list):
    gzfile= load_file(key)
    df = pd.read_csv(gzfile, compression='gzip')
    diff = datetime.datetime.strptime(max(df.timestamp), DATETIMEFORMAT)- datetime.datetime.strptime(min(df.timestamp), DATETIMEFORMAT)
    # total_seconds() covers the whole span; timedelta.seconds holds only the
    # sub-day remainder and would under-count anything longer than 24 hours.
    if diff.total_seconds()/3600 < LENOFFLIGHT or len(df.LGDN.unique()) != 2:
        continue

    # Rows where LGDN is 1 and the previous row's LGDN was 0.
    lnd_gear_chg= (df['LGDN'].shift()== 0 ) & (df['LGDN']==1)
    chg_idx= lnd_gear_chg[lnd_gear_chg].index
    if len(chg_idx) < 2:
        # The segmentation below needs two 0->1 transitions; skip the flight
        # rather than abort the whole run with an IndexError.
        continue
    up_idx, dwn_idx= chg_idx[0], chg_idx[1]

    # .loc slices include both end labels, so the rows at up_idx and dwn_idx
    # each appear in two adjacent segments.
    flght_seg= {'take_off': df.loc[:up_idx], 'in_flight': df.loc[up_idx: dwn_idx], 'landing': df.loc[dwn_idx:]}

    eng_dfs=[]
    for eng_cols in eng_var.values():
        esn_col, param_cols= eng_cols[0], eng_cols[1:]
        seg_dfs=[]
        for seg_df in flght_seg.values():
            # One row per serial number found in this segment, one column per
            # (parameter, metric) pair, renamed to the engine-agnostic names.
            time_agg = seg_df.groupby(esn_col)[param_cols].agg(win_metrics)
            time_agg.columns= feature_name[1:]
            seg_dfs.append(time_agg)
        # NOTE(review): all three segments carry the same column names, so each
        # name occurs three times in engine_rec (take_off, in_flight, landing
        # order). Prefixing the segment name would disambiguate them but changes
        # the columns of `final` -- confirm with downstream users before doing so.
        engine_rec= pd.concat(seg_dfs,axis=1)
        engine_rec['key']= key.key
        eng_dfs.append(engine_rec)

    flght_engines.append(pd.concat(eng_dfs, axis=0))
    clear_output(wait=True)
    print(ind, key, flush=True)

# Concatenate once after the loop instead of rebuilding the full table on every
# selected flight; pd.concat raises on an empty list, hence the fallback.
final= pd.concat(flght_engines, axis=0) if flght_engines else pd.DataFrame()

0 674 s3.ObjectSummary(bucket_name='iiaweb-s3-io-practice-bucket', key='652/1/652200107281436.gzip')


In [122]:
# Show the combined feature table built above (rows indexed by ESN, with the source S3 key in `key`)
final

Unnamed: 0,EGT_mean,EGT_nunique,EGT_var,EGT_max,EGT_min,OIT_mean,OIT_nunique,OIT_var,OIT_max,OIT_min,...,ECYC_nunique,ECYC_var,ECYC_max,ECYC_min,FIRE_mean,FIRE_nunique,FIRE_var,FIRE_max,FIRE_min,key
7882,352.051622,109,35706.826469,590.5,73.5,71.047783,23,207.317076,84.114288,47.850586,...,2,0.069112,8936,8935,0,1,0,0,0,652/1/652200101120916.gzip
7833,375.882006,102,32614.366214,597.0,54.5,72.616710,22,149.445068,82.771179,49.193695,...,2,0.069112,5965,5964,0,1,0,0,0,652/1/652200101120916.gzip
8095,389.277286,107,20406.245370,589.0,80.0,74.966145,22,118.844637,82.771179,49.193695,...,2,0.069868,6714,6713,0,1,0,0,0,652/1/652200101120916.gzip
7876,439.772861,108,12829.884646,587.0,78.5,78.733957,21,65.624100,85.457397,50.536774,...,2,0.069112,7391,7390,0,1,0,0,0,652/1/652200101120916.gzip
7882,408.201266,91,20160.565271,591.0,86.5,78.882995,26,446.786911,89.486694,-390.000000,...,2,0.114283,8937,8936,0,1,0,0,0,652/1/652200101121118.gzip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7876,408.731793,117,5002.607136,552.0,95.0,70.713381,24,1557.192280,78.741882,-390.000000,...,2,0.108941,7786,7785,0,1,0,0,0,652/1/652200107281220.gzip
7882,348.702736,100,33862.162661,574.5,77.0,68.565063,21,195.462394,81.428070,46.507477,...,2,0.061512,9327,9326,0,1,0,0,0,652/1/652200107281436.gzip
7833,375.654229,99,30913.408822,581.0,58.5,70.489507,22,141.828491,80.084991,47.850586,...,2,0.061512,6356,6355,0,1,0,0,0,652/1/652200107281436.gzip
8095,392.420398,103,21280.443772,573.0,62.5,73.523171,23,120.843225,82.771179,47.850586,...,2,0.061512,7110,7109,0,1,0,0,0,652/1/652200107281436.gzip
