In [34]:
import boto3
import awswrangler as wr

# S3 bucket read from and written to by the processing cells below.
bucket = "poc-2209-twayairport-dp"

# Shared boto3 S3 client; the later cells use it to list object keys.
s3 = boto3.client(service_name="s3")

In [35]:
# Uncomment and run once if awswrangler is missing from this kernel's environment:
# %pip install awswrangler

In [36]:
import boto3
import pandas as pd
import awswrangler as wr

## List every raw coupon CSV under the prefix.
## A single list_objects call returns at most 1,000 keys and has no 'Contents'
## entry when nothing matches, so page through the listing instead.
paginator = s3.get_paginator('list_objects_v2')
obj_list = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket, Prefix='train_data/coupon/raw_csv')
    for obj in page.get('Contents', [])
    if '.csv' in obj['Key']
]

## Meta Data Load (the same file is used for every month, so read it once)
class_df = pd.read_csv(f's3://{bucket}/meta_data/class.csv')
economy_fare_classes = class_df.loc[class_df['class'] == 'economic', 'class_value']

for key in obj_list:
    print(key)

    ## Coupon Data Lambda
    ## Key layout: <root>/<dataset>/raw_csv/<year>/<month>/<filename>
    root, dataset, _stage, year, month, _filename = key.split('/')
    path = f's3://{bucket}/{key}' ## Data path
    coupon_df = wr.s3.read_csv(path) ## Data Load

    ## Change Column Names
    coupon_df = coupon_df.rename(columns={
        'flightno': 'flight_number',
        'flightdate': 'flight_departure_date',
        'issuedate': 'capture_date',
        'carrier': 'airline_code',
        'bookingclass': 'fare_class',
        'cabinclass': 'cabin_code',
    })

    ## Change Format
    ## Dashes are stripped first so both 'YYYY-MM-DD' and 'YYYYMMDD' inputs end up
    ## as 'YYYYMMDD'; the parse format therefore has to be dash-free as well.
    departure_text = (coupon_df['flight_departure_date']
                      .astype('str')
                      .str.replace('-', '', regex=False))
    coupon_df['flight_departure_date'] = pd.to_datetime(departure_text, format='%Y%m%d')
    coupon_df['capture_date'] = pd.to_datetime(coupon_df['capture_date'], format='%Y%m%d')

    ## Data Filtering: keep economy fare classes in cabin 'Y' only
    keep_rows = (coupon_df['fare_class'].isin(economy_fare_classes)
                 & (coupon_df['cabin_code'] == 'Y'))
    coupon_df = coupon_df[keep_rows].reset_index(drop=True)

    ## Save Data
    wr.s3.to_parquet(df=coupon_df,
                     path=f"s3://{bucket}/{root}/{dataset}/pps_data/{year}/{month}/coupon_df.parquet")

train_data/coupon/raw_csv/2016/12/coupon.csv
train_data/coupon/raw_csv/2017/01/coupon.csv
train_data/coupon/raw_csv/2017/02/coupon.csv
train_data/coupon/raw_csv/2017/03/coupon.csv
train_data/coupon/raw_csv/2017/04/coupon.csv
train_data/coupon/raw_csv/2017/05/coupon.csv
train_data/coupon/raw_csv/2017/06/coupon.csv
train_data/coupon/raw_csv/2017/07/coupon.csv
train_data/coupon/raw_csv/2017/08/coupon.csv
train_data/coupon/raw_csv/2017/11/coupon.csv
train_data/coupon/raw_csv/2017/12/coupon.csv
train_data/coupon/raw_csv/2018/01/coupon.csv
train_data/coupon/raw_csv/2018/02/coupon.csv
train_data/coupon/raw_csv/2018/03/coupon.csv
train_data/coupon/raw_csv/2018/04/coupon.csv
train_data/coupon/raw_csv/2018/05/coupon.csv
train_data/coupon/raw_csv/2018/06/coupon.csv
train_data/coupon/raw_csv/2018/07/coupon.csv
train_data/coupon/raw_csv/2018/08/coupon.csv
train_data/coupon/raw_csv/2021/08/coupon.csv
train_data/coupon/raw_csv/2021/09/coupon.csv
train_data/coupon/raw_csv/2021/10/coupon.csv
train_data

In [16]:
import boto3
import pandas as pd
import awswrangler as wr

## List every raw booking CSV under the prefix.
## A single list_objects call returns at most 1,000 keys and has no 'Contents'
## entry when nothing matches, so page through the listing instead.
paginator = s3.get_paginator('list_objects_v2')
obj_list = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket, Prefix='train_data/booking/raw_csv')
    for obj in page.get('Contents', [])
    if '.csv' in obj['Key']
]

for key in obj_list:
    print(key)

    ## Booking Data Lambda
    ## Key layout: <root>/<dataset>/raw_csv/<year>/<month>/<filename>
    root, dataset, _stage, year, month, _filename = key.split('/')
    path = f's3://{bucket}/{key}' ## Data path
    booking_df = wr.s3.read_csv(path) ## Data Load

    ## Change Format
    for date_col in ('flight_departure_date', 'capture_date'):
        booking_df[date_col] = pd.to_datetime(booking_df[date_col], format='%Y-%m-%d')

    ## Data Filtering
    ## Only the cabin filter is applied here; the economy fare_class filter is
    ## applied later because the remaining-seat calculation needs every fare class.
    booking_df = booking_df[booking_df['cabin_code'] == 'Y'].reset_index(drop=True)

    ## Save Data
    wr.s3.to_parquet(df=booking_df,
                     path=f"s3://{bucket}/{root}/{dataset}/pps_data/{year}/{month}/booking_df.parquet")

train_data/booking/raw_csv/2016/11/booking.csv
train_data/booking/raw_csv/2016/12/booking.csv
train_data/booking/raw_csv/2017/01/booking.csv
train_data/booking/raw_csv/2017/02/booking.csv
train_data/booking/raw_csv/2017/03/booking.csv
train_data/booking/raw_csv/2017/04/booking.csv
train_data/booking/raw_csv/2017/05/booking.csv
train_data/booking/raw_csv/2017/06/booking.csv
train_data/booking/raw_csv/2017/07/booking.csv
train_data/booking/raw_csv/2017/08/booking.csv
train_data/booking/raw_csv/2017/09/booking.csv
train_data/booking/raw_csv/2017/11/booking.csv
train_data/booking/raw_csv/2017/12/booking.csv
train_data/booking/raw_csv/2018/01/booking.csv
train_data/booking/raw_csv/2018/02/booking.csv
train_data/booking/raw_csv/2018/03/booking.csv
train_data/booking/raw_csv/2018/04/booking.csv
train_data/booking/raw_csv/2018/05/booking.csv
train_data/booking/raw_csv/2018/06/booking.csv
train_data/booking/raw_csv/2018/07/booking.csv
train_data/booking/raw_csv/2018/08/booking.csv
train_data/bo

In [37]:
import boto3
import pandas as pd
import awswrangler as wr

## List every raw flight CSV under the prefix.
## There is one file per day, and a single list_objects call returns at most
## 1,000 keys (and has no 'Contents' entry when nothing matches), so page
## through the listing to avoid silently dropping the later days.
paginator = s3.get_paginator('list_objects_v2')
obj_list = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket, Prefix='train_data/flight/raw_csv')
    for obj in page.get('Contents', [])
    if '.csv' in obj['Key']
]

for key in obj_list:
    print(key)

    ## Flight Data Lambda
    ## Key layout: <root>/<dataset>/raw_csv/<year>/<month>/<day>/<filename>
    root, dataset, _stage, year, month, day, _filename = key.split('/')
    path = f's3://{bucket}/{key}' ## Data path
    flight_df = wr.s3.read_csv(path) ## Data Load

    ## Change Column Names
    flight_df = flight_df.rename(columns={'cabin_class': 'cabin_code'})

    ## Change Format
    flight_df['flight_departure_date'] = pd.to_datetime(flight_df['flight_departure_date'], format='%Y-%m-%d')
    for ts_col in ('leg_departure_date_time', 'leg_arrival_date_time'):
        flight_df[ts_col] = pd.to_datetime(flight_df[ts_col])

    ## Data Filtering: keep cabin 'Y' rows only
    flight_df = flight_df[flight_df['cabin_code'] == 'Y'].reset_index(drop=True)

    ## Save Data
    wr.s3.to_parquet(df=flight_df,
                     path=f"s3://{bucket}/{root}/{dataset}/pps_data/{year}/{month}/{day}/flight_df.parquet")

train_data/flight/raw_csv/2017/06/01/flight.csv
train_data/flight/raw_csv/2017/06/02/flight.csv
train_data/flight/raw_csv/2017/06/03/flight.csv
train_data/flight/raw_csv/2017/06/04/flight.csv
train_data/flight/raw_csv/2017/06/05/flight.csv
train_data/flight/raw_csv/2017/06/06/flight.csv
train_data/flight/raw_csv/2017/06/07/flight.csv
train_data/flight/raw_csv/2017/06/08/flight.csv
train_data/flight/raw_csv/2017/06/09/flight.csv
train_data/flight/raw_csv/2017/06/10/flight.csv
train_data/flight/raw_csv/2017/06/11/flight.csv
train_data/flight/raw_csv/2017/06/12/flight.csv
train_data/flight/raw_csv/2017/06/13/flight.csv
train_data/flight/raw_csv/2017/06/14/flight.csv
train_data/flight/raw_csv/2017/06/15/flight.csv
train_data/flight/raw_csv/2017/06/16/flight.csv
train_data/flight/raw_csv/2017/06/17/flight.csv
train_data/flight/raw_csv/2017/06/18/flight.csv
train_data/flight/raw_csv/2017/06/19/flight.csv
train_data/flight/raw_csv/2017/06/20/flight.csv
train_data/flight/raw_csv/2017/06/21/fli