In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler  

In [6]:
class preprocessing_ops():
    def __init__(self, df):
        self.df = df

    def drop_duplicates(self): 
        self.df.drop_duplicates(inplace=True) 
    
    # since WT01 and WT02 both indicate fogs (only difference is if it's heavy or not), we can combine them into one column
    def add_fog(self):
        self.df['WT_FOG'] = np.where((self.df['WT01'] == 1) | (self.df['WT02'] == 1), 1, 0)
        self.df.drop(columns=['WT01','WT02'],inplace=True)

    def remove_cancelled(self):
        self.df = self.df[self.df.CANCELLED != 1]
        self.df.drop(columns=['CANCELLED','CANCELLATION_CODE'],inplace = True)

    def remove_missing_values(self):
        self.df = self.df.loc[:, self.df.isnull().mean() < .9]
    
    # drop IDs, similar columns, post-flight available columns, na values above 35%
    def drop_columns(self):
        self.df.drop(columns=['OP_CARRIER_FL_NUM','Unnamed: 32', 'ORIGIN_AIRPORT_ID','DEST_AIRPORT_ID','AIRLINE_ID','OP_UNIQUE_CARRIER','STATION','MANUFACTURE_YEAR'
                         , 'ORIGIN_CITY_NAME','ORIGIN_CITY_NAME_x','ORIGIN_CITY_NAME_y','NAME_x','NAME_y','DEST_CITY_NAME','DEPARTING_AIRPORT','DISPLAY_AIRPORT_NAME','DEST_CITY_NAME'
                         , 'DATE', 'ACTUAL_ELAPSED_TIME','DEP_TIME', 'DEP_TIME_BLK', 'DISTANCE_GROUP', 'DEP_DELAY_NEW', 'ARR_TIME','ARR_DELAY_NEW','ARR_TIME_BLK'
                         , 'CARRIER_DELAY','WEATHER_DELAY','NAS_DELAY','SECURITY_DELAY','LATE_AIRCRAFT_DELAY', 'WESD','PSUN','TSUN','SN32','SX32','TOBS','WT11','PGTM','SNWD','SNOW', 'WSF5'
                         ] ,inplace = True)
        
    def drop_similar_columns(self):
        self.df.drop(columns=['TMAX', 'TMIN', 'CRS_ARR_TIME', 'WDF5', 'DEL_BLOCK_COUNT','DISTANCE'
                         ] ,inplace = True)
    
    # WT has 1 unique value (1 if true and nan if not so filling nans with 0s). then deal with high percentage 0 columns
    def fill_weather_codes(self):
        self.df.fillna({'WT03':'0', 'WT04':'0', 'WT05':'0', 'WT06':'0', 'WT07':'0', 'WT08':'0', 'WT09':'0', 'WT10':'0'}, inplace=True)
        self.df.drop(columns=['WT10','WT07','WT05','WT09','WT04','WT06'],inplace=True)
        self.df['WT03'] = self.df['WT03'].astype(float)
        self.df['WT08'] = self.df['WT08'].astype(float)

    # preserve columns values with more than 2% percentual of delayed flights from the ORIGIN column and replace all other values with 'OTHER'
    def encode_origin_destination(self):
        self.df['ORIGIN'] = np.where(self.df['ORIGIN'].isin(self.df['ORIGIN'].value_counts(normalize=True).loc[lambda x: x > 0.02].index), self.df['ORIGIN'], 'OTHER')
        self.df['DEST'] = np.where(self.df['DEST'].isin(self.df['DEST'].value_counts(normalize=True).loc[lambda x: x > 0.02].index), self.df['DEST'], 'OTHER')

    # delay counts per departure time block and origin
    def delay_blocks_origin(self):
        self.df['DEL_BLOCK_COUNT'] = self.df.groupby('DEP_TIME_BLK')['DEP_DEL15'].transform('sum')
        self.df['DEL_ORIGIN_COUNT'] = self.df.groupby('ORIGIN')['DEP_DEL15'].transform('sum')

    # ‘any’ : If any NA values are present, drop that row or column.
    def drop_na(self):
        self.df.dropna(subset=['TAVG','NUMBER_OF_SEATS','WDF5','WSF5','PRCP','TMIN','TMAX','AWND','WSF2','WDF2'], how='any',inplace=True)

    def encode_string_cols(self):
        string_cols = self.df.select_dtypes(include=['object']).columns
        for col in string_cols:
            label_encoder = LabelEncoder()
            self.df[col] = label_encoder.fit_transform(self.df[col])

    def scale_num_cols(self):
        num_cols = self.df.select_dtypes(include=['float64']).columns
        for col in num_cols:
            scaler = StandardScaler()
            self.df[col] = scaler.fit_transform(self.df[col].values.reshape(-1, 1))

    def categorize(self):
        self.df['MONTH'] = self.df['MONTH'].astype('object')
        self.df['DAY_OF_MONTH'] = self.df['DAY_OF_MONTH'].astype('object')
        self.df['DAY_OF_WEEK'] = self.df['DAY_OF_WEEK'].astype('object')
        self.df['WT_FOG'] = self.df['WT_FOG'].astype('object')

    def get_df(self):
        return self.df   

In [7]:
# read
df = pd.read_csv('data/5guys_flight_data.csv')

# instantiate
preprocessing = preprocessing_ops(df)

# call methods
preprocessing.drop_duplicates()
preprocessing.delay_blocks_origin()
preprocessing.drop_na()
preprocessing.drop_columns()
preprocessing.drop_similar_columns()
preprocessing.remove_cancelled()
preprocessing.add_fog()
preprocessing.fill_weather_codes()
preprocessing.categorize()
preprocessing.scale_num_cols()
preprocessing.encode_origin_destination()
preprocessing.encode_string_cols()

pre_df = preprocessing.get_df()

  df = pd.read_csv('data/5guys_flight_data.csv')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(columns=['CANCELLED','CANCELLATION_CODE'],inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['WT_FOG'] = np.where((self.df['WT01'] == 1) | (self.df['WT02'] == 1), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(columns=['WT01','WT02'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFram

In [8]:
pre_df.to_csv('data/5guys_flight_data_preprocessed.csv', index=False)