# Cleaning / Preprocessing 

In [3]:
  # import modules 
import pandas as pd
import numpy as np
%matplotlib inline

In [4]:
df = pd.read_csv('../flight-delay-dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28820 entries, 0 to 28819
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MONTH              28820 non-null  int64  
 1   DAY_OF_MONTH       28820 non-null  int64  
 2   DAY_OF_WEEK        28820 non-null  int64  
 3   OP_UNIQUE_CARRIER  28820 non-null  object 
 4   TAIL_NUM           28820 non-null  object 
 5   DEST               28820 non-null  object 
 6   DEP_DELAY          28820 non-null  int64  
 7   CRS_ELAPSED_TIME   28820 non-null  int64  
 8   DISTANCE           28820 non-null  int64  
 9   CRS_DEP_M          28820 non-null  int64  
 10  DEP_TIME_M         28820 non-null  int64  
 11  CRS_ARR_M          28820 non-null  int64  
 12  Temperature        28820 non-null  int64  
 13  Dew Point          28820 non-null  object 
 14  Humidity           28820 non-null  int64  
 15  Wind               28818 non-null  object 
 16  Wind Speed         288

In [5]:
df.columns

Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM',
       'DEST', 'DEP_DELAY', 'CRS_ELAPSED_TIME', 'DISTANCE', 'CRS_DEP_M',
       'DEP_TIME_M', 'CRS_ARR_M', 'Temperature', 'Dew Point', 'Humidity',
       'Wind', 'Wind Speed', 'Wind Gust', 'Pressure', 'Condition', 'sch_dep',
       'sch_arr', 'TAXI_OUT'],
      dtype='object')

## Flight Delay Prediction Variables

#### Independent Variables (X)
- **Time-related:**
  - `MONTH`
  - `DAY_OF_MONTH` 
  - `DAY_OF_WEEK`
  - `CRS_DEP_M` (Scheduled departure time in minutes)
  - `CRS_ARR_M` (Scheduled arrival time in minutes)
  - `sch_dep` (Scheduled departure)

- **Flight-specific:**
  - `OP_UNIQUE_CARRIER` (Airline)
  - `TAIL_NUM` (Aircraft identifier)
  - `DEST` (Destination airport)
  - `CRS_ELAPSED_TIME` (Scheduled flight duration)
  - `DISTANCE` (Flight distance)

- **Weather conditions:**
  - `Temperature`
  - `Dew Point`
  - `Humidity`
  - `Wind Speed`
  - `Wind Gust`
  - `Pressure`
  - `Condition`

#### Dependent Variable (y)
- `DEP_DELAY`: Runway taxi-out time
  - **Binary classification target:**
    - 1 = Delayed (DEP_DELAY ≥ 15 minutes)
    - 0 = Not delayed (DEP_DELAY < 15 minutes)

In [6]:
X = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', #'TAIL_NUM', 
        'DEST', 'CRS_ELAPSED_TIME', 'DISTANCE', 'CRS_DEP_M', 'CRS_ARR_M','Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust', 'Pressure', 'Condition', 'sch_dep', 'DEP_DELAY']]
X.dtypes

MONTH                  int64
DAY_OF_MONTH           int64
DAY_OF_WEEK            int64
OP_UNIQUE_CARRIER     object
DEST                  object
CRS_ELAPSED_TIME       int64
DISTANCE               int64
CRS_DEP_M              int64
CRS_ARR_M              int64
Temperature            int64
Dew Point             object
Humidity               int64
Wind Speed             int64
Wind Gust              int64
Pressure             float64
Condition             object
sch_dep                int64
DEP_DELAY              int64
dtype: object

In [7]:
#Convert dew_point to int
X['Dew Point'] = X['Dew Point'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Dew Point'] = X['Dew Point'].astype(int)


In [8]:
X.isna().sum()
#TODO: Check for duplicated
X.duplicated().value_counts()

False    28820
Name: count, dtype: int64

#### Encode all non numeric-values

In [9]:
columns_to_encode = ['OP_UNIQUE_CARRIER', #'TAIL_NUM', 
                     'DEST', 'Condition']
X_encoded = pd.get_dummies(X, columns=columns_to_encode, drop_first=True)
X_encoded.to_csv("cleaned_flight_data_with_target.csv", index=False)