In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.dataset as ds

from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Loading the Data and observing

In [2]:
cols = [
    'Airline','Origin','Dest','CRSDepTime',
    'Month','DayOfWeek','Distance','DistanceGroup',
    'DepDel15','DepDelayMinutes',
    'ArrDel15','ArrDelayMinutes'
]

df = pd.read_parquet('./data/2021_delay_ds.parquet', columns=cols)

In [None]:
df.shape


(6311871, 12)

In [4]:
df.describe()

Unnamed: 0,CRSDepTime,Month,DayOfWeek,Distance,DistanceGroup,DepDel15,DepDelayMinutes,ArrDel15,ArrDelayMinutes
count,6311871.0,6311871.0,6311871.0,6311871.0,6311871.0,6203458.0,6203458.0,6185870.0,6185870.0
mean,1323.953,6.969905,4.013676,795.5762,3.653994,0.1731723,12.76132,0.1726611,12.52928
std,474.1972,3.300914,2.006264,583.2931,2.281563,0.3783962,47.36319,0.377954,46.7477
min,1.0,1.0,1.0,31.0,1.0,0.0,0.0,0.0,0.0
25%,920.0,4.0,2.0,370.0,2.0,0.0,0.0,0.0,0.0
50%,1315.0,7.0,4.0,646.0,3.0,0.0,0.0,0.0,0.0
75%,1723.0,10.0,6.0,1033.0,5.0,0.0,6.0,0.0,6.0
max,2359.0,12.0,7.0,5812.0,11.0,1.0,3095.0,1.0,3089.0


In [5]:
df.isnull().sum().sort_values(ascending=False).head(20)

ArrDelayMinutes    126001
ArrDel15           126001
DepDelayMinutes    108413
DepDel15           108413
CRSDepTime              0
Dest                    0
Origin                  0
Airline                 0
DistanceGroup           0
Distance                0
DayOfWeek               0
Month                   0
dtype: int64

# Tasks 
- 4 predictions 
    - Classification
        1. ArrivalDelay -> Binary classification 
            - Will the flight reach the destination late ?
        2. DepartureDelay -> Binary Classification
            - Will the flight take off late ? 
    - Regression
        1. ArrivalDelay
        2. DepartureDelay


## TASK 1 : DepartureDelay (Binary Classification)
### Does the flight takes off late ?

In [6]:
# Remove rows where the target departure delay 15 is missing (null) as label is missing 
df_departed = df[df['DepDel15'].notna()].copy()

In [7]:
df_departed.shape

(6203458, 12)

In [8]:
df_departed.columns

Index(['Airline', 'Origin', 'Dest', 'CRSDepTime', 'Month', 'DayOfWeek',
       'Distance', 'DistanceGroup', 'DepDel15', 'DepDelayMinutes', 'ArrDel15',
       'ArrDelayMinutes'],
      dtype='str')

In [37]:
df_departed.describe()

Unnamed: 0,CRSDepTime,Month,DayOfWeek,Distance,DistanceGroup,DepDel15,DepDelayMinutes,ArrDel15,ArrDelayMinutes
count,6203458.0,6203458.0,6203458.0,6203458.0,6203458.0,6203458.0,6203458.0,6185870.0,6185870.0
mean,1323.048,6.97204,4.014589,796.4053,3.657269,0.1731723,12.76132,0.1726611,12.52928
std,473.8326,3.297307,2.003863,584.4966,2.285825,0.3783962,47.36319,0.377954,46.7477
min,1.0,1.0,1.0,31.0,1.0,0.0,0.0,0.0,0.0
25%,920.0,4.0,2.0,370.0,2.0,0.0,0.0,0.0,0.0
50%,1315.0,7.0,4.0,646.0,3.0,0.0,0.0,0.0,0.0
75%,1721.0,10.0,6.0,1034.0,5.0,0.0,6.0,0.0,6.0
max,2359.0,12.0,7.0,5812.0,11.0,1.0,3095.0,1.0,3089.0


In [9]:
df_departed.info()

<class 'pandas.DataFrame'>
Index: 6203458 entries, 0 to 573778
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Airline          str    
 1   Origin           str    
 2   Dest             str    
 3   CRSDepTime       int64  
 4   Month            int64  
 5   DayOfWeek        int64  
 6   Distance         float64
 7   DistanceGroup    int64  
 8   DepDel15         float64
 9   DepDelayMinutes  float64
 10  ArrDel15         float64
 11  ArrDelayMinutes  float64
dtypes: float64(5), int64(4), str(3)
memory usage: 770.1 MB


In [10]:
df_departed[df_departed['DepDel15']==1].head()

Unnamed: 0,Airline,Origin,Dest,CRSDepTime,Month,DayOfWeek,Distance,DistanceGroup,DepDel15,DepDelayMinutes,ArrDel15,ArrDelayMinutes
36,SkyWest Airlines Inc.,FSM,DFW,617,3,3,227.0,1,1.0,22.0,0.0,14.0
51,SkyWest Airlines Inc.,DFW,FLG,2045,3,3,853.0,4,1.0,148.0,1.0,105.0
59,SkyWest Airlines Inc.,DFW,ASE,840,3,3,701.0,3,1.0,18.0,0.0,4.0
62,SkyWest Airlines Inc.,PHX,IAH,2005,3,3,1009.0,5,1.0,35.0,0.0,1.0
66,SkyWest Airlines Inc.,PHX,SLC,1843,3,3,507.0,3,1.0,109.0,1.0,89.0


In [11]:
features_departed_classification = [
'Airline', 'Origin', 'Dest', 'CRSDepTime', 'Month', 'DayOfWeek',
       'Distance', 'DistanceGroup', 'DepDel15', 'DepDelayMinutes'
]

x = df_departed[features_departed_classification]
y = df_departed['DepDel15']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2,random_state=8, stratify=y)

In [13]:
categorical_cols = ['Airline','Origin','Dest']

numeric_cols = [
    'CRSDepTime',
    'Month',
    'DayOfWeek',
    'Distance',
    'DistanceGroup'
]


In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)


In [15]:
X_train_encoded = preprocessor.fit_transform(x_train)
X_test_encoded = preprocessor.transform(x_test)


In [21]:
model = LogisticRegression(max_iter=1000)

model.fit(X_train_encoded, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [22]:
y_test_predicted = model.predict(X_test_encoded)


In [30]:
print(classification_report(y_test, y_test_predicted))

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91   1025839
         1.0       0.56      0.02      0.04    214853

    accuracy                           0.83   1240692
   macro avg       0.70      0.51      0.47   1240692
weighted avg       0.78      0.83      0.76   1240692



In [None]:
print(classification_report(y_test, y_test_predicted))

print(confusion_matrix(y_test, y_test_predicted))

print(accuracy_score(y_test, y_test_predicted))



[[1022433    3406]
 [ 210459    4394]]
0.8276244224997018


#### Initial accuracy obtained : 0.8276244224997018
#### Observations :  
actualOnTime + predictedOntime   =   1022433 (TRUE POSITIVE)  
actualOnTime + predictedDelayed  =   3406    (FALSE NEGATIVE)  
actualDelayed + predictedOntime  =   210459  ***(FALSE POSITIVE)***  
actualDelayed + predictedDelayed =   4394     (TRUE NEGATIVE)  

***So the model is mostly trying to predict onTime as most of flights are not delayed so by predicting mostly not delayed it gives less accuracy easily***

##### Try to introduce class weighting so balanced importance to both

In [32]:
model_weight_balanced = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)


In [33]:
model_weight_balanced.fit(X_train_encoded, y_train)

y_test_predicted = model_weight_balanced.predict(X_test_encoded)



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
print(classification_report(y_test, y_test_predicted))

print(confusion_matrix(y_test, y_test_predicted))

print(accuracy_score(y_test, y_test_predicted))



              precision    recall  f1-score   support

         0.0       0.89      0.63      0.74   1025839
         1.0       0.26      0.63      0.37    214853

    accuracy                           0.63   1240692
   macro avg       0.58      0.63      0.56   1240692
weighted avg       0.78      0.63      0.68   1240692

[[647749 378090]
 [ 79028 135825]]
0.6315620637515194


#### Previously (Accuracy : 0.8276244224997018)
1022433    3406  
2,10,459    4394  
#### Now (Accuracy : 0.6315620637515194)
647749 3,78,090  
79,028 135825

- Now the True Negatives increased drastically which indicates the model is now properly predicting delays
- But the number of false negaives predicted as delayed but not delayed increased drastically this means model is trying to predict more things as delay
- Now the model has better recall (ability to identify the true things properly) 
- Previously recall was - 100%, 2% now 63% for both

### Random Forest Classifier


In [63]:
df_dep_small = df_departed.sample(200000, random_state=42)
df_dep_small.shape

(200000, 12)

In [43]:
dep_features = [
    'Airline','Origin','Dest','CRSDepTime',
    'Month','DayOfWeek','Distance','DistanceGroup'
]

X = df_dep_small[dep_features]
y = df_dep_small['DepDel15']


In [44]:

x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [46]:
X_train_encoded = preprocessor.fit_transform(x_train)
X_test_encoded = preprocessor.transform(x_test)


In [47]:
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=12,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42
)

rf_model.fit(X_train_encoded, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",50
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",12
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [51]:
random_forest_prediction = rf_model.predict(X_test_encoded)

In [52]:
print(classification_report(y_test, random_forest_prediction))

print(confusion_matrix(y_test, random_forest_prediction))

print(accuracy_score(y_test, random_forest_prediction))


              precision    recall  f1-score   support

         0.0       0.89      0.67      0.76     33090
         1.0       0.27      0.60      0.38      6910

    accuracy                           0.66     40000
   macro avg       0.58      0.63      0.57     40000
weighted avg       0.78      0.66      0.70     40000

[[22168 10922]
 [ 2786  4124]]
0.6573


In [53]:
importances = rf_model.feature_importances_

feature_names = preprocessor.get_feature_names_out()

feat_imp = pd.Series(importances, index=feature_names)
feat_imp = feat_imp.sort_values(ascending=False)

feat_imp.head(20)

num__CRSDepTime                        0.223193
num__Month                             0.168129
cat__Airline_Southwest Airlines Co.    0.148330
num__DayOfWeek                         0.039412
num__Distance                          0.033334
cat__Airline_Delta Air Lines Inc.      0.032562
cat__Airline_Endeavor Air Inc.         0.029148
num__DistanceGroup                     0.020768
cat__Airline_JetBlue Airways           0.017845
cat__Airline_Republic Airlines         0.016486
cat__Airline_Allegiant Air             0.015455
cat__Origin_DAL                        0.015091
cat__Origin_DEN                        0.010984
cat__Origin_BWI                        0.010888
cat__Origin_DFW                        0.007907
cat__Dest_ATL                          0.007275
cat__Airline_Hawaiian Airlines Inc.    0.007141
cat__Airline_SkyWest Airlines Inc.     0.006607
cat__Origin_HOU                        0.006233
cat__Origin_MDW                        0.005881
dtype: float64

⭐ Feature Engineering Opportunity

CRSDepTime is numeric like:

1345
2230
0815


Model treats it as raw number — not ideal.

Convert to Hour Bucket

In [89]:
df_dep_small = df_departed.sample(200000, random_state=42).copy()

df_dep_small.shape
df_dep_small["DepHour"] = df_dep_small["CRSDepTime"] // 100

df_dep_small["DepHour_sin"] = np.sin(2*np.pi*df_dep_small["DepHour"]/24)
df_dep_small["DepHour_cos"] = np.cos(2*np.pi*df_dep_small["DepHour"]/24)

categorical_cols = ['Airline','Origin','Dest']

# numeric_cols = [
#     'Month',
#     'DayOfWeek',
#     'Distance',
#     'DistanceGroup',
#     'DepHour'
# ]

numeric_cols = [
    'Month',
    'DayOfWeek',
    'Distance',
    'DistanceGroup',
    'DepHour_sin',
    'DepHour_cos'
]


dep_features = categorical_cols + numeric_cols

X = df_dep_small[dep_features]
y = df_dep_small['DepDel15']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



In [90]:

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)



In [91]:
rf_model1 = RandomForestClassifier(
    n_estimators=50,
    max_depth=12,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42
)

rf_model1.fit(X_train_encoded, y_train)



0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",50
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",12
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [92]:
rf1_pred = rf_model1.predict(X_test_encoded)
print(classification_report(y_test, rf1_pred))
print(confusion_matrix(y_test, rf1_pred))
print(accuracy_score(y_test, rf1_pred))


              precision    recall  f1-score   support

         0.0       0.89      0.67      0.76     33090
         1.0       0.28      0.61      0.38      6910

    accuracy                           0.66     40000
   macro avg       0.58      0.64      0.57     40000
weighted avg       0.79      0.66      0.70     40000

[[22016 11074]
 [ 2674  4236]]
0.6563


### XG Boosted Trees

## TASK 2 : ArrivalDelay (Binary Classification) 
### Does the flight reaches destination late ?