# Models

In [3]:
# Imports
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import train_test_split

In [4]:
# importe files random_subset
df_data_mini_mini = pd.read_csv("random_subset_100000.csv")

In [5]:
# Review of the first rows dataset
df_data_mini_mini.head(5)

Unnamed: 0,MONTH,DAY_OF_MONTH,AIRLINE_ID,FL_NUM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,...,ARR_TIME,ARR_DELAY,CANCELLED,AIR_TIME,DISTANCE,WEATHER_DELAY,CARRIER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,9,23,19393,2067,23.0,10821.0,1082104.0,BWI,"Baltimore, MD",MD,...,1841.0,-9.0,0.0,39.0,210.0,,,,,
1,3,24,20409,348,91.0,13796.0,1379604.0,OAK,"Oakland, CA",CA,...,1802.0,10.0,0.0,69.0,353.0,,,,,
2,5,16,19393,1570,22.0,11697.0,1169704.0,FLL,"Fort Lauderdale, FL",FL,...,1013.0,-12.0,0.0,155.0,1092.0,,,,,
3,10,14,20366,3836,74.0,12266.0,1226603.0,IAH,"Houston, TX",TX,...,1201.0,-14.0,0.0,25.0,74.0,,,,,
4,5,13,19805,364,41.0,12953.0,1295302.0,LGA,"New York, NY",NY,...,1802.0,-1.0,0.0,97.0,733.0,,,,,


## RandomForest Regressor and classifier

In [6]:
# Drop any rows with missing values
df_data_mini_mini.dropna(inplace=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_data_mini_mini.drop('ARR_DELAY', axis=1), df_data_mini_mini['ARR_DELAY'], test_size=0.2, random_state=42)

# Define the column transformer for categorical and numerical features
categorical_features = ['AIRLINE_ID', 'ORIGIN', 'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM', 'CARRIER', 'UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST_STATE_ABR.1', 'DEST_STATE_ABR.2']
numeric_features = ['MONTH','DAY_OF_MONTH', 'FL_NUM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_STATE_FIPS', 'DEST_WAC', 'DISTANCE', 'DEP_DELAY', 'ORIGIN_AIRPORT_ID', 'ORIGIN_STATE_FIPS', 'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'CANCELLED', 'AIR_TIME', 'DISTANCE', 'WEATHER_DELAY', 'CARRIER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Define the classification pipeline
clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('clf', RandomForestClassifier())])

# Define the regression pipeline
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('reg', RandomForestRegressor())])

# Fit and evaluate the classification pipeline
clf_pipeline.fit(X_train, y_train > 0) # predicting if delay is greater than 0
clf_score = clf_pipeline.score(X_test, y_test > 0)
print(f'Classification Accuracy: {clf_score:.3f}')

# Fit and evaluate the regression pipeline
reg_pipeline.fit(X_train, y_train)
reg_score = reg_pipeline.score(X_test, y_test)
print(f'Regression R-Squared: {reg_score:.3f}')

ValueError: Found unknown categories ['ABE', 'BJI', 'FAI', 'HYS', 'CLL', 'DVL', 'CIU', 'LAR'] in column 1 during transform

In [8]:
# Find why struggle during OneHotEncoder
filtered_column1_pb = df_data_mini_mini.loc[df_data_mini_mini['DEST'].isin(['ABE', 'LAR', 'HYS', 'CIU', 'DVL', 'CLL', 'FAI', 'BJI'])]
#filtered_column1_pb

## Linear Regression

In [9]:
# Imports
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
# Define the categorical and numeric features
categorical_features = ['AIRLINE_ID', 'ORIGIN', 'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEST_STATE_NM',
                        'CARRIER', 'UNIQUE_CARRIER', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST_STATE_ABR.1', 'DEST_STATE_ABR.2']
numeric_features = ['MONTH','DAY_OF_MONTH', 'FL_NUM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
                    'DEST_STATE_FIPS', 'DEST_WAC', 'DISTANCE', 'DEP_DELAY', 'ORIGIN_AIRPORT_ID',
                    'ORIGIN_STATE_FIPS', 'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'CANCELLED', 'AIR_TIME',
                    'DISTANCE', 'WEATHER_DELAY', 'CARRIER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
                    'LATE_AIRCRAFT_DELAY']

# Create feature and target DataFrames
X = df_data_mini_mini.drop('ARR_DELAY', axis=1)
y = df_data_mini_mini['ARR_DELAY']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for preprocessing categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Create the linear regression pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict the ARR_DELAY on the testing data
y_pred = pipeline.predict(X_test)
y_pred

# Predict the ARR_DELAY on the testing data
y_pred = pipeline.predict(X_test)

# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate root mean squared error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

ValueError: Found unknown categories ['ABE', 'BJI', 'FAI', 'HYS', 'CLL', 'DVL', 'CIU', 'LAR'] in column 1 during transform

## Classification pycaret

In [None]:
# Import
from pycaret.classification import *

In [None]:
# the real model to read belong to Nais
baseline = pd.read_csv("random_subset_100000.csv")

In [None]:
baseline = baseline.drop(["ARR_DELAY","FL_NUM","ORIGIN_WAC","WHEELS_OFF","WHEELS_ON","DEP_DELAY","DISTANCE","CRS_ARR_TIME","DEP_TIME"], axis=1)

In [None]:
listt = baseline.columns.to_list()

In [None]:
baseline_20 = baseline.iloc[0:200000]

In [None]:
#listt

In [None]:
s = setup(baseline_20, target = 'IS_DELAYED', session_id = 123)

In [None]:
# functional API
best = compare_models(include=["et"])

In [None]:
# pip install "schemdraw<0.16"

In [None]:
# functional API
evaluate_model(best)

In [None]:
# functional API
evaluate_model(best)

## Regression pycaret

In [None]:
# import
from sklearn.preprocessing import OneHotEncoder

In [None]:
# the real model to read belong to Nais
baseline = pd.read_pickle("Baseline_nan.pkl")

In [None]:
baseline = baseline.drop(["IS_DELAYED","FL_NUM","ORIGIN_WAC","WHEELS_OFF","WHEELS_ON","DEP_DELAY"], axis=1)

In [None]:
baseline_20 = baseline.iloc[0:200000]

In [None]:
s = setup(baseline_20, target = 'ARR_DELAY', session_id = 12)

In [None]:
models()

In [None]:
# functional API
best = compare_models(include = ["lightgbm"])

In [None]:
# functional API
evaluate_model(best)

## Models save

In [None]:
# Save the model with h5
#clf_pipeline.save('model_class_cam.h5')