# Imports

In [1]:
import pandas as pd
import json
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load Data

In [2]:
df = pd.read_csv('delayed flights with region and weather.csv').drop(columns=['Unnamed: 0'])

In [3]:
# df.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
df.shape

(2681780, 21)

In [5]:
df.head(3)

Unnamed: 0,CRSDepTime,DayOfMonthDep,DepDelay,Distance,MonthDep,Origin,TaxiOut,YearDep,DayOfMonthArr,CRSDayOfMonthDep,...,CRSYearDep,CRSDepDateTime,Lat,Long,Region,weather_code,precipitation_hours,snowfall_sum,wind_speed,WeatherCategory
0,1455,21,81.0,802.0,6,ORD,24.0,2006,21,21,...,2006,2006-06-21 14:55:00,41.9786,-87.9048,Illinois,55,5.0,0.0,25.2,safe_codes
1,750,8,0.0,2288.0,5,LAX,13.0,2007,8,8,...,2007,2007-05-08 07:50:00,33.942501,-118.407997,California,0,0.0,0.0,27.4,safe_codes
2,715,16,-7.0,677.0,3,HOU,6.0,2007,16,16,...,2007,2007-03-16 07:15:00,29.645399,-95.2789,Texas,2,0.0,0.0,23.3,safe_codes


In [6]:
df_cleaned = df.drop(columns=[
    'CRSDepTime',
    'DayOfMonthArr',
    'DayOfMonthDep',
    'MonthDep',
    'YearDep',
    'CRSDayOfMonthDep',
    'CRSDepDateTime',
    'Lat',
    'Long',
    'weather_code'
])

In [7]:
df_cleaned.head(3)

Unnamed: 0,DepDelay,Distance,Origin,TaxiOut,CRSMonthDep,CRSYearDep,Region,precipitation_hours,snowfall_sum,wind_speed,WeatherCategory
0,81.0,802.0,ORD,24.0,6,2006,Illinois,5.0,0.0,25.2,safe_codes
1,0.0,2288.0,LAX,13.0,5,2007,California,0.0,0.0,27.4,safe_codes
2,-7.0,677.0,HOU,6.0,3,2007,Texas,0.0,0.0,23.3,safe_codes


# Data Processing

## Encoding

In [8]:
df_cleaned['CRSMonthDep'] = df_cleaned['CRSMonthDep'].astype(str)

df_cleaned['CRSYearDep'] = df_cleaned['CRSYearDep'].astype(str)

In [9]:
df_encoded = pd.get_dummies(df_cleaned, columns = ['Origin', 'Region', 'CRSMonthDep', 'CRSYearDep', 'WeatherCategory'])
df_encoded

Unnamed: 0,DepDelay,Distance,TaxiOut,precipitation_hours,snowfall_sum,wind_speed,Origin_ABE,Origin_ABI,Origin_ABQ,Origin_ABY,...,CRSMonthDep_8,CRSMonthDep_9,CRSYearDep_2003,CRSYearDep_2004,CRSYearDep_2005,CRSYearDep_2006,CRSYearDep_2007,CRSYearDep_2008,WeatherCategory_dangerous_codes,WeatherCategory_safe_codes
0,81.0,802.0,24.0,5.0,0.0,25.2,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
1,0.0,2288.0,13.0,0.0,0.0,27.4,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2,-7.0,677.0,6.0,0.0,0.0,23.3,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,-1.0,872.0,11.0,0.0,0.0,20.1,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
4,-4.0,745.0,10.0,2.0,0.0,10.8,False,False,False,False,...,True,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681775,7.0,717.0,13.0,7.0,0.0,19.9,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
2681776,8.0,1587.0,13.0,0.0,0.0,24.0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2681777,-4.0,1235.0,10.0,23.0,0.0,12.5,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2681778,-7.0,804.0,10.0,0.0,0.0,17.5,False,False,False,False,...,True,False,False,False,False,True,False,False,False,True


## Scaling

In [10]:
X = df_encoded.drop(columns=['DepDelay'])
y = df_encoded['DepDelay']

In [11]:
scalerX = StandardScaler().fit(X[['Distance', 'TaxiOut', 'precipitation_hours', 'snowfall_sum', 'wind_speed']])
                               
X[['Distance', 'TaxiOut', 'precipitation_hours', 'snowfall_sum', 'wind_speed']] = scalerX.transform(X[['Distance', 'TaxiOut', 'precipitation_hours', 'snowfall_sum', 'wind_speed']])
                               

In [12]:
y = y.array.reshape(-1, 1)
scalery = StandardScaler().fit(y)
y = scalery.transform(y)

In [13]:


# df_cleaned[['DepDelay', 'Distance', 'TaxiOut', 'precipitation_hours', 'snowfall_sum', 'wind_speed']] = scaler.fit_transform(df_cleaned[['DepDelay', 'Distance', 'TaxiOut', 'precipitation_hours', 'snowfall_sum', 'wind_speed']])

# Model

## Train test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.2, random_state=42)

## Linear Regression ABORTED

In [15]:
# lr = LinearRegression()
# lr.fit(X_train, y_train)

In [16]:
# lr.score(X_test, y_test)

## Decision Tree Regressor ABORTED

In [17]:
# clf = DecisionTreeRegressor(
# #     max_depth=6,
#     random_state=400
# )

In [18]:
# clf = clf.fit(X_train, y_train)

In [19]:
# clf.score(X_test, y_test)

# Neural Network

In [20]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(400,)),
    Dense(1)
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                25664     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 25729 (100.50 KB)
Trainable params: 25729 (100.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
X_train = X_train.astype('float32')
# y = y.astype('float32')

In [23]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [24]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1aabaa71690>

In [26]:
X_test = X_test.astype('float32')

model.evaluate(X_test, y_test)



0.9088836908340454

In [27]:
model.save_weights(
)

{'loss': <tf.Tensor: shape=(), dtype=float32, numpy=0.9088837>}

https://stackoverflow.com/questions/38058774/scikit-learn-how-to-scale-back-the-y-predicted-result