# Random Forest Regression

In [1]:
import pathlib

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Read Data

In [2]:
# Directory holding the processed parquet splits, resolved against the current working directory.
data_dir: pathlib.Path = pathlib.Path.cwd().joinpath("datasets", "processed")

In [3]:
# Load the training and validation splits from the processed-data directory.
train_path = data_dir / "train.parquet"
val_path = data_dir / "val.parquet"

train_df: pd.DataFrame = pd.read_parquet(train_path)
val_df: pd.DataFrame = pd.read_parquet(val_path)

## Debugging & development

The cell below keeps a 1% random sample of each split for faster computations. Comment it out to run on the full data.

In [4]:
# Fraction of each split kept for quick iteration; set to None to run on the full data.
SAMPLE_FRAC = 0.01

# The fixed random_state makes the subsample reproducible from freshly loaded frames.
# NOTE: the frames are overwritten, so re-running this cell without re-running the
# loading cell samples the already-reduced frames again.
if SAMPLE_FRAC is not None:
    train_df = train_df.sample(frac=SAMPLE_FRAC, random_state=0)
    val_df = val_df.sample(frac=SAMPLE_FRAC, random_state=0)

## Separate dependent and independent variables

In [5]:
# Print the index range, column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1155864 entries, 90502816 to 132549213
Data columns (total 22 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   Airline            1155864 non-null  int64 
 1   Origin             1155864 non-null  int64 
 2   Year               1155864 non-null  uint16
 3   Distance           1155864 non-null  uint16
 4   Cancelled          1155864 non-null  bool  
 5   DestState          1155864 non-null  int64 
 6   ActualElapsedTime  1155864 non-null  int16 
 7   OriginState        1155864 non-null  int64 
 8   ArrTime            1155864 non-null  uint16
 9   DestCityName       1155864 non-null  int64 
 10  CRSElapsedTime     1155864 non-null  int16 
 11  DepTime            1155864 non-null  uint16
 12  DepDelay           1155864 non-null  uint16
 13  Diverted           1155864 non-null  bool  
 14  CRSArrTime         1155864 non-null  uint16
 15  DayOfWeek          1155864 non-null  uin

In [6]:
# Column the models are trained to predict.
TARGET_COLUMN = 'ArrDelay'

# Columns excluded from the feature matrix: the target itself plus the other listed columns.
# NOTE(review): presumably these are only known once the flight has departed/arrived
# and would leak the outcome — confirm.
NON_FEATURE_COLUMNS = [
    'ActualElapsedTime',
    'ArrTime',
    'DepTime',
    'DepDelay',
    TARGET_COLUMN,
    'Cancelled',
    'Diverted',
]

# One shared column list guarantees the train and validation features have the same schema.
x_train, y_train = train_df.drop(columns=NON_FEATURE_COLUMNS), train_df[TARGET_COLUMN]
x_val, y_val = val_df.drop(columns=NON_FEATURE_COLUMNS), val_df[TARGET_COLUMN]

## Fit forest

In [7]:
# Random forest of 100 trees split on squared error; n_jobs=-1 requests all available
# cores and the fixed random_state keeps the fit reproducible.
forest_regressor: RandomForestRegressor = RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    n_jobs=-1,
    random_state=42,
)

In [8]:
# Fit the forest on the training features and the arrival-delay target.
forest_regressor.fit(x_train, y_train)

## Evaluate on validation data

In [9]:
# Predict the target for the validation features with the forest fitted above.
pred_val = forest_regressor.predict(x_val)

In [10]:
# Side-by-side frame of forest predictions and ground truth for the validation split.
# Rows are paired by position, so the target is converted to a plain float array here
# instead of resetting y_val's index in place (which would mutate a variable owned by an
# earlier cell and de-align it from x_val).
eval_val: pd.DataFrame = pd.DataFrame({
    'Predicted': pred_val,
    'Actual': y_val.to_numpy(dtype=float),
})

# Per-row error terms, kept as columns so individual predictions can be inspected.
# NOTE: the 'Squarred Error' key is misspelled but kept as-is in case other code reads it.
residual = eval_val['Predicted'] - eval_val['Actual']
eval_val['Squarred Error'] = residual ** 2
eval_val['Absolute Error'] = residual.abs()

# Aggregate metrics: mean squared error and mean absolute error over the validation rows.
mse = eval_val['Squarred Error'].mean()
mae = eval_val['Absolute Error'].mean()

mse, mae

(1068.137933692887, 16.456081688964556)

# Neural Network (NN)

In [11]:
import tensorflow as tf

In [12]:
# Feature-wise normalization layer. adapt() computes the per-feature mean and variance from
# the training features; a Normalization layer that is never adapted (and is given no
# explicit mean/variance) keeps its initial statistics and does not standardize its inputs.
normalizer = tf.keras.layers.Normalization(input_shape=[len(x_train.columns)])
normalizer.adapt(x_train.to_numpy(dtype='float32'))

# Small fully connected regressor: two 64-unit ReLU layers, dropout, and a single linear output.
model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
])


Metal device set to: Apple M2 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2023-03-16 14:43:23.131821: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-16 14:43:23.131986: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
# Train on mean squared error with Adam; report both MAE and MSE as metrics.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error', 'mean_squared_error'],
)

In [14]:
# Print the layer-by-layer summary of the network (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, 15)               31        
 n)                                                              
                                                                 
 dense (Dense)               (None, 64)                1024      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,280
Trainable params: 5,249
Non-trainable params: 31
_____________________________________________________

In [15]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10


2023-03-16 14:43:23.414097: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-16 14:43:23.543253: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 

In [None]:
# Run the network on the validation features; as the last expression of the cell,
# the returned prediction array is shown as the cell output.
model.predict(x_val)

  79/4516 [..............................] - ETA: 8s

2023-03-16 14:37:16.797310: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




array([[ -36.234802],
       [ -69.969   ],
       [-131.96098 ],
       ...,
       [-223.31981 ],
       [-169.39867 ],
       [ -73.36511 ]], dtype=float32)

In [None]:
# Environment report: platform, interpreter and library versions, plus GPU visibility.
import platform
import sys

import pandas as pd
import scipy as sp
import sklearn as sk
import tensorflow as tf
import tensorflow.keras

print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")

# A non-empty physical-device list means TensorFlow can see at least one GPU.
gpu = bool(tf.config.list_physical_devices('GPU'))
print("GPU is", "available" if gpu else "NOT AVAILABLE")
