In [None]:
# uncomment below to pip install the dependencies
# !python -m pip install pandas
# !python -m pip install scikit-learn


# pyarrow is an optional pandas dependency, but this notebook needs it
# because the parquet file is read with engine='pyarrow'.
# uncomment to install...
#!python -m pip install pyarrow


# imports...
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the flight data and clean it in a single chain:
#   1. discard the trailing column of the parquet file,
#   2. drop 'ArrDelay' (1-1 relation with the target variable) and 'Carrier'
#      (alternatively 'Carrier' could be encoded with ".astype('category').cat.codes"),
#   3. remove every row that contains a missing value.
data = (
    pd.read_parquet('./data/flight_data.parquet', engine='pyarrow')
    .iloc[:, :-1]
    .drop(columns=['Carrier', 'ArrDelay'])
    .dropna()
)

# Show the first 5 rows as the cell output
data.head()

In [None]:
# Only the first 10k rows are kept for training, which is enough for this example.
# Fewer training rows also gives a smaller output model, which is desirable here.
MAX_TRAIN_ROWS = 10000

print(r"WARNING: Dropping most of the data of data")
data = data[:MAX_TRAIN_ROWS]
print(data.shape)

In [None]:
# Separate predictors (every column except the last) from the target
# variable (the last column), then split both into train and test subsets.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# A fixed random_state makes the shuffle deterministic, so the same rows land
# in train/test on every "Restart Kernel -> Run All".
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42)

In [None]:
# Create the random forest classifier and fit it with the training data.
# random_state is pinned so that repeated fits on the same training data
# give the same forest, and therefore the same predictions.
clf = RF(random_state=42)
clf.fit(xtrain, ytrain)

In [None]:
# Sanity check: predict on the held-out test subset and report the
# fraction of labels the model gets right.
y_pred_test = clf.predict(xtest)
test_accuracy = accuracy_score(ytest, y_pred_test)
print(test_accuracy)

In [None]:
# Save the fitted model to disk.
# joblib.dump does not create missing directories, so make sure the output
# folder exists first; exist_ok=True keeps the cell safe to re-run.
model_path = "./model/model.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)