# Random-forest & LSTM mixture approach


Author: Chen Zhixin  
Time: 2023/5/7

## 1. Data preprocessing

### 1.1 Import CSV file & Data cleaning

In the original data set, some cells contain invalid values (for example, purely numeric airport codes). Therefore, we need to clean them up.

In [None]:
import pandas as pd
# Load the raw flight records. low_memory=False asks pandas to parse the file
# in one pass so that columns with mixed content get one inferred dtype.
data = pd.read_csv("drive/MyDrive/flights.csv", low_memory=False)

# Keep January only. .copy() makes the filtered frame independent, so the
# column assignment below does not write into a slice of the original frame.
data = data[data['MONTH'] == 1].copy()

# Airport codes are compared as text, so force the column to str.
data["ORIGIN_AIRPORT"] = data["ORIGIN_AIRPORT"].astype(str)

# Flag rows whose origin airport is a purely numeric identifier.
is_integer = data["ORIGIN_AIRPORT"].str.isdigit()

# Drop the flagged rows.
data = data[~is_integer]

# Select the modelling columns. .copy() again, because later cells assign
# new values to columns of data_new.
data_new = data.loc[:, ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT',
                        'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'ARRIVAL_DELAY']].copy()



### 1.2 Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Create a LabelEncoder object
le = LabelEncoder()

# Fit on the codes of BOTH airport columns. Fitting on ORIGIN_AIRPORT alone
# makes transform() raise for any destination that never occurs as an origin.
airport_codes = pd.concat(
    [data_new["ORIGIN_AIRPORT"], data_new["DESTINATION_AIRPORT"]]
).astype(str)
le.fit(airport_codes)

# Replace each airport code by its integer label (same mapping for both columns)
data_new["ORIGIN_AIRPORT"] = le.transform(data_new["ORIGIN_AIRPORT"].astype(str))
data_new["DESTINATION_AIRPORT"] = le.transform(data_new["DESTINATION_AIRPORT"].astype(str))

print(data_new)

### 1.3 One-hot encoding


In [None]:
# Expand the categorical AIRLINE column into one indicator column per airline.
categorical_columns = ['AIRLINE']
data_new = pd.get_dummies(data_new, columns=categorical_columns)
print(data_new)

### 1.4 Check the quality of the data set (drop rows with missing values)

In [None]:
# Discard every row that has at least one missing value
# (dropna defaults: axis=0, how='any').
data_new = data_new.dropna()

### 1.5 Normalization (Optional)

In [None]:
# Min-max normalise the listed columns to the [0, 1] range.
# NOTE(review): ARRIVAL_DELAY is the regression target; once it is scaled,
# the "within 30" accuracy check in section 2.2 no longer compares minutes.
# Confirm whether the target should really be normalised.
columns_to_normalize = [
    'MONTH',
    'DAY',
    'DAY_OF_WEEK',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE',
    'ARRIVAL_DELAY',
]

for keyword in columns_to_normalize:
    maximun = data_new[keyword].max()
    minimun = data_new[keyword].min()
    value_range = maximun - minimun
    if value_range == 0:
        # A constant column (e.g. MONTH after the January filter) would give
        # 0/0 = NaN; map it to 0.0 instead.
        data_new[keyword] = 0.0
    else:
        # Vectorised form of (x - min) / (max - min).
        data_new[keyword] = (data_new[keyword] - minimun) / value_range

print(data_new)

### 1.6 Weather API call & Merging
API source: https://api.weatherbit.io/

In [None]:
import requests

import os

WEATHER_API_URL = "https://api.weatherbit.io/v2.0/history/daily"
# The key is redacted in this notebook; supply it through the environment.
WEATHER_API_KEY = os.environ.get("WEATHERBIT_API_KEY", "********")

# data_new has no 'IATA' column, and its airport/day columns are already
# label-encoded and normalised. Read the raw codes and day numbers from
# `data` instead.
# NOTE(review): assumes `data` is still the cleaned frame from section 1.1
# and that data_new's index is a subset of data's index - confirm.
raw_flights = data.loc[data_new.index, ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DAY']]
origin_codes = raw_flights['ORIGIN_AIRPORT'].astype(str)
destination_codes = raw_flights['DESTINATION_AIRPORT'].astype(str)
flight_days = raw_flights['DAY'].astype(int)

airport_lib = pd.concat([origin_codes, destination_codes]).unique()
airport_dict = dict()

for i in airport_lib:
    airport_dict[i] = []
    try:
        response = requests.get(
            WEATHER_API_URL,
            params={
                'station': i,
                'start_date': '2015-01-01',
                'end_date': '2015-02-01',
                'key': WEATHER_API_KEY,
            },
            timeout=30,
        )
    except requests.RequestException as error:
        print("API request failed for %s: %s" % (i, error))
        continue
    if response.status_code == 200:
        # Keep at most 31 daily records; a short reply no longer raises.
        airport_dict[i] = response.json().get('data', [])[:31]
    else:
        print("API request failed for %s (HTTP %s)" % (i, response.status_code))


def _daily_weather(codes, days, field):
    """Return a Series of `field` for each (airport code, day of month) pair.

    NOTE(review): assumes the records are returned in date order starting at
    start_date, so that day d is at position d - 1 - confirm against the API.
    Pairs without a record yield NaN.
    """
    values = []
    for code, day in zip(codes, days):
        records = airport_dict.get(code, [])
        position = day - 1
        if 0 <= position < len(records):
            values.append(records[position].get(field))
        else:
            values.append(None)
    return pd.to_numeric(pd.Series(values, index=codes.index), errors='coerce')


# (column suffix, key in the API record)
weather_columns = [
    ('WIND_SPD', 'wind_spd'),
    ('WIND_SPD_MAX', 'max_wind_spd'),
    ('RH', 'rh'),
    ('CLOUD', 'clouds'),
    ('SNOW', 'snow'),
]

for suffix, field in weather_columns:
    data_new['DST_' + suffix] = _daily_weather(destination_codes, flight_days, field)
    data_new['SRC_' + suffix] = _daily_weather(origin_codes, flight_days, field)

### 1.7 Congestion degree calculation

Definition:
$$
C_d = \frac{\text{number of flights flying towards the target airport}}{\text{maximum acceptance rate of the airport}}
$$

In [None]:

# Congestion degree: for every day and hour, count the flights that departed
# before and arrive after that hour, and divide by the airport capacity.
# NOTE(review): ACTUAL_DEPARTURE, ACTUAL_ARRIVAL and MAX_CAPACITY are not
# created anywhere in the visible notebook, and the comparison assumes the
# two time columns are expressed in hours (0-23) - confirm both.
# NOTE(review): the count is not grouped by destination airport, although
# the definition above is per airport - confirm the intended grouping.
data['CD'] = float('nan')

for i in range(1, 32):  # DAY is the day of the month, 1..31
    on_day = data['DAY'] == i
    for j in range(24):
        # Element-wise & is required; `and` on a Series raises ValueError.
        airborne = on_day & (data['ACTUAL_DEPARTURE'] < j) & (data['ACTUAL_ARRIVAL'] > j)
        hourly = int(airborne.sum())
        if hourly:
            # Write only the CD column of the matching rows. A flight that
            # matches several hours keeps the value of the last one.
            data.loc[airborne, 'CD'] = hourly / data.loc[airborne, 'MAX_CAPACITY']

## 2. Model training

### 2.1 LSTM prediction


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# train_test_split is only imported in a later cell of this notebook; import
# it here so this cell runs in order.
from sklearn.model_selection import train_test_split

# fix random seed for reproducibility (TensorFlow and the data split)
RANDOM_SEED = 7
tf.random.set_seed(RANDOM_SEED)

# Split the DataFrame into a training set (75%) and a test set (25%)
train_set, test_set = train_test_split(data_new, test_size=0.25, random_state=RANDOM_SEED)

# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """Build sliding-window samples from the first column of *dataset*.

    Args:
        dataset: 2-D array-like or pandas DataFrame; only column 0 is used.
        look_back: number of consecutive values in each input window.

    Returns:
        (dataX, dataY): dataX has shape (n_samples, look_back) and holds the
        windows; dataY has shape (n_samples,) and holds the value that follows
        each window. n_samples is len(dataset) - look_back, or 0 when the
        input is too short.
    """
    # DataFrames do not support dataset[i:j, 0]; take the column positionally.
    if hasattr(dataset, "iloc"):
        series = dataset.iloc[:, 0].to_numpy()
    else:
        series = np.asarray(dataset)[:, 0]

    # Every start index i with i + look_back <= len - 1 is a valid sample.
    # (The former `- 1` silently dropped the last one.)
    n_samples = len(series) - look_back
    if n_samples <= 0:
        # Keep the 2-D shape so callers can still read dataX.shape[1].
        return (np.empty((0, look_back), dtype=series.dtype),
                np.empty((0,), dtype=series.dtype))

    dataX = np.array([series[i:i + look_back] for i in range(n_samples)])
    dataY = np.array(series[look_back:])
    return dataX, dataY
# Number of past values fed to the network for each prediction.
look_back = 1

# Turn both splits into (window, next value) samples.
trainX, trainY = create_dataset(train_set, look_back)
testX, testY = create_dataset(test_set, look_back)

# Reshape the inputs to [samples, time steps, features].
trainX = trainX.reshape(trainX.shape[0], 1, trainX.shape[1])
testX = testX.reshape(testX.shape[0], 1, testX.shape[1])

# One LSTM layer with 128 units followed by a single-output dense layer.
model = Sequential([
    LSTM(128, input_shape=(1, look_back)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

# Predictions for the held-out windows.
testPredict = model.predict(testX)

### 2.2 Random-forest based regression



In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Use the newly-generated Cd as an extra feature of BOTH splits.
# create_dataset pairs sample i with the row at position i + look_back, so
# the first `look_back` rows of each split have no prediction and are dropped.
# The predictions are flattened because model.predict returns a column vector.
# NOTE: `model` is still the LSTM from section 2.1 at this point; re-run 2.1
# before re-running this cell.
test_cd = np.ravel(testPredict)
test_set = test_set.iloc[look_back:look_back + len(test_cd)].copy()
test_set['CD'] = test_cd

# The forest must be fitted on the same columns it predicts on, so the
# training split needs a 'CD' column too.
train_cd = np.ravel(model.predict(trainX))
train_set = train_set.iloc[look_back:look_back + len(train_cd)].copy()
train_set['CD'] = train_cd

# Create a random forest regression model
model = RandomForestRegressor(n_estimators=100, max_depth=10)

# Fit the model to the training set
model.fit(train_set.drop("ARRIVAL_DELAY", axis=1), train_set["ARRIVAL_DELAY"])



In [None]:
# Make predictions on the test set
test_features = test_set.drop("ARRIVAL_DELAY", axis=1)
actual = test_set["ARRIVAL_DELAY"].values
predictions = model.predict(test_features)

# Evaluate the model's performance on the test set (value of model.score)
score = model.score(test_features, actual)
print("Score:", score)

# Count predictions whose absolute error is below 30.
# NOTE(review): if ARRIVAL_DELAY was min-max normalised in section 1.5 this
# threshold is not in minutes and nearly every case passes - confirm.
total = len(predictions)
true = int(np.count_nonzero(np.abs(predictions - actual) < 30))

print("True cases:", true)
print("Total cases:", total)
# Guard against an empty test set instead of raising ZeroDivisionError.
print(true / total if total else float('nan'))

### 2.3. Random-forest based approach (classification task)

In [None]:
import pandas as pd
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Reload the complete flight table for the classification experiment.
flights = pd.read_csv(
    "drive/MyDrive/flights.csv",
    low_memory=False,
)

In [None]:
# Work on the first 100,000 rows only.
flights_needed_data = flights.iloc[:100000]

In [None]:
flights_needed_data.info()  # print a summary of the frame for a first look at the data

In [None]:
flights_needed_data.value_counts('DIVERTED')  # number of rows for each value of the DIVERTED column

In [None]:
# Scatter plot with marginal distributions: scheduled vs. actual arrival time.
sb.jointplot(
    x="SCHEDULED_ARRIVAL",
    y="ARRIVAL_TIME",
    data=flights_needed_data,
)

In [None]:
# Pearson correlation between the numeric columns. numeric_only=True is
# needed because the frame still contains string columns (e.g. AIRLINE,
# TAIL_NUMBER), on which pandas >= 2.0 raises instead of skipping them.
corr = flights_needed_data.corr(method='pearson', numeric_only=True)

In [None]:
# Draw the correlation matrix as a heat map.
sb.heatmap(data=corr)

In [None]:
# Display the correlation matrix as a table.
corr

In [None]:
# Remove the columns that are not used as features.
unused_columns = [
    'YEAR', 'FLIGHT_NUMBER', 'AIRLINE', 'DISTANCE', 'TAIL_NUMBER',
    'TAXI_OUT', 'SCHEDULED_TIME', 'DEPARTURE_TIME', 'WHEELS_OFF',
    'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'DAY_OF_WEEK', 'TAXI_IN',
    'CANCELLATION_REASON',
]
flights_needed_data = flights_needed_data.drop(columns=unused_columns)

In [None]:
# Replace NaN values in each numeric column with that column's mean.
# numeric_only=True is needed because ORIGIN_AIRPORT and DESTINATION_AIRPORT
# are still present as strings; pandas >= 2.0 raises when asked for their mean.
flights_needed_data = flights_needed_data.fillna(flights_needed_data.mean(numeric_only=True))

In [None]:
# Accumulator for the new label column: 1 = delayed flight, 0 = not delayed.
result = list()

In [None]:
# A flight counts as delayed (1) when its arrival delay is above 15, else 0.
result.extend(
    1 if delay > 15 else 0
    for delay in flights_needed_data['ARRIVAL_DELAY']
)

In [None]:
# Append the delay labels as the last column, named 'result'.
flights_needed_data = flights_needed_data.assign(result=result)

In [None]:
# Display the frame with the new 'result' column.
flights_needed_data

In [None]:
# Number of rows for each value of 'result' (1 = delayed, 0 = not delayed).
flights_needed_data.value_counts('result')

In [None]:
# Drop the airport codes and the arrival columns, including ARRIVAL_DELAY,
# from which the 'result' label was derived.
label_source_columns = [
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
    'ARRIVAL_TIME',
    'ARRIVAL_DELAY',
]
flights_needed_data = flights_needed_data.drop(columns=label_source_columns)
flights_needed_data

In [None]:
# Features are every column but the last; the last column ('result') is the label.
data = flights_needed_data.to_numpy()
X = data[:, :-1]
y = data[:, -1]

# Hold out 30% of the rows for testing (70:30 split, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Fit the scaler on the training features only, then apply the same scaling
# to the test features. The second argument of fit_transform is `y`, so
# passing X_test there did not scale the test set.
# NOTE(review): the classifier below is trained on the unscaled X_train /
# X_test, so these scaled arrays are currently unused - confirm the intent.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train)
scaled_test_features = scaler.transform(X_test)

In [None]:
# Train a decision tree with default settings on the training split.
clf = DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Class probabilities for the test split; column 1 is used as the score.
pred_prob = clf.predict_proba(X_test)
positive_class_scores = pred_prob[:, 1]

# Area under the ROC curve on the test split.
auc_score = roc_auc_score(y_test, positive_class_scores)
auc_score