In [28]:
import pandas as pd
import google.colab
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

### 0. Mount Google Drive and load the dataset

In [29]:
#PATH = pd.read_csv("C:\Users\Rufy\Repo\ca683_flight_delay_analytics\Data\Final\flight_total.csv")

# mount google drive
drive.mount('/content/drive', force_remount=True)

PATH = "/content/drive/MyDrive/flight_total.csv"
!ls {PATH}

In [30]:
# Read the whole flight dataset from the Drive path into a DataFrame.
data = pd.read_csv(filepath_or_buffer=PATH)

TypeError: argument of type 'method' is not iterable

In [31]:
# First look at the loaded frame: leading rows, then the per-column count of
# missing values, then the number of rows and the number of columns.
print(data.head())
print(data.isna().sum())

n_rows, n_cols = data.shape
print(n_rows)
print(n_cols)

      FL_DATE OP_CARRIER ORIGIN_CITY_NAME  CRS_DEP_TIME  DEP_TIME  DEP_DELAY  \
0  01/01/2018         AA          Chicago           500     455.0       -5.0   
1  01/01/2018         AA          Chicago           515     509.0       -6.0   
2  01/01/2018         AA          Chicago           529     527.0       -2.0   
3  01/01/2018         AA          Chicago           630     625.0       -5.0   
4  01/01/2018         AA          Chicago           703     658.0       -5.0   

   FLIGHT_DELAY_LEVEL  
0                   0  
1                   0  
2                   0  
3                   0  
4                   0  
FL_DATE               0
OP_CARRIER            0
ORIGIN_CITY_NAME      0
CRS_DEP_TIME          0
DEP_TIME              0
DEP_DELAY             0
FLIGHT_DELAY_LEVEL    0
dtype: int64
3678859
7


In [32]:
# A row without a label cannot be used for supervised training, so rows whose
# target value is NaN are removed from `data` in place.
target_col = "FLIGHT_DELAY_LEVEL"
data.dropna(axis=0, subset=[target_col], inplace=True)

In [33]:
# Prepare the feature matrix X and the target vector y.
#
# The target column is deliberately NOT part of X. The original feature list
# contained FLIGHT_DELAY_LEVEL, which is also y: the model could read the
# answer straight from its input (target leakage), so any reported score
# would be meaningless.
TARGET_COLUMN = "FLIGHT_DELAY_LEVEL"
FEATURE_COLUMNS = [
    "FL_DATE",
    "OP_CARRIER",
    "ORIGIN_CITY_NAME",
    "CRS_DEP_TIME",
    "DEP_TIME",
    "DEP_DELAY",
]
# NOTE(review): DEP_DELAY (and DEP_TIME relative to CRS_DEP_TIME) presumably
# determine FLIGHT_DELAY_LEVEL. If the level was binned from the delay, these
# columns leak the target as well -- confirm how the label was built.

X = data[FEATURE_COLUMNS]

y = data[TARGET_COLUMN]

In [34]:
# Parse FL_DATE into real timestamps. Two textual layouts are tried
# independently (rows that do not match a layout become NaT under
# errors='coerce'); the year-month-day parse then fills whatever the
# day/month/year parse could not read.
raw_dates = data['FL_DATE']
dates_dmy = pd.to_datetime(raw_dates, format='%d/%m/%Y', errors='coerce')
dates_ymd = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
data['FL_DATE'] = dates_dmy.fillna(dates_ymd)

print(data['FL_DATE'])

0         2018-01-01
1         2018-01-01
2         2018-01-01
3         2018-01-01
4         2018-01-01
             ...    
3678854   2023-12-31
3678855   2023-12-31
3678856   2023-12-31
3678857   2023-12-31
3678858   2023-12-31
Name: FL_DATE, Length: 3678859, dtype: datetime64[ns]


In [35]:
# Convert the FL_DATE column to an integer of the form YYYYMMDD.
# The dtype checks make the cell safe to re-run: after the first run FL_DATE is
# already an integer column and no longer has the .dt accessor.
fl_date = data['FL_DATE']
if pd.api.types.is_datetime64_any_dtype(fl_date):
    data['FL_DATE'] = fl_date.dt.strftime('%Y%m%d').astype(int)
elif not pd.api.types.is_integer_dtype(fl_date):
    raise TypeError(
        "FL_DATE must be parsed to datetime before it can be converted to int"
    )

# Categorical variable encoding: one indicator column per distinct carrier and
# per distinct origin city.
CATEGORICAL_FEATURES = ['OP_CARRIER', 'ORIGIN_CITY_NAME']
NUMERIC_FEATURES = ['FL_DATE', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY']
df_encoded = pd.get_dummies(data[CATEGORICAL_FEATURES])

# Rebuild X from the converted data. Previously df_encoded was never used and X
# (copied from `data` before any conversion) still held the raw date/carrier/
# city strings, which a RandomForestClassifier cannot fit on. The target column
# FLIGHT_DELAY_LEVEL is intentionally left out of the features. X keeps the
# index of `data`, so it stays aligned with y.
X = pd.concat([data[NUMERIC_FEATURES], df_encoded], axis=1)

### 1. Random Forest Modelling without parameter tuning

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Baseline forest: only the tree count and the seed are set explicitly.
baseline_params = {"n_estimators": 1000, "random_state": 0}
model = RandomForestClassifier(**baseline_params)
model.fit(x_train, y_train)

# Predict the held-out rows, then report overall accuracy followed by the
# per-class classification report.
predictions = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy : {accuracy:.2f}")
print(classification_report(y_test, predictions))

In [None]:
# Score the fitted model on the held-out split.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

# Single-number summaries; precision, recall and F1 all use average='weighted'.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Print out the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

### 2. Search for best parameter combinations

In [None]:
# Base estimator for the search. It is seeded like the other forests in this
# notebook so that the search result is reproducible between runs.
model = RandomForestClassifier(random_state=0)

# Define the hyperparameter grid to search.
# 6 * 3 * 3 * 5 = 270 combinations, each fitted cv=5 times (1350 fits in total),
# so expect this cell to be slow on the full training split.
param_grid = {
    'max_depth': [5, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_leaf_nodes': [100, 200, 300, 400, 500]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Train the model using GridSearchCV.
# The training split is called x_train (lowercase) everywhere in this notebook;
# the previous X_train was never defined and raised NameError.
grid_search.fit(x_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

### 3. Random Forest Modelling with parameter tuning

In [None]:
# Same 80/20 split and seed as the baseline run.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Forest with explicitly chosen hyperparameters.
# NOTE(review): presumably these values come from the grid search -- confirm.
tuned_params = dict(
    n_estimators=100,
    random_state=0,
    max_depth=30,
    max_features="sqrt",
    min_samples_leaf=1,
    max_leaf_nodes=400,
    min_samples_split=2,
)
model = RandomForestClassifier(**tuned_params)
model.fit(x_train, y_train)

# Predict the held-out rows, then report overall accuracy followed by the
# per-class classification report.
predictions = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy : {accuracy:.2f}")
print(classification_report(y_test, predictions))


In [38]:
# Score the fitted model on the held-out split.
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

# Single-number summaries; precision, recall and F1 all use average='weighted'.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Print out the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

MemoryError: Unable to allocate 24.0 GiB for an array with shape (2191, 2943087) and data type float32