In [None]:
# importing the required modules for random forest classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [None]:
# read the cleaned flight data from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='cleaned_flight_data.csv')
# leave the frame as the cell's last expression so the notebook renders it
data

In [None]:
# the target is the 'price_above_average' column
y = data.loc[:, 'price_above_average']
# every other column of the dataset is used as a feature
X = data.drop(columns=['price_above_average'])

In [None]:
# hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)
# report the shapes of the four resulting pieces on a single line
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Sweep max_depth from 1 to 15 and display the accuracy of each model.
# max_depth is a hyperparameter, so every candidate is scored on a validation
# split carved out of the training data. Scoring the candidates on X_test would
# let the test set drive model selection and make the test accuracy reported
# for the chosen model optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
for depth in range(1, 16):
    depth_model = RandomForestClassifier(max_depth=depth, random_state=0)
    # fit on the reduced training portion only; X_test is never touched here
    depth_model.fit(X_fit, y_fit)
    val_accuracy = accuracy_score(y_val, depth_model.predict(X_val))
    print('Max Depth:', depth, 'Validation Accuracy:', val_accuracy)

In [None]:
# train the random forest with max_depth=12, the depth the author picked as having the highest accuracy
rf_classifier = RandomForestClassifier(max_depth=12, random_state=0).fit(X_train, y_train)
# predict the target for the held-out test rows
y_pred = rf_classifier.predict(X_test)
# compare the predictions with the true test labels and report the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

In [None]:
# importances the fitted forest assigned to each feature
feature_importance = rf_classifier.feature_importances_
# pair every feature column with its importance and rank from highest to lowest
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)
# show the ranked table
print(feature_importance_df)

In [None]:
# visualizing the feature importance as a bar chart, one bar per feature
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
# draw the feature names vertically so neighbouring tick labels do not overlap
plt.xticks(rotation=90)
# label the axes and title the figure so it can be read without the code around it
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Random forest feature importance')
plt.show()

In [None]:
# retrain after removing 'day_of_week' (its seven per-day columns), which the author found to be the least important feature
y = data['price_above_average']
X = data.drop(
    columns=[
        'price_above_average',
        'day_of_week_Monday',
        'day_of_week_Tuesday',
        'day_of_week_Wednesday',
        'day_of_week_Thursday',
        'day_of_week_Friday',
        'day_of_week_Saturday',
        'day_of_week_Sunday',
    ]
)
# same split settings and model settings as the full-feature model, so the scores are comparable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
rf_classifier = RandomForestClassifier(random_state=0, max_depth=12).fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
# this model is slightly less accurate than the previous model

In [None]:
# go back to the full feature set and refit, so the evaluation cells that follow use the all-features model
y = data.loc[:, 'price_above_average']
X = data.drop(columns='price_above_average')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)
rf_classifier = RandomForestClassifier(random_state=0, max_depth=12).fit(X_train, y_train)
# score the refitted model on the held-out rows
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

In [None]:
# build the text classification report for the test predictions, then print it
from sklearn.metrics import classification_report
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
# evaluating the model using a confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
# plotting the confusion matrix
import seaborn as sns
plt.figure(figsize=(10, 5))
sns.heatmap(confusion_matrix, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()