In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report, roc_auc_score, ConfusionMatrixDisplay, roc_curve, auc


In [2]:
df = pd.read_csv("clean_data.csv")
df.head()


Unnamed: 0,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,name,dep_sched_datetime,is_long_flight,is_delayed
0,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,United Air Lines Inc.,2013-01-01 05:15:00,False,True
1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,United Air Lines Inc.,2013-01-01 05:15:00,False,True
2,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,American Airlines Inc.,2013-01-01 05:15:00,False,True
3,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,JetBlue Airways,2013-01-01 05:15:00,False,False
4,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,Delta Air Lines Inc.,2013-01-01 06:15:00,False,False


### Подготовка к обучению моделей

In [3]:
X = df[['carrier', 'origin', 'dest', 'sched_dep_time', 'sched_arr_time', 'distance']]

# Perform one-hot encoding for categorical variables (carrier, origin, dest)
X = pd.get_dummies(X, columns=['carrier', 'origin', 'dest'], drop_first=False)

X

Unnamed: 0,sched_dep_time,sched_arr_time,distance,carrier_9E,carrier_AA,carrier_AS,carrier_B6,carrier_DL,carrier_EV,carrier_F9,...,dest_SNA,dest_SRQ,dest_STL,dest_STT,dest_SYR,dest_TPA,dest_TUL,dest_TVC,dest_TYS,dest_XNA
0,515,819,1400,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,529,830,1416,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,540,850,1089,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,545,1022,1576,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,600,837,762,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327299,2245,2351,209,False,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
327300,2250,7,301,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
327301,2246,1,264,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
327302,2255,2358,187,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, df.is_delayed, test_size=0.2, random_state=12)

### К-ближайших соседей (KNN)

In [5]:
model_KNN = KNeighborsClassifier()
KNN_params= {"n_neighbors": [2, 3, 4, 5]}
grid_search_KNN = GridSearchCV(model_KNN, KNN_params, cv=5, n_jobs=-1)
grid_search_KNN.fit(X_train, y_train)

#### Оценка качества

In [None]:
best_model_KNN = grid_search_KNN.best_estimator_
best_params_KNN = grid_search_KNN.best_params_
best_score_KNN = grid_search_KNN.best_score_
pred_KNN = best_model_KNN.predict(X_test)
cm_KNN = confusion_matrix(y_test, pred_KNN)
print(f"best params: {best_params_KNN}")
print(f"best score: {best_score_KNN}")
print(f"Train score: {best_model_KNN.score(X_train, y_train)}")
print(f"Test score: {best_model_KNN.score(X_test, y_test)}")
print(f"Confusion_matrix:\n{confusion_matrix(y_test, pred_KNN)}")
print(f"Precision, recall and f1-score:\n{classification_report(y_test, pred_KNN, target_names=['On-Time', 'Delayed'])}")
print(f"ROC-AUC score: {roc_auc_score(y_test, best_model_KNN.predict_proba(X_test)[:, 1])}")
print(f"CV_results: {pd.DataFrame(grid_search_KNN.cv_results_)}")

best params: {'n_neighbors': 4}
best score: 0.6128710683390759


In [None]:
fpr, tpr, thresholds = roc_curve(y_test, best_model_KNN.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()


In [None]:
ConfusionMatrixDisplay(confusion_matrix=cm_KNN, display_labels=['On-Time', 'Delayed']).plot()

### Машина опорных векторов (SVM)

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100],
}

svm = LinearSVC(random_state=12) 
grid_search_SVM = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_SVM.fit(X_train, y_train)

In [None]:
best_model_SVM = grid_search_SVM.best_estimator_
best_params_SVM = grid_search_SVM.best_params_
best_score_SVM = grid_search_SVM.best_score_
pred_SVM = best_model_SVM.predict(X_test)
cm_SVM = confusion_matrix(y_test, pred_SVM)
print(f"best params: {best_params_SVM}")
print(f"best score: {best_score_SVM}")
print(f"Train score: {best_model_SVM.score(X_train, y_train)}")
print(f"Test score: {best_model_SVM.score(X_test, y_test)}")
print(f"Confusion_matrix:\n{confusion_matrix(y_test, pred_SVM)}")
print(f"Precision, recall and f1-score:\n{classification_report(y_test, pred_SVM, target_names=['On-Time', 'Delayed'])}")
print(f"ROC-AUC score: {roc_auc_score(y_test, best_model_SVM.decision_function(X_test))}")
print(f"CV_results: {pd.DataFrame(grid_search_SVM.cv_results_)}")


In [None]:
fpr, tpr, thresholds = roc_curve(y_test, best_model_SVM.decision_function(X_test))
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (SVM)')
plt.legend(loc='lower right')
plt.show()


In [None]:
ConfusionMatrixDisplay(confusion_matrix=cm_SVM, display_labels=['On-Time', 'Delayed']).plot()

### Случайный лес (Random Forest)

In [None]:
rf = RandomForestClassifier(random_state=12)

param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [10, 20, None],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
}

grid_search_RF = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_RF.fit(X_train, y_train)

In [None]:
best_model_RF = grid_search_RF.best_estimator_
best_params_RF = grid_search_RF.best_params_
best_score_RF = grid_search_RF.best_score_
print(f"best params: {best_params_RF}")
print(f"best score: {best_score_RF}")
pred_RF = best_model_RF.predict(X_test)
cm_RF = confusion_matrix(y_test, pred_RF)
print(f"Confusion_matrix:\n{confusion_matrix(y_test, pred_RF)}")
print(f"Precision, recall, and f1-score:\n{classification_report(y_test, pred_RF, target_names=['On-Time', 'Delayed'])}")
print(f"ROC-AUC score: {roc_auc_score(y_test, best_model_RF.predict_proba(X_test)[:, 1])}")


In [None]:
y_pred_prob = best_model_RF.predict_proba(X_test)[:, 1]

# Compute the ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Visualize the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Extract feature importances
feature_importances = best_model_RF.feature_importances_

# Create a DataFrame with feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Number of top features to visualize
top_n = 10

# Plot the top N important features
plt.figure(figsize=(12, 6))
plt.bar(feature_importance_df['Feature'][:top_n], feature_importance_df['Importance'][:top_n], color='skyblue')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title(f'Top {top_n} Important Features')
plt.xticks(rotation=45)
plt.show()