In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Read the on-time CSV (path is relative to the notebook's working directory)
data = pd.read_csv("../data/Jan_2020_ontime.csv")

# Quick sanity check: frame dimensions, then a preview of the leading rows
print("Dataset shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head(n=5))

In [None]:
# Drop rows with missing values in the columns used as features/target,
# then derive the departure hour from the hhmm-style DEP_TIME value.
# Integer division is vectorized (no per-row Python lambda), and `% 24`
# folds a 2400 reading into hour 0 so midnight is not split across two
# separate hour values (0 and 24).
# NOTE(review): assumes DEP_TIME is a non-negative hhmm number — confirm
# with data['DEP_TIME'].describe().
data = (
    data
    .dropna(subset=['DEP_TIME', 'OP_UNIQUE_CARRIER', 'DEP_DEL15'])
    .assign(DEP_HOUR=lambda df: (df['DEP_TIME'] // 100).astype('int64') % 24)
)

# Select features and target
X = data[['DAY_OF_WEEK', 'DEP_HOUR', 'OP_UNIQUE_CARRIER']]
y = data['DEP_DEL15']

# One-hot encode the carrier code; drop_first omits one reference level
X = pd.get_dummies(X, columns=['OP_UNIQUE_CARRIER'], drop_first=True)

# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# each DEP_DEL15 class the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Keep the un-encoded carrier code and the derived departure hour of the
# test rows (looked up by index label) for the per-carrier / per-hour
# breakdowns further down.
original_test_data = data.loc[X_test.index, ['OP_UNIQUE_CARRIER', 'DEP_HOUR']]

# Fit a 100-tree random forest (depth capped at 10, fixed seed) on the
# training split; fit() returns the estimator itself, so it can be chained.
print("\nTraining Random Forest model...")
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = rf_model.predict(X_test)

# Held-out performance: accuracy, confusion matrix, then per-class metrics.
# The confusion matrix is kept in `cm` because it is plotted later.
cm = confusion_matrix(y_test, y_pred)
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:", cm, sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Confusion-matrix heatmap, drawn on an explicit Axes with integer cell labels
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Pair each training column with the forest's importance score and rank
# them from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bars for the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance.iloc[:10], x='Importance', y='Feature', ax=ax)
ax.set_title('Top 10 Feature Importances')
fig.tight_layout()
plt.show()

# Analyze performance by carrier
# Line the test labels and predictions up (positionally) with the un-encoded
# carrier code and departure hour of the same test rows.
results_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred,
    'Carrier': original_test_data['OP_UNIQUE_CARRIER'].values,
    'Hour': original_test_data['DEP_HOUR'].values
})

# Accuracy per carrier = mean of the "prediction equals label" mask within
# each carrier. This is the same number accuracy_score would give per group,
# but it is vectorized and avoids groupby.apply on whole sub-frames (which
# warns on recent pandas because the grouping column is passed to the
# callback).
is_correct = results_df['Actual'] == results_df['Predicted']
carrier_accuracy = (
    is_correct
    .groupby(results_df['Carrier'])
    .mean()
    .sort_values(ascending=False)
)

# Plot accuracy by carrier, best to worst
fig, ax = plt.subplots(figsize=(10, 6))
carrier_accuracy.plot(kind='bar', ax=ax)
ax.set_title('Model Accuracy by Carrier')
ax.set_xlabel('Carrier')
ax.set_ylabel('Accuracy')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# Analyze performance by hour
# Accuracy per departure hour = mean of the "prediction equals label" mask
# within each hour. Equivalent to calling accuracy_score per group, but
# vectorized and free of the groupby.apply-on-sub-frames pattern that warns
# on recent pandas. The result is indexed by hour in ascending order.
hour_is_correct = results_df['Actual'] == results_df['Predicted']
hour_accuracy = hour_is_correct.groupby(results_df['Hour']).mean()

# Plot accuracy by hour
fig, ax = plt.subplots(figsize=(10, 6))
hour_accuracy.plot(kind='line', marker='o', ax=ax)
ax.set_title('Model Accuracy by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Accuracy')
ax.grid(True)
fig.tight_layout()
plt.show()

# Summary printout: overall accuracy, the three top-ranked features, and the
# first / last entries of carrier_accuracy (sorted descending, so these are
# the carriers with the highest and lowest per-carrier accuracy).
insight_lines = [
    "\nRandom Forest Model Insights:",
    f"1. Overall model accuracy: {accuracy_score(y_test, y_pred):.4f}",
    f"2. Top 3 most important features: {feature_importance['Feature'].iloc[:3].tolist()}",
    f"3. Best performing carrier: {carrier_accuracy.index[0]} with accuracy {carrier_accuracy.iloc[0]:.4f}",
    f"4. Worst performing carrier: {carrier_accuracy.index[-1]} with accuracy {carrier_accuracy.iloc[-1]:.4f}",
]
print(*insight_lines, sep="\n")