# 📊 Analiza ML - Przewidywanie opóźnień lotów

## 🚀 Szybki start

### Wymagania:
```bash
pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm imbalanced-learn kagglehub joblib
```

### Automatyczne pobieranie danych:
- Notebook automatycznie pobierze dane z Kaggle przy pierwszym uruchomieniu
- Wymagane: konto Kaggle i token API ([instrukcja](https://github.com/Kaggle/kagglehub))
- Dataset: [US Flight Delays](https://www.kaggle.com/datasets/usdot/flight-delays)

### Alternatywnie - pobierz dane ręcznie:
```python
import kagglehub
kagglehub.dataset_download("usdot/flight-delays")
```

---

In [ ]:
# Automatic data download - run this cell first!
#
# Resolves DATASET_PATH: a local data folder is preferred, otherwise the
# dataset is downloaded from Kaggle. The cell raises if the download fails
# or if any of the required CSV files is missing.
import os
import sys

# Make sure kagglehub is importable; install it on the fly when it is missing.
try:
    import kagglehub
except ImportError:
    print("Instalujƒô kagglehub...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "kagglehub"])
    import kagglehub

# Candidate local folders, checked in this order before falling back to Kaggle.
possible_paths = [
    'data',  # folder next to the notebook
    '../data',  # one level up
    os.path.join(os.getcwd(), 'data'),
]

DATASET_PATH = None

# Use the first local folder that contains flights.csv.
for path in possible_paths:
    if os.path.exists(path) and os.path.exists(os.path.join(path, 'flights.csv')):
        DATASET_PATH = path
        print(f"‚úì Znaleziono dane lokalnie w: {DATASET_PATH}")
        break

# Nothing found locally: download from Kaggle.
if DATASET_PATH is None:
    print("üì• Pobieram dane z Kaggle (to mo≈ºe chwilƒô potrwaƒá za pierwszym razem)...")
    try:
        DATASET_PATH = kagglehub.dataset_download("usdot/flight-delays")
        print(f"‚úì Dane pobrane do: {DATASET_PATH}")
    except Exception as e:
        # Print troubleshooting hints, then re-raise so the failure stays visible.
        print(f"‚ùå B≈ÇƒÖd pobierania: {e}")
        print("\nüîß RozwiƒÖzania:")
        print("1. Upewnij siƒô, ≈ºe masz konto Kaggle i skonfigurowany token API")
        print("   - Zaloguj siƒô na https://www.kaggle.com")
        print("   - Id≈∫ do Account -> Create New API Token")
        print("   - Zapisz plik kaggle.json w ~/.kaggle/ (Linux/Mac) lub C:\\Users\\[username]\\.kaggle\\ (Windows)")
        print("\n2. Lub pobierz dane rƒôcznie:")
        print("   - https://www.kaggle.com/datasets/usdot/flight-delays")
        print("   - Rozpakuj do folderu 'data' obok tego notebooka")
        raise

# Verify that every file the notebook reads is present.
required_files = ['flights.csv', 'airlines.csv', 'airports.csv']
missing_files = [
    name for name in required_files
    if not os.path.exists(os.path.join(DATASET_PATH, name))
]

if missing_files:
    print(f"‚ùå Brakuje plik√≥w: {missing_files}")
    raise FileNotFoundError(f"Nie znaleziono wymaganych plik√≥w: {missing_files}")
else:
    print("‚úÖ Wszystkie pliki danych sƒÖ dostƒôpne!")
    print(f"üìÅ Lokalizacja: {os.path.abspath(DATASET_PATH)}")

# Reuse the path resolved and validated above. Downloading unconditionally at
# this point would discard a local data folder and require Kaggle credentials
# even when the files are already on disk.
dataset_path = DATASET_PATH

print("Path to dataset files:", DATASET_PATH)

In [ ]:
# Import bibliotek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, recall_score, 
    precision_score, confusion_matrix, classification_report, roc_curve
)
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
import warnings
import time
warnings.filterwarnings('ignore')

# Plot configuration
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

print("Biblioteki za≈Çadowane pomy≈õlnie!")

# Load the data (DATASET_PATH is set by the download cell).
# Only the first 500,000 rows of flights.csv are read.
print("\nüìä Wczytywanie danych...")
try:
    flights = pd.read_csv(os.path.join(DATASET_PATH, 'flights.csv'), nrows=500000)
    airlines = pd.read_csv(os.path.join(DATASET_PATH, 'airlines.csv'))
    airports = pd.read_csv(os.path.join(DATASET_PATH, 'airports.csv'))

    # Summary of what was loaded
    for summary_line in (
        f"‚úì Wczytano {len(flights):,} lot√≥w (sample)",
        f"‚úì Liczba linii lotniczych: {len(airlines)}",
        f"‚úì Liczba lotnisk: {len(airports)}",
    ):
        print(summary_line)

    # Preview of the flights table
    print("\nüìã Przyk≈Çadowe dane:")
    display(flights.head())

except Exception as e:
    # Report the problem and re-raise so the notebook stops here.
    print(f"‚ùå B≈ÇƒÖd wczytywania danych: {e}")
    print("Upewnij siƒô, ≈ºe uruchomi≈Çe≈õ pierwszƒÖ kom√≥rkƒô z pobieraniem danych!")
    raise

In [ ]:
# Load the data (a 500,000-row sample of flights.csv is used for speed).
# NOTE(review): the preceding cell reads the same three files with the same
# arguments, so this cell repeats that work.
print("Wczytywanie danych...")
flights = pd.read_csv(os.path.join(DATASET_PATH, 'flights.csv'), nrows=500000)
airlines = pd.read_csv(os.path.join(DATASET_PATH, 'airlines.csv'))
airports = pd.read_csv(os.path.join(DATASET_PATH, 'airports.csv'))

# Summary of what was loaded
for summary_line in (
    f"‚úì Wczytano {len(flights):,} lot√≥w (sample)",
    f"‚úì Liczba linii lotniczych: {len(airlines)}",
    f"‚úì Liczba lotnisk: {len(airports)}",
):
    print(summary_line)

# Preview of the flights table (last expression, rendered as the cell output)
print("\nüìã Przyk≈Çadowe dane:")
flights.head()

## Przygotowanie danych podstawowych

In [None]:
# Basic data cleaning: builds `df`, the cleaned frame every later stage copies,
# adds the binary DELAYED target and plots an overview of the delays.
df = flights.copy()

# Drop cancelled flights
df = df[df['CANCELLED'] == 0]
print(f"Po usuniƒôciu odwo≈Çanych: {len(df)} lot√≥w")

# Drop rows with missing values in the key columns
key_columns = ['DEPARTURE_DELAY', 'AIRLINE', 'ORIGIN_AIRPORT', 
               'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DISTANCE']
df = df.dropna(subset=key_columns)
print(f"Po usuniƒôciu brak√≥w: {len(df)} lot√≥w")

# Target variable: 1 when the departure delay exceeds 15 minutes, else 0
df['DELAYED'] = (df['DEPARTURE_DELAY'] > 15).astype(int)
print(f"\nProcent op√≥≈∫nionych lot√≥w: {df['DELAYED'].mean()*100:.2f}%")

# Visualise the delay distribution
plt.figure(figsize=(15, 5))

# Panel 1: histogram restricted to the -30..120 minute range
plt.subplot(1, 3, 1)
delays_for_plot = df['DEPARTURE_DELAY'][(df['DEPARTURE_DELAY'] >= -30) & (df['DEPARTURE_DELAY'] <= 120)]
plt.hist(delays_for_plot, bins=50, edgecolor='black', alpha=0.7)
plt.axvline(x=15, color='red', linestyle='--', label='Pr√≥g 15 min')
plt.title('Rozk≈Çad op√≥≈∫nie≈Ñ (-30 do 120 min)')
plt.xlabel('Op√≥≈∫nienie (minuty)')
plt.ylabel('Liczba lot√≥w')
plt.legend()

# Panel 2: histogram of delays above 300 minutes
plt.subplot(1, 3, 2)
extreme_delays = df[df['DEPARTURE_DELAY'] > 300]
plt.hist(extreme_delays['DEPARTURE_DELAY'], bins=30, edgecolor='black', alpha=0.7, color='orange')
plt.title(f'Ekstremalne op√≥≈∫nienia (>300 min)\nn={len(extreme_delays)}')
plt.xlabel('Op√≥≈∫nienie (minuty)')
plt.ylabel('Liczba lot√≥w')

# Panel 3: class balance
plt.subplot(1, 3, 3)
# value_counts() orders classes by frequency, so pin the order to [0, 1];
# otherwise the hard-coded labels below would be swapped whenever delayed
# flights are the larger class.
delay_counts = df['DELAYED'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(delay_counts.values, labels=['Na czas (‚â§15 min)', 'Op√≥≈∫niony (>15 min)'], 
        autopct='%1.1f%%', startangle=90, colors=['lightgreen', 'salmon'])
plt.title('Balans klas')

plt.tight_layout()
plt.show()

print(f"\nMax op√≥≈∫nienie: {df['DEPARTURE_DELAY'].max():.0f} minut")
print(f"Op√≥≈∫nienia >300 min: {len(extreme_delays)} ({len(extreme_delays)/len(df)*100:.2f}%)")

# ETAP 1: Model Baseline (10% recall)

Prosty model z podstawowymi cechami - punkt startowy dla dalszych ulepszeń.

In [None]:
banner = "=" * 50
print(banner)
print("ETAP 1: MODEL BASELINE")
print(banner)

# Work on a copy of the cleaned data for stage 1
df_stage1 = df.copy()

# MISTAKE 1: outliers are removed here (this is fixed in a later stage).
# Only delays within [-30, 300] minutes are kept.
within_range = df_stage1['DEPARTURE_DELAY'].between(-30, 300)
df_stage1 = df_stage1[within_range]

# Down-sample to 100,000 rows for speed
sample_size = 100000
if len(df_stage1) > sample_size:
    df_stage1 = df_stage1.sample(n=sample_size, random_state=42)

print(f"U≈ºywamy {len(df_stage1)} pr√≥bek")

# Basic feature engineering (12 features).
# The scheduled departure is zero-padded to four characters and its first two
# characters are taken as the hour.
padded_departure = df_stage1['SCHEDULED_DEPARTURE'].astype(str).str.zfill(4)
df_stage1['DEPARTURE_HOUR'] = padded_departure.str[:2].astype(int)

def get_time_of_day(hour):
    """Return the part-of-day label for an hour.

    Hours in [5, 12) are 'Morning', [12, 17) 'Afternoon', [17, 21) 'Evening';
    every other value is 'Night'.
    """
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for first_hour, end_hour, label in day_parts:
        if first_hour <= hour < end_hour:
            return label
    return 'Night'

# Derive the part-of-day label from the departure hour, row by row.
df_stage1['TIME_OF_DAY'] = df_stage1['DEPARTURE_HOUR'].apply(get_time_of_day)

def get_season(month):
    """Return the season label for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'; any other value
    -> 'Fall'.
    """
    months_by_season = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in months_by_season:
        if month in months:
            return season
    return 'Fall'

# Remaining stage-1 features: season, weekend flag, distance category
df_stage1['SEASON'] = df_stage1['MONTH'].apply(get_season)
df_stage1['IS_WEEKEND'] = df_stage1['DAY_OF_WEEK'].isin([6, 7]).astype(int)
df_stage1['DISTANCE_CATEGORY'] = pd.cut(
    df_stage1['DISTANCE'],
    bins=[0, 500, 1000, 2000, 5000],
    labels=['Short', 'Medium', 'Long', 'Very_Long'],
)

# Model features (12)
feature_columns_stage1 = [
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DEPARTURE_HOUR',
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'DISTANCE', 'IS_WEEKEND', 'TIME_OF_DAY', 'SEASON', 'DISTANCE_CATEGORY',
]

X_stage1 = df_stage1[feature_columns_stage1].copy()
y_stage1 = df_stage1['DELAYED']

# Label encoding: every categorical column is cast to str and replaced by
# integer codes from a freshly fitted LabelEncoder.
categorical_columns = [
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'TIME_OF_DAY', 'SEASON', 'DISTANCE_CATEGORY',
]

for col in categorical_columns:
    le = LabelEncoder()
    X_stage1[col] = le.fit_transform(X_stage1[col].astype(str))

# Stratified 80/20 train/test split
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_stage1,
    y_stage1,
    test_size=0.2,
    random_state=42,
    stratify=y_stage1,
)

print(f"\nCechy: {len(feature_columns_stage1)}")
print(f"Zbi√≥r treningowy: {len(X_train1)}, testowy: {len(X_test1)}")
print(f"Procent op√≥≈∫nie≈Ñ: {y_stage1.mean()*100:.2f}%")

In [None]:
# Train the baseline models
print("\nTrenowanie modeli baseline...")

# Random Forest
rf_baseline = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)

start = time.time()
rf_baseline.fit(X_train1, y_train1)
print(f"Random Forest - czas trenowania: {time.time()-start:.1f}s")

# XGBoost
xgb_baseline = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss',
)

start = time.time()
xgb_baseline.fit(X_train1, y_train1)
print(f"XGBoost - czas trenowania: {time.time()-start:.1f}s")

# Test-set predictions
y_pred_rf1 = rf_baseline.predict(X_test1)
y_pred_xgb1 = xgb_baseline.predict(X_test1)

# Results: recall, precision and F1 for each model
print("\n=== WYNIKI ETAP 1 (BASELINE) ===")
for model_name, predictions in (
    ("Random Forest", y_pred_rf1),
    ("XGBoost", y_pred_xgb1),
):
    print(f"\n{model_name}:")
    print(f"Recall: {recall_score(y_test1, predictions)*100:.1f}%")
    print(f"Precision: {precision_score(y_test1, predictions)*100:.1f}%")
    print(f"F1-Score: {f1_score(y_test1, predictions):.3f}")

# Confusion matrices, drawn side by side
cm_rf1 = confusion_matrix(y_test1, y_pred_rf1)
cm_xgb1 = confusion_matrix(y_test1, y_pred_xgb1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

for axis, matrix, colormap, title in (
    (ax1, cm_rf1, 'Blues', 'Random Forest - Etap 1'),
    (ax2, cm_xgb1, 'Greens', 'XGBoost - Etap 1'),
):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colormap, ax=axis)
    axis.set_title(title)
    axis.set_xlabel('Przewidywane')
    axis.set_ylabel('Rzeczywiste')

plt.tight_layout()
plt.show()

print("\n‚ö†Ô∏è PROBLEM: Bardzo niski recall (~10%) - model przewiduje g≈Ç√≥wnie loty na czas!")

# Feature importance for the baseline model.
#
# rf_baseline / xgb_baseline were already trained, evaluated and plotted
# earlier in this cell, so they are not trained a second time here.

# get_feature_label is defined in a later cell of this notebook. Fall back to
# the raw column names when that cell has not been run yet, instead of failing
# with a NameError on a top-to-bottom run.
feature_label_for = globals().get('get_feature_label', lambda name: name)

importance1 = pd.DataFrame({
    'feature': X_train1.columns,
    'importance': xgb_baseline.feature_importances_
}).sort_values('importance', ascending=False)

# Add descriptive labels
importance1['label'] = importance1['feature'].apply(feature_label_for)

plt.figure(figsize=(12, 8))
top_features_baseline = importance1.head(12)  # all 12 features
plt.barh(range(len(top_features_baseline)), top_features_baseline['importance'])

# Use the descriptive labels on the Y axis
plt.yticks(range(len(top_features_baseline)), top_features_baseline['label'])

plt.xlabel('Wa≈ºno≈õƒá cechy', fontsize=12)
plt.title('Wa≈ºno≈õƒá cech - Etap 1 (Baseline Model)', fontsize=14)
plt.gca().invert_yaxis()

# Annotate each bar with its importance value
for i, v in enumerate(top_features_baseline['importance']):
    plt.text(v + 0.002, i, f'{v:.3f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print("\n‚ö†Ô∏è PROBLEM: Bardzo niski recall (~10%) - model przewiduje g≈Ç√≥wnie loty na czas!")
print("\nNajwa≈ºniejsze cechy w modelu baseline:")
# Number the rows by rank. iterrows() yields the DataFrame index label, which
# after sort_values is the feature's original column position, not its rank.
for rank, (_, row) in enumerate(top_features_baseline.head(5).iterrows(), start=1):
    print(f"{rank}. {row['feature']}: {row['label']} (wa≈ºno≈õƒá: {row['importance']:.3f})")

In [ ]:
# Mapping from feature (column) names to descriptive labels used on the plots.
# NOTE(review): a later cell in this notebook assigns FEATURE_LABELS and
# get_feature_label again, with different wording for some labels; when the
# cells run in order, that later definition is the one in effect.
FEATURE_LABELS = {
    # Time features
    'MONTH': 'MiesiƒÖc lotu',
    'DAY': 'Dzie≈Ñ miesiƒÖca',
    'DAY_OF_WEEK': 'Dzie≈Ñ tygodnia',
    'DEPARTURE_HOUR': 'Godzina odlotu',
    'DEPARTURE_MINUTE': 'Minuta odlotu',
    
    # Flight features
    'AIRLINE': 'Linia lotnicza',
    'ORIGIN_AIRPORT': 'Lotnisko wylotu',
    'DESTINATION_AIRPORT': 'Lotnisko docelowe',
    'DISTANCE': 'Dystans lotu (mile)',
    'LOG_DISTANCE': 'Log(dystans)',
    
    # Binary time flags
    'IS_WEEKEND': 'Czy weekend',
    'IS_FRIDAY': 'Czy piƒÖtek',
    'IS_MONDAY': 'Czy poniedzia≈Çek',
    'IS_RUSH_HOUR': 'Czy godziny szczytu (7-9, 17-19)',
    'IS_LATE_NIGHT': 'Czy p√≥≈∫na noc (22-5)',
    'IS_EARLY_MORNING': 'Czy wczesny ranek (4-6)',
    
    # Cyclical features
    'HOUR_SIN': 'Godzina (sk≈Çadowa sin)',
    'HOUR_COS': 'Godzina (sk≈Çadowa cos)',
    'MONTH_SIN': 'MiesiƒÖc (sk≈Çadowa sin)',
    'MONTH_COS': 'MiesiƒÖc (sk≈Çadowa cos)',
    
    # Season / holiday features
    'IS_HOLIDAY_SEASON': 'Czy okres ≈õwiƒÖteczny',
    'SEASON': 'Sezon roku',
    'TIME_OF_DAY': 'Pora dnia',
    
    # Airport / route features
    'ORIGIN_BUSY': 'Natƒô≈ºenie ruchu - lotnisko wylotu',
    'DEST_BUSY': 'Natƒô≈ºenie ruchu - lotnisko docelowe',
    'ORIGIN_CONGESTION': 'Zagƒôszczenie - lotnisko wylotu',
    'DEST_CONGESTION': 'Zagƒôszczenie - lotnisko docelowe',
    'ROUTE': 'Trasa lotu',
    'ROUTE_FREQ': 'Popularno≈õƒá trasy',
    'ROUTE_POPULARITY': 'Czƒôstotliwo≈õƒá trasy',
    
    # Delay-rate features
    'AIRLINE_DELAY_RATE': 'Wska≈∫nik op√≥≈∫nie≈Ñ linii',
    'ORIGIN_DELAY_RATE': 'Wska≈∫nik op√≥≈∫nie≈Ñ lotniska wylotu',
    
    # Distance categories
    'DISTANCE_BIN': 'Kategoria dystansu',
    'DISTANCE_CATEGORY': 'Kategoria odleg≈Ço≈õci',
    
    # Interaction features
    'RUSH_AIRLINE': 'Godziny szczytu √ó wska≈∫nik linii',
    'HOLIDAY_ORIGIN': '≈öwiƒôta √ó wska≈∫nik lotniska',
    'HOUR_AIRLINE': 'Godzina √ó wska≈∫nik linii',
    
    # Data leakage (erroneous feature)
    'DELAY_LOG': 'üö® LOG(OP√ì≈πNIENIE) - DATA LEAKAGE!'
}

def get_feature_label(feature_name):
    """Return the descriptive label for a feature.

    Falls back to ``feature_name`` itself when it has no entry in
    FEATURE_LABELS.
    """
    return FEATURE_LABELS.get(feature_name, feature_name)

print("Mapowanie cech utworzone - bƒôdzie u≈ºywane w wykresach")

In [ ]:
# Mapping from feature (column) names to descriptive plot labels
FEATURE_LABELS = {
    # Time features
    'MONTH': 'MiesiƒÖc lotu',
    'DAY': 'Dzie≈Ñ miesiƒÖca',
    'DAY_OF_WEEK': 'Dzie≈Ñ tygodnia',
    'DEPARTURE_HOUR': 'Godzina odlotu',
    'DEPARTURE_MINUTE': 'Minuta odlotu',

    # Flight features
    'AIRLINE': 'Linia lotnicza',
    'ORIGIN_AIRPORT': 'Lotnisko wylotu',
    'DESTINATION_AIRPORT': 'Lotnisko docelowe',
    'DISTANCE': 'Dystans lotu (mile)',
    'LOG_DISTANCE': 'Log(dystans)',

    # Binary time flags
    'IS_WEEKEND': 'Czy weekend',
    'IS_FRIDAY': 'Czy piƒÖtek',
    'IS_MONDAY': 'Czy poniedzia≈Çek',
    'IS_RUSH_HOUR': 'Czy godziny szczytu (7-9, 17-19)',
    'IS_LATE_NIGHT': 'Czy p√≥≈∫na noc (22-5)',
    'IS_EARLY_MORNING': 'Czy wczesny ranek (4-6)',

    # Cyclical features
    'HOUR_SIN': 'Godzina (sk≈Çadowa sin)',
    'HOUR_COS': 'Godzina (sk≈Çadowa cos)',
    'MONTH_SIN': 'MiesiƒÖc (sk≈Çadowa sin)',
    'MONTH_COS': 'MiesiƒÖc (sk≈Çadowa cos)',

    # Season / holiday features
    'IS_HOLIDAY_SEASON': 'Czy okres ≈õwiƒÖteczny',
    'SEASON': 'Sezon roku',
    'TIME_OF_DAY': 'Pora dnia',

    # Airport / route features
    'ORIGIN_BUSY': 'Natƒô≈ºenie ruchu - lotnisko wylotu',
    'DEST_BUSY': 'Natƒô≈ºenie ruchu - lotnisko docelowe',
    'ORIGIN_CONGESTION': 'Zagƒôszczenie - lotnisko wylotu',
    'DEST_CONGESTION': 'Zagƒôszczenie - lotnisko docelowe',
    'ROUTE': 'Trasa lotu',
    'ROUTE_FREQ': 'Popularno≈õƒá trasy',
    'ROUTE_POPULARITY': 'Czƒôstotliwo≈õƒá trasy',

    # Delay-rate features
    'AIRLINE_DELAY_RATE': 'Procent op√≥≈∫nie≈Ñ danej linii',
    'ORIGIN_DELAY_RATE': 'Procent op√≥≈∫nie≈Ñ lotniska wylotu',

    # Distance categories
    'DISTANCE_BIN': 'Kategoria dystansu',
    'DISTANCE_CATEGORY': 'Kategoria odleg≈Ço≈õci',

    # Interaction features
    'RUSH_AIRLINE': 'Ryzyko: godziny szczytu √ó op√≥≈∫nienia linii',
    'HOLIDAY_ORIGIN': 'Ryzyko: ≈õwiƒôta √ó op√≥≈∫nienia lotniska',
    'HOUR_AIRLINE': 'Ryzyko: godzina √ó op√≥≈∫nienia linii',

    # Data leakage (erroneous feature)
    'DELAY_LOG': 'üö® Logarytm op√≥≈∫nienia - wyciek danych',
}


def get_feature_label(feature_name):
    """Look up the descriptive label of a feature.

    Names without an entry in FEATURE_LABELS are returned unchanged.
    """
    try:
        return FEATURE_LABELS[feature_name]
    except KeyError:
        return feature_name


print("Mapowanie cech utworzone - bƒôdzie u≈ºywane w wykresach")

In [ ]:
# Label encoding.
# NOTE(review): X_stage2 / y_stage2 are not assigned anywhere in the visible
# part of this notebook - confirm that the cell preparing them exists.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE_BIN']
for col in categorical_columns:
    le = LabelEncoder()
    X_stage2[col] = le.fit_transform(X_stage2[col].astype(str))

# Stratified 80/20 train/test split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_stage2,
    y_stage2,
    test_size=0.2,
    random_state=42,
    stratify=y_stage2,
)

# SMOTE: oversample the training split only
smote = SMOTE(random_state=42, sampling_strategy=0.6)
X_train2_smote, y_train2_smote = smote.fit_resample(X_train2, y_train2)

# Train XGBoost
print("\nTrenowanie modelu z data leakage...")
xgb_leakage = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
    n_jobs=-1,
)

start = time.time()
xgb_leakage.fit(X_train2_smote, y_train2_smote)
print(f"Czas trenowania: {time.time()-start:.1f}s")

# Predicted probabilities for the positive class
y_proba2 = xgb_leakage.predict_proba(X_test2)[:, 1]

# Pick the threshold with the best F1 on a 0.30..0.68 grid.
# The F1 values are computed against the test labels.
thresholds = np.arange(0.3, 0.7, 0.02)
f1_scores = []
for thresh in thresholds:
    y_pred = (y_proba2 >= thresh).astype(int)
    f1_scores.append(f1_score(y_test2, y_pred))

best_index = np.argmax(f1_scores)
optimal_threshold = thresholds[best_index]
y_pred2 = (y_proba2 >= optimal_threshold).astype(int)

# Results
print("\n=== WYNIKI ETAP 2 (DATA LEAKAGE) ===")
print(f"Optymalny threshold: {optimal_threshold:.2f}")
print(f"Recall: {recall_score(y_test2, y_pred2)*100:.1f}% üöÄ")
print(f"Precision: {precision_score(y_test2, y_pred2)*100:.1f}%")
print(f"F1-Score: {f1_score(y_test2, y_pred2):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test2, y_proba2):.3f}")

# Confusion matrix
cm2 = confusion_matrix(y_test2, y_pred2)
plt.figure(figsize=(8, 6))
sns.heatmap(cm2, annot=True, fmt='d', cmap='Reds')
plt.title('Confusion Matrix - Etap 2 (Data Leakage)')
plt.xlabel('Przewidywane')
plt.ylabel('Rzeczywiste')
plt.show()

# Feature importance with descriptive labels
importance2 = pd.DataFrame({
    'feature': X_train2.columns,
    'importance': xgb_leakage.feature_importances_,
}).sort_values('importance', ascending=False)

# Add descriptive labels
importance2['label'] = importance2['feature'].apply(get_feature_label)

plt.figure(figsize=(12, 10))
top_features = importance2.head(15)
bar_positions = range(len(top_features))
plt.barh(bar_positions, top_features['importance'])

# Use the descriptive labels on the Y axis
plt.yticks(bar_positions, top_features['label'])

plt.xlabel('Wa≈ºno≈õƒá cechy', fontsize=12)
plt.title('Top 15 najwa≈ºniejszych cech - Etap 2 (Data Leakage)', fontsize=14)
plt.gca().invert_yaxis()

# Highlight the problematic feature: red, bold and slightly larger tick label
for position, feature in enumerate(top_features['feature']):
    tick_label = plt.gca().get_yticklabels()[position]
    if feature != 'DELAY_LOG':
        tick_label.set_fontsize(11)
        continue
    tick_label.set_color('red')
    tick_label.set_weight('bold')
    tick_label.set_fontsize(12)

# Annotate each bar with its importance value
for position, value in enumerate(top_features['importance']):
    plt.text(value + 0.002, position, f'{value:.3f}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print("\nüö® UWAGA: DELAY_LOG jest najwa≈ºniejszƒÖ cechƒÖ - to dow√≥d data leakage!")
print("Model 'oszukuje' u≈ºywajƒÖc informacji o op√≥≈∫nieniu do przewidywania op√≥≈∫nienia.")
print("\nOpisy najwa≈ºniejszych cech:")
for _, row in top_features.head(5).iterrows():
    print(f"- {row['feature']}: {row['label']} (wa≈ºno≈õƒá: {row['importance']:.3f})")

# ETAP 3: Fast Optimized Model (62% recall)

Model po usunięciu data leakage, ale z błędnym usuwaniem outlierów.

In [None]:
# Stage 3: builds df_stage3 from the cleaned data, engineers the features and
# produces X_stage3 / y_stage3 for the training cell.
print("="*50)
print("ETAP 3: FAST OPTIMIZED MODEL")
print("="*50)

# Work on a copy of the cleaned data for stage 3
df_stage3 = df.copy()

# MISTAKE: extreme delays are removed here!
df_stage3 = df_stage3[(df_stage3['DEPARTURE_DELAY'] >= -30) & 
                      (df_stage3['DEPARTURE_DELAY'] <= 300)]  # drops the hard cases!

print(f"‚ö†Ô∏è UWAGA: Usuniƒôto {len(df) - len(df_stage3)} lot√≥w z ekstremalnymi op√≥≈∫nieniami")

# Down-sample to 300,000 rows
if len(df_stage3) > 300000:
    df_stage3 = df_stage3.sample(n=300000, random_state=42)

print(f"U≈ºywamy {len(df_stage3)} pr√≥bek")

# Target variable: 1 when the departure delay exceeds 15 minutes
df_stage3['DELAYED'] = (df_stage3['DEPARTURE_DELAY'] > 15).astype(int)

# Feature engineering (18 model features, WITHOUT the leaking feature).
# The scheduled departure is zero-padded to four characters and its first two
# characters are taken as the hour.
df_stage3['DEPARTURE_HOUR'] = df_stage3['SCHEDULED_DEPARTURE'].astype(str).str.zfill(4).str[:2].astype(int)

# Cyclical encoding of the hour (24-hour period)
df_stage3['HOUR_SIN'] = np.sin(2 * np.pi * df_stage3['DEPARTURE_HOUR'] / 24)
df_stage3['HOUR_COS'] = np.cos(2 * np.pi * df_stage3['DEPARTURE_HOUR'] / 24)

# Time features: rush hour is 7-9 or 17-19 (both ends inclusive)
df_stage3['IS_RUSH_HOUR'] = (
    ((df_stage3['DEPARTURE_HOUR'] >= 7) & (df_stage3['DEPARTURE_HOUR'] <= 9)) |
    ((df_stage3['DEPARTURE_HOUR'] >= 17) & (df_stage3['DEPARTURE_HOUR'] <= 19))
).astype(int)

df_stage3['IS_WEEKEND'] = (df_stage3['DAY_OF_WEEK'].isin([6, 7])).astype(int)
df_stage3['IS_FRIDAY'] = (df_stage3['DAY_OF_WEEK'] == 5).astype(int)

# Airport congestion: number of sampled flights per origin / destination airport
df_stage3['ORIGIN_CONGESTION'] = df_stage3.groupby('ORIGIN_AIRPORT')['ORIGIN_AIRPORT'].transform('count')
df_stage3['DEST_CONGESTION'] = df_stage3.groupby('DESTINATION_AIRPORT')['DESTINATION_AIRPORT'].transform('count')

# Airline delay rate: mean of DELAYED per airline.
# NOTE(review): this is computed over all of df_stage3, i.e. before the
# train/test split, so rows that later land in the test set contribute to it.
airline_delay_rate3 = df_stage3.groupby('AIRLINE')['DELAYED'].mean()
df_stage3['AIRLINE_DELAY_RATE'] = df_stage3['AIRLINE'].map(airline_delay_rate3)

# Route popularity: number of sampled flights per origin_destination pair
df_stage3['ROUTE'] = df_stage3['ORIGIN_AIRPORT'] + '_' + df_stage3['DESTINATION_AIRPORT']
df_stage3['ROUTE_POPULARITY'] = df_stage3.groupby('ROUTE')['ROUTE'].transform('count')

# Distance bins
df_stage3['DISTANCE_BIN'] = pd.cut(df_stage3['DISTANCE'], 
                                   bins=[0, 500, 1000, 2000, 5000], 
                                   labels=['Short', 'Medium', 'Long', 'VeryLong'])

# Model features (18 columns, without the leaking feature)
feature_columns_stage3 = [
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DEPARTURE_HOUR',
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'DISTANCE', 'IS_WEEKEND', 'IS_FRIDAY', 'IS_RUSH_HOUR',
    'HOUR_SIN', 'HOUR_COS',
    'ORIGIN_CONGESTION', 'DEST_CONGESTION',
    'AIRLINE_DELAY_RATE', 'ROUTE_POPULARITY',
    'DISTANCE_BIN'
]

X_stage3 = df_stage3[feature_columns_stage3].copy()
y_stage3 = df_stage3['DELAYED']

print(f"\nCechy: {len(feature_columns_stage3)} (bez data leakage)")
print(f"Procent op√≥≈∫nie≈Ñ: {y_stage3.mean()*100:.2f}%")

In [None]:
# Stage 3 training: encode, split, oversample, train a soft-voting ensemble
# (Random Forest + XGBoost + LightGBM) and evaluate it on the test split.

# Label encoding
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE_BIN']
for col in categorical_columns:
    le = LabelEncoder()
    X_stage3[col] = le.fit_transform(X_stage3[col].astype(str))

# Stratified 80/20 train/test split
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_stage3, y_stage3, test_size=0.2, random_state=42, stratify=y_stage3
)

# Class weights, computed from the original (un-resampled) training labels
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train3), y=y_train3
)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# SMOTE: oversample the training split only
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_train3_smote, y_train3_smote = smote.fit_resample(X_train3, y_train3)

# Train the ensemble
print("\nTrenowanie modeli...")

# Random Forest (template; fitted as part of the ensemble below)
rf3 = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=20,
    min_samples_leaf=5,
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1
)

# XGBoost (template; fitted as part of the ensemble below)
scale_pos_weight = (y_train3 == 0).sum() / (y_train3 == 1).sum()
xgb3 = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

# LightGBM (template; fitted as part of the ensemble below)
lgb3 = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Ensemble
ensemble3 = VotingClassifier(
    estimators=[
        ('rf', rf3),
        ('xgb', xgb3),
        ('lgb', lgb3)
    ],
    voting='soft'
)
# VotingClassifier.fit trains clones of the estimators it was given, so
# fitting rf3 / xgb3 / lgb3 separately beforehand has no effect on the
# ensemble. Fit the ensemble once, on the SMOTE-resampled data the members
# are meant to learn from (fitting on X_train3 would bypass SMOTE entirely).
ensemble3.fit(X_train3_smote, y_train3_smote)

# Rebind the member names to the fitted clones so that code using
# rf3 / xgb3 / lgb3 after this point still sees trained models.
rf3 = ensemble3.named_estimators_['rf']
xgb3 = ensemble3.named_estimators_['xgb']
lgb3 = ensemble3.named_estimators_['lgb']

# Threshold optimisation for the ensemble: best F1 on a 0.30..0.68 grid.
# The F1 values are computed against the test labels.
y_proba3 = ensemble3.predict_proba(X_test3)[:, 1]

thresholds = np.arange(0.3, 0.7, 0.02)
f1_scores = []
for thresh in thresholds:
    y_pred = (y_proba3 >= thresh).astype(int)
    f1_scores.append(f1_score(y_test3, y_pred))

optimal_threshold3 = thresholds[np.argmax(f1_scores)]
y_pred3 = (y_proba3 >= optimal_threshold3).astype(int)

# Results
print("\n=== WYNIKI ETAP 3 (FAST OPTIMIZED) ===")
print(f"Optymalny threshold: {optimal_threshold3:.2f}")
print(f"Recall: {recall_score(y_test3, y_pred3)*100:.1f}% ‚úì")
print(f"Precision: {precision_score(y_test3, y_pred3)*100:.1f}%")
print(f"F1-Score: {f1_score(y_test3, y_pred3):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test3, y_proba3):.3f}")

# Confusion matrix
cm3 = confusion_matrix(y_test3, y_pred3)
plt.figure(figsize=(8, 6))
sns.heatmap(cm3, annot=True, fmt='d', cmap='Oranges')
plt.title('Confusion Matrix - Etap 3 (Fast Optimized)')
plt.xlabel('Przewidywane')
plt.ylabel('Rzeczywiste')
plt.show()

print("\n‚ö†Ô∏è PROBLEM: Wysoki recall, ale usunƒôli≈õmy najtrudniejsze przypadki (>300 min)!")

# ETAP 4: Final Optimized Model (54.4% recall)

Uczciwy model zachowujący WSZYSTKIE opóźnienia, włącznie z ekstremalnymi.

In [None]:
print("="*50)
print("ETAP 4: FINAL OPTIMIZED MODEL")
print("="*50)

# Kopia danych dla etapu 4
df_stage4 = df.copy()

# ‚úì POPRAWKA: Zachowujemy WSZYSTKIE op√≥≈∫nienia!
df_stage4 = df_stage4[df_stage4['DEPARTURE_DELAY'] >= -60]  # Tylko ekstremalne b≈Çƒôdy danych

print(f"‚úì Zachowano wszystkie op√≥≈∫nienia, w≈ÇƒÖcznie z ekstremalnymi")
print(f"Max op√≥≈∫nienie: {df_stage4['DEPARTURE_DELAY'].max():.0f} minut")
print(f"Op√≥≈∫nienia >300 min: {(df_stage4['DEPARTURE_DELAY'] > 300).sum()}")

# Sample
if len(df_stage4) > 300000:
    df_stage4 = df_stage4.sample(n=300000, random_state=42)

# Zmienna docelowa
df_stage4['DELAYED'] = (df_stage4['DEPARTURE_DELAY'] > 15).astype(int)

# Zaawansowany feature engineering (28 cech)
df_stage4['DEPARTURE_HOUR'] = df_stage4['SCHEDULED_DEPARTURE'].astype(str).str.zfill(4).str[:2].astype(int)
df_stage4['DEPARTURE_MINUTE'] = df_stage4['SCHEDULED_DEPARTURE'].astype(str).str.zfill(4).str[2:].astype(int)

# Cyclical encoding
df_stage4['HOUR_SIN'] = np.sin(2 * np.pi * df_stage4['DEPARTURE_HOUR'] / 24)
df_stage4['HOUR_COS'] = np.cos(2 * np.pi * df_stage4['DEPARTURE_HOUR'] / 24)
df_stage4['MONTH_SIN'] = np.sin(2 * np.pi * df_stage4['MONTH'] / 12)
df_stage4['MONTH_COS'] = np.cos(2 * np.pi * df_stage4['MONTH'] / 12)

# Time-based features
df_stage4['IS_RUSH_HOUR'] = (
    ((df_stage4['DEPARTURE_HOUR'] >= 7) & (df_stage4['DEPARTURE_HOUR'] <= 9)) |
    ((df_stage4['DEPARTURE_HOUR'] >= 17) & (df_stage4['DEPARTURE_HOUR'] <= 19))
).astype(int)

df_stage4['IS_LATE_NIGHT'] = (
    (df_stage4['DEPARTURE_HOUR'] >= 22) | (df_stage4['DEPARTURE_HOUR'] <= 5)
).astype(int)

df_stage4['IS_EARLY_MORNING'] = (
    (df_stage4['DEPARTURE_HOUR'] >= 4) & (df_stage4['DEPARTURE_HOUR'] <= 6)
).astype(int)

# Weekend/Holiday
df_stage4['IS_WEEKEND'] = (df_stage4['DAY_OF_WEEK'].isin([6, 7])).astype(int)
df_stage4['IS_FRIDAY'] = (df_stage4['DAY_OF_WEEK'] == 5).astype(int)
df_stage4['IS_MONDAY'] = (df_stage4['DAY_OF_WEEK'] == 1).astype(int)

df_stage4['IS_HOLIDAY_SEASON'] = (
    ((df_stage4['MONTH'] == 12) & (df_stage4['DAY'] >= 20)) |
    ((df_stage4['MONTH'] == 11) & (df_stage4['DAY'] >= 22) & (df_stage4['DAY'] <= 28)) |
    ((df_stage4['MONTH'] == 7) & (df_stage4['DAY'] <= 7)) |
    ((df_stage4['MONTH'] == 1) & (df_stage4['DAY'] <= 3))
).astype(int)

# Airport traffic: how many rows of df_stage4 use each origin / destination.
origin_airport_col = df_stage4['ORIGIN_AIRPORT']
dest_airport_col = df_stage4['DESTINATION_AIRPORT']
origin_counts = origin_airport_col.value_counts()
dest_counts = dest_airport_col.value_counts()
df_stage4['ORIGIN_BUSY'] = origin_airport_col.map(origin_counts)
df_stage4['DEST_BUSY'] = dest_airport_col.map(dest_counts)

# Route key ("<origin>_<destination>") and how often each route occurs.
df_stage4['ROUTE'] = origin_airport_col + '_' + dest_airport_col
route_counts_s4 = df_stage4['ROUTE'].value_counts()
df_stage4['ROUTE_FREQ'] = df_stage4['ROUTE'].map(route_counts_s4)

# Mean of DELAYED per airline and per origin airport, mapped back onto rows.
# NOTE(review): both rates are computed from the target over every row of
# df_stage4, i.e. before the train/test split done later in the notebook, so
# held-out labels leak into these features. Consider fitting them on the
# training rows only.
delayed_flag_s4 = df_stage4['DELAYED']
airline_delay_rate = delayed_flag_s4.groupby(df_stage4['AIRLINE']).mean()
df_stage4['AIRLINE_DELAY_RATE'] = df_stage4['AIRLINE'].map(airline_delay_rate)

origin_delay_rate = delayed_flag_s4.groupby(df_stage4['ORIGIN_AIRPORT']).mean()
df_stage4['ORIGIN_DELAY_RATE'] = df_stage4['ORIGIN_AIRPORT'].map(origin_delay_rate)

# Distance buckets. pd.cut uses right-closed intervals by default, so the
# buckets are (0, 500], (500, 1000], (1000, 2000], (2000, 5000]; a distance
# outside (0, 5000] becomes NaN.
distance_edges_s4 = [0, 500, 1000, 2000, 5000]
distance_names_s4 = ['Short', 'Medium', 'Long', 'VeryLong']
df_stage4['DISTANCE_BIN'] = pd.cut(
    df_stage4['DISTANCE'], bins=distance_edges_s4, labels=distance_names_s4
)

# Interaction features: products of the flags/hour with the delay rates.
df_stage4['RUSH_AIRLINE'] = df_stage4['IS_RUSH_HOUR'].mul(df_stage4['AIRLINE_DELAY_RATE'])
df_stage4['HOLIDAY_ORIGIN'] = df_stage4['IS_HOLIDAY_SEASON'].mul(df_stage4['ORIGIN_DELAY_RATE'])
df_stage4['HOUR_AIRLINE'] = (
    df_stage4['DEPARTURE_HOUR'].mul(df_stage4['AIRLINE_DELAY_RATE']).div(24)
)

# Final feature set (28 columns), assembled from thematic groups.
# The concatenation order below defines the column order of X_stage4.
base_features_s4 = [
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DEPARTURE_HOUR',
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE',
]
time_features_s4 = [
    'IS_WEEKEND', 'IS_FRIDAY', 'IS_MONDAY', 'IS_RUSH_HOUR',
    'IS_LATE_NIGHT', 'IS_EARLY_MORNING',
    'HOUR_SIN', 'HOUR_COS', 'MONTH_SIN', 'MONTH_COS',
]
holiday_features_s4 = ['IS_HOLIDAY_SEASON']
airport_route_features_s4 = [
    'ORIGIN_BUSY', 'DEST_BUSY', 'ROUTE_FREQ',
    'AIRLINE_DELAY_RATE', 'ORIGIN_DELAY_RATE',
]
distance_features_s4 = ['DISTANCE_BIN']
interaction_features_s4 = ['RUSH_AIRLINE', 'HOLIDAY_ORIGIN', 'HOUR_AIRLINE']

feature_columns_stage4 = (
    base_features_s4
    + time_features_s4
    + holiday_features_s4
    + airport_route_features_s4
    + distance_features_s4
    + interaction_features_s4
)

# Feature matrix is an independent copy; the target is the DELAYED column.
X_stage4 = df_stage4.loc[:, feature_columns_stage4].copy()
y_stage4 = df_stage4['DELAYED']

# Report the feature count and the share of delayed flights.
delayed_share_pct = y_stage4.mean() * 100
print(f"\nCechy: {len(feature_columns_stage4)}")
print(f"Procent op√≥≈∫nie≈Ñ: {delayed_share_pct:.2f}%")

In [ ]:
# Label-encode the categorical columns in place (values are cast to str first,
# so missing values are encoded as their string form).
# Each fitted encoder is kept in label_encoders_stage4, keyed by column name:
# without it the category -> code mappings are lost when `le` is rebound, and
# new raw data could not be encoded consistently for the trained model.
categorical_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'DISTANCE_BIN']
label_encoders_stage4 = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_stage4[col] = le.fit_transform(X_stage4[col].astype(str))
    label_encoders_stage4[col] = le

# Stratified 80/20 train/test split with a fixed seed.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X_stage4, y_stage4, test_size=0.2, random_state=42, stratify=y_stage4
)

# 'balanced' class weights computed from the training labels.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train4), y=y_train4
)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Oversample the minority class on the training data only;
# sampling_strategy=0.6 is the requested minority/majority ratio.
# NOTE(review): SMOTE interpolates between rows, including the label-encoded
# categorical codes above; SMOTENC may be a better fit — confirm.
smote = SMOTE(random_state=42, sampling_strategy=0.6)
X_train4_smote, y_train4_smote = smote.fit_resample(X_train4, y_train4)

# Train the final model (XGBoost) on the SMOTE-resampled training data.
print("\nTrenowanie finalnego modelu XGBoost...")

# Negative/positive ratio of the ORIGINAL (pre-SMOTE) training labels.
# NOTE(review): this weight is applied on top of the SMOTE-resampled data
# used in fit() below, so class imbalance is compensated twice — confirm
# this is intended.
negative_count_s4 = y_train4.eq(0).sum()
positive_count_s4 = y_train4.eq(1).sum()
scale_pos_weight = negative_count_s4 / positive_count_s4

xgb_final_params = dict(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
)
xgb_final = xgb.XGBClassifier(**xgb_final_params)

# Fit and report wall-clock training time.
fit_started_at = time.time()
xgb_final.fit(X_train4_smote, y_train4_smote)
elapsed_s = time.time() - fit_started_at
print(f"Czas trenowania: {elapsed_s:.1f}s")

# Threshold search: pick the cut-off on the positive-class probability that
# maximises F1.
# NOTE(review): the search is scored against y_test4, so the metrics printed
# below are tuned on the same data they are reported on.
y_proba4 = xgb_final.predict_proba(X_test4)[:, 1]

thresholds = np.arange(0.3, 0.7, 0.02)
f1_scores = [
    f1_score(y_test4, (y_proba4 >= cutoff).astype(int))
    for cutoff in thresholds
]

best_threshold_pos = np.argmax(f1_scores)
optimal_threshold4 = thresholds[best_threshold_pos]
y_pred4 = (y_proba4 >= optimal_threshold4).astype(int)

# Results at the selected threshold.
print("\n=== WYNIKI ETAP 4 (FINAL MODEL) ===")
print(f"Optymalny threshold: {optimal_threshold4:.2f}")
for metric_name, metric_fn in (('Recall', recall_score), ('Precision', precision_score)):
    print(f"{metric_name}: {metric_fn(y_test4, y_pred4)*100:.1f}%")
print(f"F1-Score: {f1_score(y_test4, y_pred4):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test4, y_proba4):.3f}")

# Recall restricted to test flights whose DEPARTURE_DELAY exceeds 300 minutes.
test_indices = X_test4.index
extreme_delays_mask = df_stage4['DEPARTURE_DELAY'].loc[test_indices].gt(300)
if extreme_delays_mask.any():
    # y_test4 is selected via the boolean Series; y_pred4 is a NumPy array in
    # the same row order as X_test4, so it is masked positionally.
    extreme_y_true = y_test4.loc[extreme_delays_mask]
    extreme_y_pred = y_pred4[extreme_delays_mask.to_numpy()]
    extreme_recall = recall_score(extreme_y_true, extreme_y_pred)
    detected_extreme = extreme_y_pred.sum()
    total_extreme = len(extreme_y_true)
    print(f"\nRecall dla ekstremalnych op√≥≈∫nie≈Ñ (>300 min): {extreme_recall*100:.1f}%")
    print(f"Wykryto {detected_extreme}/{total_extreme} ekstremalnych op√≥≈∫nie≈Ñ")

# Confusion matrix of the final predictions, drawn as an annotated heatmap.
cm4 = confusion_matrix(y_test4, y_pred4)
cm_fig4, cm_ax4 = plt.subplots(figsize=(8, 6))
sns.heatmap(cm4, annot=True, fmt='d', cmap='Greens', ax=cm_ax4)
cm_ax4.set_title('Confusion Matrix - Etap 4 (Final Model)')
cm_ax4.set_xlabel('Przewidywane')
cm_ax4.set_ylabel('Rzeczywiste')
plt.show()

# Feature-importance table for the final model, most important first.
# sort_values keeps the original row labels, so the index of importance4 is
# the feature's position in X_train4.columns, not its rank.
importance4 = pd.DataFrame(
    {'feature': X_train4.columns, 'importance': xgb_final.feature_importances_}
)
importance4 = importance4.sort_values(by='importance', ascending=False)

# Attach a descriptive label to every feature.
importance4['label'] = [
    get_feature_label(feature_name) for feature_name in importance4['feature']
]

# Horizontal bar chart of the 15 most important features, coloured by
# feature family.
top_features = importance4.head(15)

# Families identified by exact feature name. These are tested BEFORE the
# substring rules below: every interaction feature also contains 'AIRLINE'
# or 'ORIGIN', so a substring test placed first would classify it as an
# airline/airport feature and the interaction colour would never be used.
binary_time_features = {
    'IS_RUSH_HOUR', 'IS_WEEKEND', 'IS_FRIDAY', 'IS_MONDAY',
    'IS_LATE_NIGHT', 'IS_EARLY_MORNING', 'IS_HOLIDAY_SEASON',
}
cyclic_time_features = {'HOUR_SIN', 'HOUR_COS', 'MONTH_SIN', 'MONTH_COS'}
basic_time_features = {'MONTH', 'DAY', 'DAY_OF_WEEK', 'DEPARTURE_HOUR'}
interaction_features = {'RUSH_AIRLINE', 'HOLIDAY_ORIGIN', 'HOUR_AIRLINE'}

colors = []
for feature in top_features['feature']:
    if feature in binary_time_features:
        colors.append('coral')  # binary time features
    elif feature in cyclic_time_features:
        colors.append('lightsalmon')  # cyclic time features
    elif feature in basic_time_features:
        colors.append('peachpuff')  # basic time features
    elif feature in interaction_features:
        colors.append('lightcoral')  # interaction features
    elif 'ORIGIN' in feature or 'DEST' in feature or 'AIRPORT' in feature:
        colors.append('skyblue')  # airport features
    elif 'AIRLINE' in feature:
        colors.append('lightgreen')  # airline features
    elif 'DISTANCE' in feature:
        colors.append('gold')  # distance features
    elif 'ROUTE' in feature:
        colors.append('plum')  # route features
    else:
        colors.append('lightgray')  # anything else

# Draw the bars once, already coloured.
plt.figure(figsize=(12, 10))
bars = plt.barh(range(len(top_features)), top_features['importance'], color=colors)

# Descriptive labels on the Y axis; invert it so the top feature is first.
plt.yticks(range(len(top_features)), top_features['label'])
plt.xlabel('Wa≈ºno≈õƒá cechy', fontsize=12)
plt.title('Top 15 najwa≈ºniejszych cech - Final Model (Uczciwy model)', fontsize=14)
plt.gca().invert_yaxis()

# Numeric importance next to each bar.
for i, v in enumerate(top_features['importance']):
    plt.text(v + 0.002, i, f'{v:.3f}', va='center', fontsize=10)

# Legend explaining the colour of each feature family.
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='coral', label='Cechy czasowe (binarne)'),
    Patch(facecolor='lightsalmon', label='Cechy czasowe (cykliczne)'),
    Patch(facecolor='peachpuff', label='Cechy czasowe (podstawowe)'),
    Patch(facecolor='skyblue', label='Cechy lotniskowe'),
    Patch(facecolor='lightgreen', label='Cechy linii lotniczych'),
    Patch(facecolor='gold', label='Cechy dystansu'),
    Patch(facecolor='plum', label='Cechy tras'),
    Patch(facecolor='lightcoral', label='Cechy interakcyjne/ryzyko')
]
plt.legend(handles=legend_elements, loc='lower right', fontsize=9, ncol=2)

plt.tight_layout()
plt.show()

# Closing remarks and a ranked description of the five most important features.
print("\n‚úì Model uczciwie radzi sobie ze WSZYSTKIMI op√≥≈∫nieniami")
print("‚úì Najwa≈ºniejsze cechy sƒÖ zwiƒÖzane z czasem (godziny szczytu) i lotniskami")
print("\nOpisy TOP 5 najwa≈ºniejszych cech:")
# iterrows() yields the DataFrame's index label, which after sorting by
# importance is the feature's original column position, not its rank.
# Number the rows with enumerate() instead.
for rank, (_, row) in enumerate(top_features.head(5).iterrows(), start=1):
    print(f"{rank}. {row['feature']}: {row['label']} (wa≈ºno≈õƒá: {row['importance']:.3f})")

# Podsumowanie: Porównanie wszystkich etapów

In [ ]:
# Collect recall and F1 for every stage into one summary table.
# Each pair is (true labels, predicted labels) for stages 1..4, in order.
stage_predictions = [
    (y_test1, y_pred_xgb1),  # stage 1
    (y_test2, y_pred2),      # stage 2
    (y_test3, y_pred3),      # stage 3
    (y_test4, y_pred4),      # stage 4
]

results_summary = pd.DataFrame({
    'Etap': ['1: Baseline', '2: Data Leakage', '3: Fast Optimized', '4: Final Model'],
    'Recall': [
        recall_score(stage_true, stage_pred) * 100
        for stage_true, stage_pred in stage_predictions
    ],
    'F1-Score': [
        f1_score(stage_true, stage_pred)
        for stage_true, stage_pred in stage_predictions
    ],
    # Feature counts per stage are entered by hand, not derived from the data.
    'Cechy': [12, 27, 21, 28],
    'Problem': [
        'Zbyt prosty model',
        'Data leakage (DELAY_LOG)',
        'Usuniƒôto outliery >300 min',
        'Uczciwy model ze wszystkim'
    ]
})

print("=== PODSUMOWANIE WSZYSTKICH ETAP√ìW ===")
summary_text = results_summary.to_string(index=False)
print(summary_text)

# Side-by-side bar charts showing how recall and F1 evolve across the stages.
stage_colors = ['blue', 'red', 'orange', 'green']
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: recall per stage.
bars1 = ax1.bar(results_summary['Etap'], results_summary['Recall'], color=stage_colors)
ax1.set(ylabel='Recall (%)', title='Ewolucja Recall przez etapy', ylim=(0, 110))

# Value labels: bars taller than 90% get a white label inside the bar,
# all others get the label just above the bar.
for recall_bar in bars1:
    bar_top = recall_bar.get_height()
    bar_center = recall_bar.get_x() + recall_bar.get_width()/2.
    if bar_top > 90:
        ax1.text(bar_center, bar_top - 5, f'{bar_top:.1f}%',
                 ha='center', va='top', color='white', fontweight='bold')
    else:
        ax1.text(bar_center, bar_top + 1, f'{bar_top:.1f}%',
                 ha='center', va='bottom')

# Right panel: F1 per stage, with the Y axis spanning 0..1.
bars2 = ax2.bar(results_summary['Etap'], results_summary['F1-Score'], color=stage_colors)
ax2.set(ylabel='F1-Score', title='Ewolucja F1-Score przez etapy', ylim=(0, 1.0))

# Value labels: every F1 label is placed inside its bar.
for f1_bar in bars2:
    bar_top = f1_bar.get_height()
    bar_center = f1_bar.get_x() + f1_bar.get_width()/2.
    ax2.text(bar_center, bar_top - 0.03, f'{bar_top:.3f}',
             ha='center', va='top',
             color='white', fontweight='bold', fontsize=11)

# Call out stage 2 (row 1 of the summary) on both panels.
stage2_recall = results_summary.at[1, 'Recall']
stage2_f1 = results_summary.at[1, 'F1-Score']

ax1.annotate('Podejrzane!',
             xy=(1, stage2_recall),
             xytext=(1, 85),
             arrowprops=dict(arrowstyle='->', color='red', lw=2),
             ha='center', fontsize=10, color='red', fontweight='bold')

ax2.annotate('Sztucznie wysoki\n(data leakage)',
             xy=(1, stage2_f1),
             xytext=(1, 0.85),
             arrowprops=dict(arrowstyle='->', color='red', lw=1.5),
             ha='center', fontsize=9, color='red')

plt.tight_layout()
plt.show()

# ROC curves for all four stages on one figure.
plt.figure(figsize=(10, 8))

# Score the baseline model once and reuse the probabilities for both its
# curve and its legend AUC, instead of calling predict_proba twice.
y_proba_baseline = xgb_baseline.predict_proba(X_test1)[:, 1]

# ROC curve points for every stage.
fpr1, tpr1, _ = roc_curve(y_test1, y_proba_baseline)
fpr2, tpr2, _ = roc_curve(y_test2, y_proba2)
fpr3, tpr3, _ = roc_curve(y_test3, y_proba3)
fpr4, tpr4, _ = roc_curve(y_test4, y_proba4)

# (legend name, fpr, tpr, true labels, scores, line style) per stage.
roc_stages = [
    ('Etap 1: Baseline', fpr1, tpr1, y_test1, y_proba_baseline,
     {'linewidth': 2}),
    ('Etap 2: Data Leakage', fpr2, tpr2, y_test2, y_proba2,
     {'linewidth': 2, 'linestyle': '--'}),
    ('Etap 3: Fast Optimized', fpr3, tpr3, y_test3, y_proba3,
     {'linewidth': 2}),
    ('Etap 4: Final Model', fpr4, tpr4, y_test4, y_proba4,
     {'linewidth': 3}),
]
for stage_name, stage_fpr, stage_tpr, stage_true, stage_score, line_kwargs in roc_stages:
    stage_auc = roc_auc_score(stage_true, stage_score)
    plt.plot(stage_fpr, stage_tpr,
             label=f'{stage_name} (AUC = {stage_auc:.3f})', **line_kwargs)

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Losowy klasyfikator')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Krzywe ROC - Por√≥wnanie wszystkich etap√≥w')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

# Key conclusions. The metrics are read from the values computed above rather
# than typed in by hand, so the text cannot contradict the summary table after
# the notebook is re-run on different data or with a different seed.
stage_recalls_pct = results_summary['Recall'].tolist()
final_f1_value = results_summary['F1-Score'].iloc[-1]
final_auc_value = roc_auc_score(y_test4, y_proba4)
final_feature_count = len(feature_columns_stage4)

print("\n=== KLUCZOWE WNIOSKI ===")
print(f"1. Etap 1 (Baseline): Zbyt konserwatywny model - tylko {stage_recalls_pct[0]:.1f}% recall")
print(f"2. Etap 2 (Data Leakage): Fa≈Çszywie wysoki recall {stage_recalls_pct[1]:.1f}% przez u≈ºycie DELAY_LOG")
print(f"3. Etap 3 (Fast Optimized): Dobry recall {stage_recalls_pct[2]:.1f}%, ale osiƒÖgniƒôty przez usuniƒôcie trudnych przypadk√≥w")
print(f"4. Etap 4 (Final Model): Uczciwy recall {stage_recalls_pct[3]:.1f}% na WSZYSTKICH danych")
print(f"\n‚úì Najlepszy uczciwy model: XGBoost z {final_feature_count} cechami, F1={final_f1_value:.3f}, ROC-AUC={final_auc_value:.3f}")

## Analiza błędów i dalsze kroki

Zobaczmy, gdzie model finalny ma największe problemy.

In [None]:
# Error analysis: the test rows of df_stage4 joined with the true label, the
# thresholded prediction and the predicted probability.
# y_test4 is aligned on the index; the NumPy arrays y_pred4 / y_proba4 are
# attached positionally, in the row order of X_test4.
test_df = df_stage4.loc[X_test4.index].assign(
    y_true=y_test4,
    y_pred=y_pred4,
    y_proba=y_proba4,
)

# False negatives: delayed flights the model predicted as not delayed.
missed_delay_mask = test_df['y_true'].eq(1) & test_df['y_pred'].eq(0)
false_negatives = test_df.loc[missed_delay_mask]
print(f"False Negatives (niewykryte op√≥≈∫nienia): {len(false_negatives)}")

# Recall broken down by delay size.
# The last bin is open-ended: with a finite upper edge, delays above it would
# fall outside every bin and silently drop out of the '>300 min' category.
# Intervals are right-closed, so a delay of exactly 15 belongs to no bin.
delay_bins = [15, 30, 60, 120, 300, float('inf')]
delay_labels = ['15-30 min', '30-60 min', '60-120 min', '120-300 min', '>300 min']

test_df['DELAY_BIN'] = pd.cut(test_df['DEPARTURE_DELAY'], bins=delay_bins, labels=delay_labels, include_lowest=False)

# Recall per delay category: among truly delayed flights, the share predicted
# as delayed. y_pred holds 0/1, so its mean is that share.
# observed=False keeps empty categories in the result (as NaN) regardless of
# the pandas default for categorical group keys.
recall_by_delay = (
    test_df[test_df['y_true'] == 1]
    .groupby('DELAY_BIN', observed=False)['y_pred']
    .mean()
    * 100
)

plt.figure(figsize=(10, 6))
recall_by_delay.plot(kind='bar', color='coral')
plt.title('Recall wed≈Çug wielko≈õci op√≥≈∫nienia')
plt.xlabel('Kategoria op√≥≈∫nienia')
plt.ylabel('Recall (%)')
plt.xticks(rotation=45)
plt.axhline(y=50, color='red', linestyle='--', alpha=0.5)  # 50% reference line

# Value label above each bar; categories with no flights (NaN) get none.
for i, v in enumerate(recall_by_delay):
    if pd.notna(v):
        plt.text(i, v + 1, f'{v:.1f}%', ha='center')

plt.tight_layout()
plt.show()

print("\nRecall wed≈Çug wielko≈õci op√≥≈∫nienia:")
for delay_cat, recall_pct in recall_by_delay.items():
    print(f"{delay_cat}: {recall_pct:.1f}%")

# Origin airports with the lowest recall on truly delayed test flights.
print("\n=== LOTNISKA Z NAJNI≈ªSZYM RECALL ===")
delayed_test_rows = test_df.loc[test_df['y_true'] == 1]

# Per airport: number of detected delays (sum of 0/1 predictions) and the
# total number of delayed flights.
airport_performance = delayed_test_rows.groupby('ORIGIN_AIRPORT').agg(
    detected=('y_pred', 'sum'),
    total=('y_pred', 'count'),
)
airport_performance['recall'] = (
    airport_performance['detected'] / airport_performance['total'] * 100
)

# Keep only airports with at least 10 delayed flights.
has_enough_delays = airport_performance['total'] >= 10
airport_performance = airport_performance[has_enough_delays]

worst_airports = airport_performance.nsmallest(10, 'recall')
print(worst_airports[['total', 'detected', 'recall']].round(1))

# Print the list of proposed follow-up improvements, one entry per output line.
improvement_proposal_lines = (
    "\n=== PROPOZYCJE DALSZYCH ULEPSZE≈É ===",
    "1. Model dwuetapowy:",
    "   - Etap 1: Klasyfikacja normal/extreme delay",
    "   - Etap 2: Dedykowane modele dla ka≈ºdej grupy",
    "\n2. Dodatkowe cechy:",
    "   - Dane pogodowe (mo≈ºna symulowaƒá na podstawie sezonu/lokalizacji)",
    "   - Agregacje historyczne (≈õrednie op√≥≈∫nienie na trasie ostatnie 7 dni)",
    "   - Cechy ekonomiczne (ceny paliwa, wska≈∫niki)",
    "\n3. Techniki modelowania:",
    "   - Stacking ensemble z meta-learnerem",
    "   - Custom loss function z wiƒôkszƒÖ wagƒÖ dla du≈ºych op√≥≈∫nie≈Ñ",
    "   - Neural network jako dodatkowy model",
)
for proposal_line in improvement_proposal_lines:
    print(proposal_line)

## Zapisanie najlepszego modelu

In [None]:
# Persist the final model together with a JSON file describing it.
import json

import joblib

# Serialise the fitted classifier.
# NOTE(review): joblib files are pickles — only load this file from a
# trusted source.
joblib.dump(xgb_final, 'best_flight_delay_model.pkl')

# Metadata: feature list, decision threshold, test metrics and sample counts.
model_metadata = {
    'model_type': 'XGBoost',
    'features': feature_columns_stage4,
    'n_features': len(feature_columns_stage4),
    'optimal_threshold': float(optimal_threshold4),
    'performance': {
        'recall': float(recall_score(y_test4, y_pred4)),
        'precision': float(precision_score(y_test4, y_pred4)),
        'f1_score': float(f1_score(y_test4, y_pred4)),
        'roc_auc': float(roc_auc_score(y_test4, y_proba4))
    },
    # Size of the training split before resampling.
    'training_samples': len(X_train4),
    # The model was fitted on the SMOTE-resampled set, so record that size
    # too; 'training_samples' alone under-reports what the model saw.
    'training_samples_resampled': len(X_train4_smote),
    'test_samples': len(X_test4)
}

# Explicit encoding so the file does not depend on the platform default.
with open('model_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(model_metadata, f, indent=2)

print("‚úì Model zapisany jako 'best_flight_delay_model.pkl'")
print("‚úì Metadane zapisane jako 'model_metadata.json'")
print("\n=== PROJEKT ZAKO≈ÉCZONY ===")
print(f"Najlepszy model: {model_metadata['model_type']}")
print(f"F1-Score: {model_metadata['performance']['f1_score']:.3f}")
print(f"ROC-AUC: {model_metadata['performance']['roc_auc']:.3f}")
print(f"Recall: {model_metadata['performance']['recall']*100:.1f}%")