In [None]:
import datetime as datetime
import time
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from memory_profiler import memory_usage
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
import kagglehub

# Fetch the flight-delays dataset through kagglehub; `path` is used by the
# following cell as the directory prefix for flights.csv.
dataset_handle = "usdot/flight-delays"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

In [None]:

# Read the flights table from the downloaded dataset location and report
# its dimensions and column names.
flights_csv = path + "/flights.csv"
df = pd.read_csv(flights_csv)

column_names = df.columns.tolist()
print(f"Dataset carregado com shape: {df.shape}")
print(f"Colunas: {column_names}")

In [None]:

# Data cleaning: report missing values, zero-fill the delay-reason columns,
# drop columns that are not used as features and discard rows that lack any
# of the timing fields.
print("Valores nulos por coluna (%):")
print(df.isna().sum() * 100 / len(df))

# Missing delay reasons are treated as "0 minutes".
delay_reason_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df[delay_reason_cols] = df[delay_reason_cols].fillna(0)

# Single, idempotent drop: errors='ignore' on every column (the original
# TAIL_NUMBER drop lacked it and raised KeyError when the cell was re-run),
# and no inplace mutation.
cols_to_drop = [
    'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'CANCELLED',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE',
]
df = df.drop(columns=cols_to_drop, errors='ignore')

# Rows missing any of these timing fields are removed.
required_cols = [
    'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
    'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON',
    'TAXI_IN', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
]
df = df.dropna(subset=required_cols)

print(f"\nShape após limpeza: {df.shape}")
print("\nValores nulos restantes (%):")
print(df.isna().sum() * 100 / len(df))

In [None]:

# Binary target: 1 when ARRIVAL_DELAY is greater than 15, otherwise 0.
df['DELAYED'] = (df['ARRIVAL_DELAY'] > 15).astype('int64')

# Split by class to report how unbalanced the target is.
delayed_mask = df['DELAYED'] == 1
delay = df[delayed_mask]
on_time = df[df['DELAYED'] == 0]

delay_count = len(delay)
on_time_count = len(on_time)
total = delay_count + on_time_count
delay_percentage = (delay_count / total) * 100
on_time_percentage = (on_time_count / total) * 100

# Counts use "." as thousands separator.
print(f"Não atrasados: {on_time_count:,}".replace(",", "."))
print(f"Atrasados: {delay_count:,}".replace(",", "."))
print(f"Percentual de voos não atrasados: {on_time_percentage:.2f}%")
print(f"Percentual de voos atrasados: {delay_percentage:.2f}%")

In [None]:

from imblearn.under_sampling import RandomUnderSampler

# Build the feature matrix WITHOUT target leakage.
# DELAYED is computed directly from ARRIVAL_DELAY, so keeping that column in X
# lets the classifier read the answer. The delay-reason columns are a
# breakdown of the delay itself and are removed as well
# (NOTE(review): presumably they are only populated for delayed arrivals —
# confirm against the dataset documentation).
# NOTE(review): ARRIVAL_TIME, ELAPSED_TIME, AIR_TIME, WHEELS_ON and TAXI_IN
# look like post-landing measurements too; whether they may stay depends on
# the moment at which the prediction is supposed to be made — confirm.
leaky_cols = [
    'ARRIVAL_DELAY',
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
]
X = df.drop(columns=['DELAYED'] + leaky_cols, errors='ignore')
y = df['DELAYED']

# Randomly undersample so both classes end up with the same number of rows.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

df_undersampled = pd.concat([X_resampled, y_resampled], axis=1)

# Class balance after undersampling.
on_time_count = (df_undersampled['DELAYED'] == 0).sum()
delay_count = (df_undersampled['DELAYED'] == 1).sum()
total = on_time_count + delay_count
on_time_percentage = (on_time_count / total) * 100
delay_percentage = (delay_count / total) * 100

print(f"\nApós undersampling:")
print(f"Não atrasados: {on_time_count:,}".replace(",", "."))
print(f"Atrasados: {delay_count:,}".replace(",", "."))
print(f"Percentual de voos não atrasados: {on_time_percentage:.2f}%")
print(f"Percentual de voos atrasados: {delay_percentage:.2f}%")
print(f"Shape do dataset balanceado: {df_undersampled.shape}")

In [None]:

# Draw a fixed-size random subset of each class so that SVM training stays
# tractable, then shuffle the combined rows.
sample_size_per_class = 25000

target = df_undersampled['DELAYED']
df_delayed = df_undersampled.loc[target == 1]
df_on_time = df_undersampled.loc[target == 0]

df_delayed_sample = df_delayed.sample(n=sample_size_per_class, random_state=42)
df_on_time_sample = df_on_time.sample(n=sample_size_per_class, random_state=42)

# Stack both classes, shuffle all rows and rebuild a clean 0..n-1 index.
df_sample = (
    pd.concat([df_delayed_sample, df_on_time_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Shape do DataFrame original:", df_undersampled.shape)
print("Shape do DataFrame amostrado:", df_sample.shape)
print("\nDistribuição da classe 'DELAYED' no DataFrame amostrado:")
print(df_sample['DELAYED'].value_counts())
print(f"\n⚠️ Limitação para {sample_size_per_class*2:,} amostras devido à complexidade computacional O(n²-n³) do SVM")

In [None]:
# Separate features from the target and make a stratified 70/30 split.
y_sample = df_sample['DELAYED']
X_sample = df_sample.drop('DELAYED', axis=1)

split_options = dict(test_size=0.3, stratify=y_sample, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, **split_options)

n_train = len(X_train)
n_test = len(X_test)
print(f"Tamanho do conjunto de treino: {n_train}")
print(f"Tamanho do conjunto de teste: {n_test}")
print(f"Distribuição no treino: {y_train.value_counts().to_dict()}")
print(f"Distribuição no teste: {y_test.value_counts().to_dict()}")

In [None]:
# Standardise the features: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
train_array = scaler.fit_transform(X_train)
test_array = scaler.transform(X_test)

# Wrap the arrays back into DataFrames so the column names are preserved
# (the row index restarts at 0).
X_train_scaled = pd.DataFrame(train_array, columns=X_train.columns)
X_test_scaled = pd.DataFrame(test_array, columns=X_test.columns)

print("Normalização concluída com StandardScaler")
print(f"Shape treino normalizado: {X_train_scaled.shape}")
print(f"Shape teste normalizado: {X_test_scaled.shape}")
print(f"\nEstatísticas após normalização (treino):")
print(f"Média: {X_train_scaled.mean().mean():.6f}")
print(f"Desvio padrão: {X_train_scaled.std().mean():.6f}")

In [None]:

# Train an RBF-kernel SVM on the scaled training split and report how long
# the fit took plus the number of support vectors.
# NOTE(review): probability=True makes fit() noticeably slower (scikit-learn
# documents an internal 5-fold cross-validation for it) and predict_proba is
# not called in the cells visible here — consider disabling it if unused.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42,
    probability=True
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time_train = time.perf_counter()
svm.fit(X_train_scaled, y_train)
end_time_train = time.perf_counter()

training_time = end_time_train - start_time_train
print(f"Tempo de Treinamento: {training_time:.4f} segundos")
print(f"Modelo SVM treinado com kernel '{svm.kernel}'")
print(f"Parâmetro C: {svm.C}")
print(f"Parâmetro gamma: {svm.gamma}")
print(f"Número de vetores de suporte: {svm.n_support_}")
print(f"Total de vetores de suporte: {svm.support_vectors_.shape[0]}")

In [None]:

# Predict the test split and time the call.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time_pred = time.perf_counter()
y_pred = svm.predict(X_test_scaled)
end_time_pred = time.perf_counter()

prediction_time = end_time_pred - start_time_pred
print(f"Tempo de Predição: {prediction_time:.4f} segundos")
print(f"Predições realizadas para {len(X_test_scaled)} amostras")

In [None]:

# Evaluate the test-set predictions. The confusion matrix is built once and
# reused both for printing and for the true-negative rate.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# TPR is the recall of the positive class; TNR is the recall of the negative class.
tpr = recall
tnr = tn / (tn + fp)

print("=== MÉTRICAS NO CONJUNTO DE TESTE ===")
print(f"Acurácia: {accuracy:.4f}")
print(f"Precisão: {precision:.4f}")
print(f"Recall (TPR): {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"True Negative Rate (TNR): {tnr:.4f}")
print("\nMatriz de Confusão:")
print(cm)

In [None]:

# Score the model on its own training data and compare with the test-set
# metrics; a train-minus-test accuracy gap above 0.05 is reported as possible
# overfitting.
y_train_pred = svm.predict(X_train_scaled)

accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Note: tn/fp/fn/tp now hold the TRAINING confusion counts.
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
tpr_train = recall_train
tnr_train = tn / (tn + fp)

print("=== MÉTRICAS NO CONJUNTO DE TREINO ===")
print(f"Acurácia: {accuracy_train:.4f}")
print(f"Precisão: {precision_train:.4f}")
print(f"Recall (TPR): {recall_train:.4f}")
print(f"F1-score: {f1_train:.4f}")
print(f"True Negative Rate (TNR): {tnr_train:.4f}")

acc_diff = accuracy_train - accuracy
f1_diff = f1_train - f1

print("\n=== COMPARAÇÃO TREINO vs TESTE ===")
print(f"Diferença de Acurácia: {acc_diff:.4f}")
print(f"Diferença de F1-score: {f1_diff:.4f}")

# Overfitting heuristic: pick the headline, then print the gap once.
headline = (
    f"\n⚠️  POSSÍVEL OVERFITTING DETECTADO!"
    if acc_diff > 0.05
    else f"\n✅ Modelo parece estar generalizando bem."
)
print(headline)
print(f"Diferença de acurácia treino-teste: {acc_diff:.4f}")

In [None]:
# 5-fold stratified cross-validation of the RBF SVM on the sampled data.
print("\n" + "="*50)
print("VALIDAÇÃO CRUZADA 5-FOLDS - SVM")
print("="*50)
print("⚠️ Processo pode ser demorado devido à complexidade do SVM...")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Scaler and classifier are wrapped in one pipeline so that, inside every
# fold, the scaler is fitted on that fold's training part only. Scaling the
# whole sample before splitting lets validation rows influence the scaling
# statistics, which biases the validation scores.
svm_cv = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',
        C=1.0,
        gamma='scale',
        random_state=42
    ),
)

# A separate scaler is used here on purpose: re-fitting the shared `scaler`
# object would silently replace the train-only fit produced earlier.
# X_sample_scaled is kept only for code that may still reference it; it is
# not used by the cross-validation below.
X_sample_scaled = pd.DataFrame(
    StandardScaler().fit_transform(X_sample),
    columns=X_sample.columns,
)

cv_results = cross_validate(svm_cv, X_sample, y_sample, cv=cv, scoring=scoring, return_train_score=True)

# Mean ± standard deviation over the folds, train vs. validation, per metric.
metric_labels = [
    ('Acurácia', 'accuracy'),
    ('Precisão', 'precision'),
    ('Recall', 'recall'),
    ('F1-score', 'f1'),
]
for metric_label, metric_key in metric_labels:
    for split_label, split_prefix in (('Treino', 'train'), ('Validação', 'test')):
        fold_scores = cv_results[f'{split_prefix}_{metric_key}']
        print(f"{metric_label} - {split_label}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

# Same 0.05 accuracy-gap heuristic as in the hold-out comparison.
cv_acc_diff = cv_results['train_accuracy'].mean() - cv_results['test_accuracy'].mean()
if cv_acc_diff > 0.05:
    print(f"\n⚠️  POSSÍVEL OVERFITTING DETECTADO NA VALIDAÇÃO CRUZADA!")
else:
    print(f"\n✅ Modelo generalizando bem na validação cruzada.")
print(f"Diferença de acurácia treino-validação: {cv_acc_diff:.4f}")

In [None]:
# Computational-cost analysis: time, memory and CPU for fitting and
# predicting with a fresh SVM (without probability estimates).
process = psutil.Process()

svm_perf = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42
)

def train_svm_model():
    """Fit `svm_perf` on the scaled training split (callable handed to memory_usage)."""
    svm_perf.fit(X_train_scaled, y_train)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
# The measured interval includes the memory_usage() sampling machinery.
start_time_train = time.perf_counter()
mem_usage_train = memory_usage(train_svm_model)
end_time_train = time.perf_counter()
training_time = end_time_train - start_time_train
train_ips = len(X_train_scaled) / training_time


def predict_svm_model():
    """Predict the scaled test split with `svm_perf`, storing the result in the global `y_pred_perf`."""
    global y_pred_perf
    y_pred_perf = svm_perf.predict(X_test_scaled)

# The first cpu_percent() call sets the reference point; the second one,
# after the prediction, is the value reported below.
cpu_percent_before = process.cpu_percent(interval=None)
start_time_pred = time.perf_counter()
mem_usage_pred = memory_usage(predict_svm_model)
end_time_pred = time.perf_counter()
cpu_percent_after = process.cpu_percent(interval=None)

prediction_time = end_time_pred - start_time_pred
pred_ips = len(X_test_scaled) / prediction_time

print("\n" + "="*50)
print("ANÁLISE DE DESEMPENHO COMPUTACIONAL - SVM")
print("="*50)
print(f"🕒 Tempo de Treinamento: {training_time:.4f} s")
print(f"🕒 Tempo de Predição: {prediction_time:.4f} s")
# max() over the samples returned by memory_usage = peak value observed.
print(f"📈 Memória (Treinamento): {max(mem_usage_train):.2f} MB")
print(f"📈 Memória (Predição): {max(mem_usage_pred):.2f} MB")
print(f"⚙️ CPU usada na predição: {cpu_percent_after:.2f}%")
print(f"📊 Instâncias por segundo (treinamento): {train_ips:.2f}")
print(f"📊 Instâncias por segundo (predição): {pred_ips:.2f}")
print(f"🔢 Vetores de suporte: {svm_perf.support_vectors_.shape[0]} ({svm_perf.support_vectors_.shape[0]/len(X_train_scaled)*100:.1f}% dos dados)")

daily_predictions = 24 * 60 * 60 * pred_ips  # predictions per day
print(f"\n📈 THROUGHPUT PARA CENÁRIOS REAIS:")
print(f"Predições por segundo: {pred_ips:.0f}")
print(f"Predições por minuto: {pred_ips * 60:.0f}")
print(f"Predições por hora: {pred_ips * 3600:.0f}")
print(f"Predições por dia: {daily_predictions:.0f}")

print(f"\n⚠️ LIMITAÇÕES COMPUTACIONAIS:")
print(f"Dataset limitado a {len(X_sample):,} amostras devido à complexidade O(n²-n³)")
# Quadratic extrapolation of the measured training time to 1M rows, in hours.
print(f"Para 1M amostras, tempo estimado: {training_time * (1000000/len(X_train_scaled))**2 / 3600:.1f} horas")