<a href="https://colab.research.google.com/github/gustavocoradin/Projeto-transformador/blob/main/NeuralNetworkProjetoTransformador.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
import time
import psutil
from memory_profiler import memory_usage

In [None]:
import kagglehub

# Download the Kaggle flight-delays dataset; `path` holds whatever location
# kagglehub returns and is used as the directory prefix when reading flights.csv.
dataset_handle = "usdot/flight-delays"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

In [None]:
# Read the flights table from the downloaded dataset directory and report
# its dimensions and column names.
flights_csv = f"{path}/flights.csv"
df = pd.read_csv(flights_csv)

print(f"Dataset carregado com shape: {df.shape}")
print(f"Colunas: {df.columns.tolist()}")

In [None]:
# --- Data cleaning ---------------------------------------------------------
# Report the share of missing values per column before any change is made.
print("Valores nulos por coluna (%):")
print(df.isna().sum() * 100 / len(df))

# Missing delay-reason values are replaced by 0.
delay_reason_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
df[delay_reason_cols] = df[delay_reason_cols].fillna(0)

# Columns removed before modelling. Every drop goes through a single call with
# errors='ignore' so the cell can be re-run on an already-cleaned frame
# (previously the TAIL_NUMBER drop raised KeyError on a second run).
cols_to_drop = [
    'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'CANCELLED',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE',
]
df = df.drop(columns=cols_to_drop, errors='ignore')

# Keep only rows where every one of these columns is present.
required_cols = [
    'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
    'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON',
    'TAXI_IN', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
]
df = df.dropna(subset=required_cols)

print(f"\nShape ap√≥s limpeza: {df.shape}")
print("\nValores nulos restantes (%):")
print(df.isna().sum() * 100 / len(df))

In [None]:

# Binary target: 1 when ARRIVAL_DELAY exceeds the threshold, otherwise 0.
# The comparison is vectorized instead of a per-row Python lambda, which
# yields the same 0/1 values without looping over every row.
DELAY_THRESHOLD = 15
df['DELAYED'] = (df['ARRIVAL_DELAY'] > DELAY_THRESHOLD).astype('int64')

delay = df[df['DELAYED'] == 1]
on_time = df[df['DELAYED'] == 0]

# Class balance before any resampling.
delay_count = delay.shape[0]
on_time_count = on_time.shape[0]
total = delay_count + on_time_count
delay_percentage = (delay_count / total) * 100
on_time_percentage = (on_time_count / total) * 100

# Counts are printed with '.' as the thousands separator.
print(f"N√£o atrasados: {on_time_count:,}".replace(",", "."))
print(f"Atrasados: {delay_count:,}".replace(",", "."))
print(f"Percentual de voos n√£o atrasados: {on_time_percentage:.2f}%")
print(f"Percentual de voos atrasados: {delay_percentage:.2f}%")

In [None]:

from imblearn.under_sampling import RandomUnderSampler

# --- Guard against target leakage -----------------------------------------
# DELAYED is computed from ARRIVAL_DELAY, so leaving that column among the
# features hands the label to the model and makes every metric meaningless.
label_source_cols = ['ARRIVAL_DELAY']
# NOTE(review): the delay-reason breakdown and the post-landing measurements
# are presumed to be known only once the flight has arrived (and ELAPSED_TIME
# together with DEPARTURE_DELAY and SCHEDULED_TIME presumably reconstructs
# ARRIVAL_DELAY) - confirm against the dataset's column descriptions.
delay_breakdown_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
post_arrival_cols = ['ARRIVAL_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN']
leakage_cols = label_source_cols + delay_breakdown_cols + post_arrival_cols

# The label itself must exist (KeyError otherwise); leakage columns that are
# already absent are tolerated.
X = df.drop(columns=['DELAYED']).drop(columns=leakage_cols, errors='ignore')
y = df['DELAYED']

# Randomly undersample with a fixed seed so the result is reproducible.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

df_undersampled = pd.concat([X_resampled, y_resampled], axis=1)

# Class balance after resampling.
on_time_count = (df_undersampled['DELAYED'] == 0).sum()
delay_count = (df_undersampled['DELAYED'] == 1).sum()
total = on_time_count + delay_count
on_time_percentage = (on_time_count / total) * 100
delay_percentage = (delay_count / total) * 100

print(f"\nAp√≥s undersampling:")
print(f"N√£o atrasados: {on_time_count:,}".replace(",", "."))
print(f"Atrasados: {delay_count:,}".replace(",", "."))
print(f"Percentual de voos n√£o atrasados: {on_time_percentage:.2f}%")
print(f"Percentual de voos atrasados: {delay_percentage:.2f}%")
print(f"Shape do dataset balanceado: {df_undersampled.shape}")

In [None]:
df_delayed = df_undersampled[df_undersampled['DELAYED'] == 1]
df_on_time = df_undersampled[df_undersampled['DELAYED'] == 0]

# Rows requested per class, capped at what each class actually contains:
# DataFrame.sample(n=...) without replacement raises ValueError when n is
# larger than the frame, so a smaller dataset used to crash this cell.
requested_per_class = 1000000
sample_size_per_class = min(requested_per_class, len(df_delayed), len(df_on_time))
if sample_size_per_class < requested_per_class:
    print(f"Aviso: amostra limitada a {sample_size_per_class} linhas por classe (solicitado: {requested_per_class})")

df_delayed_sample = df_delayed.sample(n=sample_size_per_class, random_state=42)
df_on_time_sample = df_on_time.sample(n=sample_size_per_class, random_state=42)

df_sample = pd.concat([df_delayed_sample, df_on_time_sample])

# Shuffle so the two classes are interleaved, then renumber the rows.
df_sample = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)

print("Shape do DataFrame original:", df_undersampled.shape)
print("Shape do DataFrame amostrado:", df_sample.shape)
print("\nDistribui√ß√£o da classe 'DELAYED' no DataFrame amostrado:")
print(df_sample['DELAYED'].value_counts())
print(f"\n‚úÖ Neural Networks podem processar {sample_size_per_class*2:,} amostras com excelente escalabilidade")

In [None]:
# Separate the label from the features, then make a stratified 70/30 split
# with a fixed seed.
y_sample = df_sample['DELAYED']
X_sample = df_sample.drop(columns=['DELAYED'])

split_options = dict(test_size=0.3, stratify=y_sample, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, **split_options)

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
train_distribution = y_train.value_counts().to_dict()
test_distribution = y_test.value_counts().to_dict()

print(f"Tamanho do conjunto de treino: {n_train_rows}")
print(f"Tamanho do conjunto de teste: {n_test_rows}")
print(f"Distribui√ß√£o no treino: {train_distribution}")
print(f"Distribui√ß√£o no teste: {test_distribution}")

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. The original row index is
# carried over so the scaled frames stay aligned with y_train / y_test in any
# index-aligned pandas operation (previously the index was silently reset).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("‚úÖ Normaliza√ß√£o conclu√≠da com StandardScaler")
print(f"M√©dia das features de treino ap√≥s normaliza√ß√£o: {X_train_scaled.mean().mean():.6f}")
print(f"Desvio padr√£o das features de treino ap√≥s normaliza√ß√£o: {X_train_scaled.std().mean():.6f}")
print(f"\nüìä Estat√≠sticas p√≥s-normaliza√ß√£o (treino):")
print(f"Min: {X_train_scaled.min().min():.3f}")
print(f"Max: {X_train_scaled.max().max():.3f}")
print(f"M√©dia: {X_train_scaled.mean().mean():.6f}")
print(f"Std: {X_train_scaled.std().mean():.6f}")

In [None]:

# Hyperparameters of the multilayer perceptron, gathered in one mapping.
mlp_params = {
    'hidden_layer_sizes': (64, 32),
    'activation': 'relu',
    'solver': 'adam',
    'learning_rate_init': 0.001,
    'max_iter': 500,
    'random_state': 42,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 10,
}
nn_model = MLPClassifier(**mlp_params)

# Fit on the scaled training data and measure the wall-clock duration.
start_time_train = time.time()
nn_model.fit(X_train_scaled, y_train)
end_time_train = time.time()
training_time = end_time_train - start_time_train

# Summary of the fitted model, one line per item.
training_report = (
    f"\nTempo de Treinamento: {training_time:.4f} segundos",
    f"Modelo Neural Network treinado com arquitetura: {nn_model.hidden_layer_sizes}",
    f"N√∫mero de itera√ß√µes realizadas: {nn_model.n_iter_}",
    f"Fun√ß√£o de ativa√ß√£o: {nn_model.activation}",
    f"Otimizador: {nn_model.solver}",
    f"Taxa de aprendizado: {nn_model.learning_rate_init}",
    f"Early stopping ativado: {nn_model.early_stopping}",
)
for report_line in training_report:
    print(report_line)

In [None]:
# Time the hard-label predictions on the scaled test set.
start_time_pred = time.time()
y_pred = nn_model.predict(X_test_scaled)
end_time_pred = time.time()
prediction_time = end_time_pred - start_time_pred

n_scored = len(X_test_scaled)
print(f"Tempo de Predi√ß√£o: {prediction_time:.4f} segundos")
print(f"Predi√ß√µes realizadas para {n_scored} amostras")

# Class probabilities; column 0 / column 1 are averaged separately below.
y_pred_proba = nn_model.predict_proba(X_test_scaled)
mean_proba_class0 = y_pred_proba[:, 0].mean()
mean_proba_class1 = y_pred_proba[:, 1].mean()
print(f"\nüìä Probabilidades calculadas para an√°lise de confian√ßa")
print(f"Probabilidade m√©dia para classe 0: {mean_proba_class0:.4f}")
print(f"Probabilidade m√©dia para classe 1: {mean_proba_class1:.4f}")


# Training is reported as converged when it stopped before reaching max_iter.
if nn_model.n_iter_ < nn_model.max_iter:
    convergence_label = 'Sim'
else:
    convergence_label = 'N√£o (max_iter atingido)'

print(f"\nüîÑ Informa√ß√µes de Converg√™ncia:")
print(f"Loss final: {nn_model.loss_:.6f}")
print(f"Convergiu: {convergence_label}")

In [None]:

# Classification metrics on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# The confusion matrix is computed once and reused. labels=[0, 1] pins it to
# 2x2, so the four-way unpacking below still works when one class is missing
# from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

tpr = recall  # true positive rate is the recall of class 1
tnr = tn / (tn + fp)

print("=== M√âTRICAS NO CONJUNTO DE TESTE ===")
print(f"Acur√°cia: {accuracy:.4f}")
print(f"Precis√£o: {precision:.4f}")
print(f"Recall (TPR): {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"True Negative Rate (TNR): {tnr:.4f}")
print("\nMatriz de Confus√£o:")
print(cm)

In [None]:

# Same metrics on the training split, to compare against the test results.
y_train_pred = nn_model.predict(X_train_scaled)

accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Training-set confusion counts get their own names so the test-set
# tn/fp/fn/tp from the previous cell are not overwritten. labels=[0, 1]
# keeps the matrix 2x2 so the unpacking cannot fail on a missing class.
tn_train, fp_train, fn_train, tp_train = confusion_matrix(
    y_train, y_train_pred, labels=[0, 1]
).ravel()
tpr_train = recall_train
tnr_train = tn_train / (tn_train + fp_train)

print("=== M√âTRICAS NO CONJUNTO DE TREINO ===")
print(f"Acur√°cia: {accuracy_train:.4f}")
print(f"Precis√£o: {precision_train:.4f}")
print(f"Recall (TPR): {recall_train:.4f}")
print(f"F1-score: {f1_train:.4f}")
print(f"True Negative Rate (TNR): {tnr_train:.4f}")

print("\n=== COMPARA√á√ÉO TREINO vs TESTE ===")
print(f"Diferen√ßa de Acur√°cia: {accuracy_train - accuracy:.4f}")
print(f"Diferen√ßa de F1-score: {f1_train - f1:.4f}")

# Overfitting heuristic: flag the model when training accuracy exceeds test
# accuracy by more than this gap.
OVERFIT_ACC_GAP = 0.05
acc_diff = accuracy_train - accuracy
if acc_diff > OVERFIT_ACC_GAP:
    print(f"\n‚ö†Ô∏è  POSS√çVEL OVERFITTING DETECTADO!")
else:
    print(f"\n‚úÖ Modelo parece estar generalizando bem.")
# Both branches report the same gap line.
print(f"Diferen√ßa de acur√°cia treino-teste: {acc_diff:.4f}")

In [None]:
from sklearn.pipeline import Pipeline

banner = "=" * 50
print("\n" + banner)
print("VALIDA√á√ÉO CRUZADA 5-FOLDS - NEURAL NETWORK")
print(banner)

# Stratified, shuffled 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One scorer per metric, keyed by the name that appears in cv_results.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

# Scaling lives inside the pipeline so it is re-fitted within every fold.
cv_mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)
nn_pipeline = Pipeline([('scaler', StandardScaler()), ('nn', cv_mlp)])

cv_results = cross_validate(nn_pipeline, X_sample, y_sample, cv=cv, scoring=scoring, return_train_score=True)

# Mean and standard deviation across folds, train vs. validation, per metric.
metric_labels = (
    ('accuracy', 'Acur√°cia'),
    ('precision', 'Precis√£o'),
    ('recall', 'Recall'),
    ('f1', 'F1-score'),
)
for metric_key, metric_label in metric_labels:
    fold_train = cv_results[f'train_{metric_key}']
    fold_valid = cv_results[f'test_{metric_key}']
    print(f"{metric_label} - Treino: {fold_train.mean():.4f} ¬± {fold_train.std():.4f}")
    print(f"{metric_label} - Valida√ß√£o: {fold_valid.mean():.4f} ¬± {fold_valid.std():.4f}")

# Same overfitting heuristic as for the single split: a train-minus-validation
# accuracy gap above 0.05 is flagged.
cv_acc_diff = cv_results['train_accuracy'].mean() - cv_results['test_accuracy'].mean()
if cv_acc_diff > 0.05:
    print(f"\n‚ö†Ô∏è  POSS√çVEL OVERFITTING DETECTADO NA VALIDA√á√ÉO CRUZADA!")
else:
    print(f"\n‚úÖ Modelo generalizando bem na valida√ß√£o cruzada.")
print(f"Diferen√ßa de acur√°cia treino-valida√ß√£o: {cv_acc_diff:.4f}")

In [None]:
process = psutil.Process()
nn_perf = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10
)

# Durations are recorded by the profiled callables themselves. Timing around
# memory_usage(...) would also count the profiler's own setup/teardown.
# NOTE(review): memory_profiler may additionally re-invoke a short-running
# callable to collect enough samples - confirm against its documentation.
# Measuring inside the callable keeps the fit/predict time (of the last
# invocation) free of that overhead.
perf_timings = {}


def train_nn_model():
    """Fit nn_perf on the scaled training data and record the fit duration."""
    fit_started = time.perf_counter()
    nn_perf.fit(X_train_scaled, y_train)
    perf_timings['train'] = time.perf_counter() - fit_started


mem_usage_train = memory_usage(train_nn_model)
training_time = perf_timings['train']
train_ips = len(X_train_scaled) / training_time


def predict_nn_model():
    """Predict on the scaled test data, storing predictions in the global
    y_pred_perf and the prediction duration in perf_timings."""
    global y_pred_perf
    predict_started = time.perf_counter()
    y_pred_perf = nn_perf.predict(X_test_scaled)
    perf_timings['predict'] = time.perf_counter() - predict_started


# With interval=None the first call only sets the reference point; the second
# call reports the process CPU utilisation since the first one.
cpu_percent_before = process.cpu_percent(interval=None)
mem_usage_pred = memory_usage(predict_nn_model)
cpu_percent_after = process.cpu_percent(interval=None)

prediction_time = perf_timings['predict']
pred_ips = len(X_test_scaled) / prediction_time

separator = "=" * 50
print("\n" + separator)
print("AN√ÅLISE DE DESEMPENHO COMPUTACIONAL - NEURAL NETWORK")
print(separator)

# Peak of the memory samples collected during each profiled call.
peak_mem_train = max(mem_usage_train)
peak_mem_pred = max(mem_usage_pred)

print(f"üïí Tempo de Treinamento: {training_time:.4f} s")
print(f"üïí Tempo de Predi√ß√£o: {prediction_time:.4f} s")
print(f"üìà Mem√≥ria (Treinamento): {peak_mem_train:.2f} MB")
print(f"üìà Mem√≥ria (Predi√ß√£o): {peak_mem_pred:.2f} MB")
print(f"‚öôÔ∏è CPU usada na predi√ß√£o: {cpu_percent_after:.2f}%")
print(f"üìä Inst√¢ncias por segundo (treinamento): {train_ips:.2f}")
print(f"üìä Inst√¢ncias por segundo (predi√ß√£o): {pred_ips:.2f}")
print(f"üß† Itera√ß√µes realizadas: {nn_perf.n_iter_}")
print(f"üß† Loss final: {nn_perf.loss_:.6f}")

# Extrapolate the measured prediction rate to longer time windows.
daily_predictions = 24 * 60 * 60 * pred_ips  # predictions per day
per_minute = pred_ips * 60
per_hour = pred_ips * 3600
print(f"\nüìà THROUGHPUT PARA CEN√ÅRIOS REAIS:")
print(f"Predi√ß√µes por segundo: {pred_ips:.0f}")
print(f"Predi√ß√µes por minuto: {per_minute:.0f}")
print(f"Predi√ß√µes por hora: {per_hour:.0f}")
print(f"Predi√ß√µes por dia: {daily_predictions:.0f}")

# Parameter count = all weight-matrix entries plus all bias entries.
n_weights = sum(weights.size for weights in nn_perf.coefs_)
n_biases = sum(biases.size for biases in nn_perf.intercepts_)
print(f"\nüß† CARACTER√çSTICAS DA ARQUITETURA:")
print(f"Camadas ocultas: {nn_perf.hidden_layer_sizes}")
print(f"Total de par√¢metros: {n_weights + n_biases}")
print(f"Fun√ß√£o de ativa√ß√£o: {nn_perf.activation}")
print(f"Otimizador: {nn_perf.solver}")
print(f"Early stopping: {nn_perf.early_stopping}")