In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
# Seed NumPy's legacy global RNG so any code that draws from np.random is repeatable.
# NOTE(review): the sampling/splitting code visible in this notebook passes its own
# explicit random_state values, so it does not depend on this global seed.
np.random.seed(50)

In [24]:
# Load the raw flights table. low_memory=False asks pandas to process the file in a
# single pass instead of internal chunks, so each column gets one consistent dtype.
df = pd.read_csv("flights.csv", low_memory=False)
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [25]:
# Print the frame's row count, column names and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5819079 entries, 0 to 5819078
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   YEAR                 int64  
 1   MONTH                int64  
 2   DAY                  int64  
 3   DAY_OF_WEEK          int64  
 4   AIRLINE              object 
 5   FLIGHT_NUMBER        int64  
 6   TAIL_NUMBER          object 
 7   ORIGIN_AIRPORT       object 
 8   DESTINATION_AIRPORT  object 
 9   SCHEDULED_DEPARTURE  int64  
 10  DEPARTURE_TIME       float64
 11  DEPARTURE_DELAY      float64
 12  TAXI_OUT             float64
 13  WHEELS_OFF           float64
 14  SCHEDULED_TIME       float64
 15  ELAPSED_TIME         float64
 16  AIR_TIME             float64
 17  DISTANCE             int64  
 18  WHEELS_ON            float64
 19  TAXI_IN              float64
 20  SCHEDULED_ARRIVAL    int64  
 21  ARRIVAL_TIME         float64
 22  ARRIVAL_DELAY        float64
 23  DIVERTED             int64  
 24

In [26]:
# Summary statistics (count, mean, std, quartiles, min/max), transposed so that
# each column of df becomes one row of the displayed table.
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,5819079.0,2015.0,0.0,2015.0,2015.0,2015.0,2015.0,2015.0
MONTH,5819079.0,6.524085,3.405137,1.0,4.0,7.0,9.0,12.0
DAY,5819079.0,15.704594,8.783425,1.0,8.0,16.0,23.0,31.0
DAY_OF_WEEK,5819079.0,3.926941,1.988845,1.0,2.0,4.0,6.0,7.0
FLIGHT_NUMBER,5819079.0,2173.092742,1757.063999,1.0,730.0,1690.0,3230.0,9855.0
SCHEDULED_DEPARTURE,5819079.0,1329.60247,483.751821,1.0,917.0,1325.0,1730.0,2359.0
DEPARTURE_TIME,5732926.0,1335.204439,496.42326,1.0,921.0,1330.0,1740.0,2400.0
DEPARTURE_DELAY,5732926.0,9.370158,37.080942,-82.0,-5.0,-2.0,7.0,1988.0
TAXI_OUT,5730032.0,16.071662,8.895574,1.0,11.0,14.0,19.0,225.0
WHEELS_OFF,5730032.0,1357.170841,498.009356,1.0,935.0,1343.0,1754.0,2400.0


In [27]:
# Regression target: arrival delay.
alvo = "ARRIVAL_DELAY"

# Predictor columns, in the order they will appear in the modelling frame.
features = [
    "MONTH", "DAY", "DAY_OF_WEEK",                        # calendar fields
    "DEPARTURE_DELAY", "DISTANCE",                        # numeric fields
    "AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",   # categorical fields
]

# Keep only predictors + target, discard every row with a missing value in any of
# them, and take an independent copy so later edits never touch the raw frame.
df_model = df.loc[:, [*features, alvo]].dropna().copy()
df_model.head()


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,DEPARTURE_DELAY,DISTANCE,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,ARRIVAL_DELAY
0,1,1,4,-11.0,1448,AS,ANC,SEA,-22.0
1,1,1,4,-8.0,2330,AA,LAX,PBI,-9.0
2,1,1,4,-2.0,2296,US,SFO,CLT,5.0
3,1,1,4,-5.0,2342,AA,LAX,MIA,-9.0
4,1,1,4,-1.0,1448,AS,SEA,ANC,-21.0


In [None]:
num_cols = ["MONTH", "DAY", "DAY_OF_WEEK", "DEPARTURE_DELAY", "DISTANCE"]
cat_cols = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"]

# Work on a fixed-seed 100k-row subsample of the cleaned modelling frame.
df_small = df_model.sample(n=100_000, random_state=100).copy()
X_num = df_small[num_cols]
X_cat = df_small[cat_cols]

# One-hot encode the categorical columns, dropping the first level of each.
# dtype=float is explicit because pandas >= 2.0 emits bool dummies by default;
# mixing bool with numeric columns makes DataFrame.values an object array,
# which the NumPy linear-algebra routines used for fitting cannot handle.
X_cat_dummies = pd.get_dummies(X_cat, drop_first=True, dtype=float)

X_full = pd.concat([X_num, X_cat_dummies], axis=1)

# Target vector
y = df_small[alvo].values

print("Shape de X_full:", X_full.shape)
print("Shape de y:", y.shape)


Shape de X_full: (100000, 1195)
Shape de y: (100000,)


In [None]:
def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both arguments are converted with ``np.asarray`` and subtracted
    element-wise (NumPy broadcasting rules apply) before averaging the
    absolute differences.
    """
    errors = np.subtract(np.asarray(y_true), np.asarray(y_pred))
    return np.mean(np.absolute(errors))

def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Both arguments are converted with ``np.asarray`` and subtracted
    element-wise (NumPy broadcasting rules apply); the squared differences
    are then averaged.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.square(residuals))

def rmse(y_true, y_pred):
    """Return the root mean squared error: the square root of ``mse``."""
    mean_sq_error = mse(y_true, y_pred)
    return np.sqrt(mean_sq_error)

def r2_score_manual(y_true, y_pred):
    """Return the coefficient of determination R^2 = 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        The R^2 score. When ``y_true`` has zero variance the ratio is
        undefined; in that case 1.0 is returned for a perfect prediction and
        0.0 otherwise (the convention scikit-learn uses by default) instead
        of nan / -inf. An empty input yields nan.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Nothing to score; avoid the mean-of-empty-slice warning.
    if y_true.size == 0:
        return float("nan")

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    # Constant target: dividing by ss_tot would produce nan or -inf.
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0

    return 1 - ss_res / ss_tot

In [None]:
def _take_rows(data, idx):
    """Select rows of ``data`` by integer position, whatever its container type."""
    if isinstance(data, (pd.Series, pd.DataFrame)):
        # .iloc is positional; plain [] on a Series would look up index labels.
        return data.iloc[idx]
    return np.asanyarray(data)[idx]


def train_test_split_manual(X, y, test_size=0.2, random_state=42):
    """Shuffle the rows of ``X`` / ``y`` and split them into train and test parts.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix with one row per sample.
    y : array-like or pandas.Series
        Targets, one per row of ``X``.
    test_size : float, default 0.2
        Fraction of rows (between 0 and 1) assigned to the test part; the
        test row count is ``int(len(X) * test_size)``.
    random_state : int, default 42
        Seed for the ``np.random.RandomState`` used to shuffle the rows.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. A DataFrame ``X`` is returned
        as NumPy arrays (``.values``); a pandas ``y`` is sliced by position
        and keeps its pandas type.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different lengths or ``test_size`` is outside
        the interval [0, 1].
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(f"X and y must have the same length, got {n} and {len(y)}")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    rng = np.random.RandomState(random_state)
    indices = np.arange(n)
    rng.shuffle(indices)

    # The first chunk of the shuffled positions is the test part, the rest is train.
    test_size_int = int(n * test_size)
    test_idx = indices[:test_size_int]
    train_idx = indices[test_size_int:]

    X_train = _take_rows(X, train_idx)
    X_test = _take_rows(X, test_idx)
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X_train.values, X_test.values

    y_train = _take_rows(y, train_idx)
    y_test = _take_rows(y, test_idx)

    return X_train, X_test, y_train, y_test

# Hold out 20% of the encoded sample as a test set, using a fixed shuffle seed.
X_train, X_test, y_train, y_test = train_test_split_manual(X_full, y, test_size=0.2, random_state=42)

# Display the shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((80000, 1195), (20000, 1195))

In [None]:
def add_bias(X):
    """Prepend a column of ones (the intercept term) to a 2-D feature matrix.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Feature matrix.

    Returns
    -------
    numpy.ndarray of shape (n, d + 1)
        ``X`` with a leading column of ones. Object-dtype input (for example
        the ``.values`` of a DataFrame mixing bool and numeric columns) is
        converted to float so the result is usable in linear algebra.
    """
    X = np.asarray(X)
    # An object array would stay object after hstack and break np.linalg routines.
    if X.dtype == object:
        X = X.astype(float)
    n = X.shape[0]
    bias = np.ones((n, 1))
    return np.hstack([bias, X])


In [None]:
def fit_linear_regression(X, y):
    """Fit a multiple linear regression by solving the normal equations.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Feature matrix (without an intercept column).
    y : array-like of shape (n,)
        Target vector.

    Returns
    -------
    numpy.ndarray of shape (d + 1,)
        Coefficient vector ``beta``; ``beta[0]`` is the intercept.

    Notes
    -----
    The system ``(Xb.T @ Xb) beta = Xb.T @ y`` is solved with
    ``np.linalg.lstsq`` rather than an explicit matrix inverse, so a singular
    or ill-conditioned ``Xb.T @ Xb`` (e.g. all-zero or collinear dummy
    columns) yields the minimum-norm solution instead of an error or
    numerically meaningless coefficients.
    """
    # Force a float matrix: object-dtype input cannot be fed to np.linalg.
    Xb = np.asarray(add_bias(X), dtype=float)         # (n, d+1)
    # np.asarray lets y be a list or pandas Series, not only an ndarray.
    y = np.asarray(y, dtype=float).reshape(-1, 1)     # (n, 1)

    XtX = Xb.T @ Xb                                   # (d+1, d+1)
    Xty = Xb.T @ y                                    # (d+1, 1)

    beta, *_ = np.linalg.lstsq(XtX, Xty, rcond=None)  # (d+1, 1)
    return beta.flatten()


In [None]:
def predict_linear(X, beta):
    """Return linear-model predictions for ``X`` given coefficients ``beta``.

    A bias column is added to ``X`` via ``add_bias`` and the result is
    matrix-multiplied by ``beta``, so ``beta`` must include the intercept
    term in the position that ``add_bias`` gives the bias column.
    """
    design = add_bias(X)
    return np.matmul(design, beta)


In [31]:
# Draw a fixed-seed 50k-row sample of the modelling frame.
# NOTE(review): this rebinds df_small, which an earlier cell created with
# n=100_000 and used to build X_full and y; after this cell df_small no longer
# matches those arrays. Confirm which sample later cells are meant to use.
df_small = df_model.sample(50_000, random_state=100)

In [None]:
def plot_residuals(y_true, y_pred, sample_size=5000):
    """Scatter-plot residuals (``y_true - y_pred``) against predicted values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.
    sample_size : int, default 5000
        Maximum number of points to draw; a fixed-seed random subset is
        plotted when more points are available.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Flatten to plain 1-D arrays so the two inputs are paired by position;
    # pandas Series with different indexes would otherwise be aligned by label.
    temp = pd.DataFrame({
        "y_true": np.asarray(y_true).ravel(),
        "y_pred": np.asarray(y_pred).ravel(),
    })

    temp = temp.sample(n=min(sample_size, len(temp)), random_state=42)

    residuals = temp["y_true"] - temp["y_pred"]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=temp["y_pred"], y=residuals, alpha=0.4)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Predicted Values (sample)")
    plt.ylabel("Residuals (sample)")
    # Report the number of points actually drawn rather than a hard-coded 5000.
    plt.title(f"Residuals vs Predicted Values (Sample of {len(temp)})")
    plt.show()
