# In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# 0) Load the dataset produced by the "missing data" step

# Absolute, machine-specific Windows locations: the input CSV, the folder that
# receives EDA figures, and the folder that receives this step's output.
# NOTE(review): these must be edited before running on any other machine.
DATA_PATH = Path(r"C:\Users\Abdur Rahman\AIML_PROJECT\data\after_missing.csv")
VIZ_DIR   = Path(r"C:\Users\Abdur Rahman\AIML_PROJECT\eda_visualizations")
OUT_DIR   = Path(r"C:\Users\Abdur Rahman\AIML_PROJECT\outputs")

# Read the whole CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH, "shape:", df.shape)


# 1) Date features from FL_DATE
# Replace the raw flight date with three numeric calendar columns.
if "FL_DATE" in df.columns:
    # errors="coerce": values that cannot be parsed become NaT instead of raising.
    flight_dates = pd.to_datetime(df["FL_DATE"], errors="coerce")
    # Drop the raw date column and append the derived parts in one step.
    df = df.drop(columns=["FL_DATE"]).assign(
        YEAR=flight_dates.dt.year,
        MONTH=flight_dates.dt.month,
        DAY_OF_WEEK=flight_dates.dt.dayofweek,  # Monday=0 … Sunday=6
    )

# 2) Categorical columns to encode
# Keep only the expected categorical columns that are actually present in df.
cat_cols = [
    name
    for name in ("OP_UNIQUE_CARRIER", "ORIGIN", "DEST")
    if name in df.columns
]
print("Categorical columns found:", cat_cols)

# Helper: one-hot ALL categories of a column
def one_hot_all(data: pd.DataFrame, col: str, prefix: str = None):
    """Replace ``col`` with one indicator column per distinct value.

    Args:
        data: Frame containing ``col``.
        col: Name of the column to encode; it is removed from the result.
        prefix: Prefix for the indicator column names; defaults to ``col``.

    Returns:
        A new DataFrame: ``data`` without ``col``, followed by the indicator
        columns produced by ``pd.get_dummies``.
    """
    label = col if prefix is None else prefix
    as_category = data[col].astype("category")
    indicator_frame = pd.get_dummies(as_category, prefix=label)
    remaining = data.drop(columns=[col])
    return pd.concat([remaining, indicator_frame], axis=1)

# Helper: one-hot TOP-K categories; rest → OTHER
def one_hot_topk(data: pd.DataFrame, col: str, k: int = 30, prefix: str = None):
    """One-hot encode the ``k`` most frequent values of ``col``; pool the rest.

    Values of ``col`` that are not among the ``k`` most frequent (as ranked by
    ``value_counts``) are replaced with the literal label ``"OTHER"`` before
    encoding, so the result gains at most ``k + 1`` indicator columns named
    ``<prefix>_<value>``. The input frame is not modified.

    Args:
        data: Frame containing ``col``.
        col: Name of the column to encode; it is removed from the result.
        k: How many of the most frequent values keep their own indicator
            column. Must be non-negative.
        prefix: Prefix for the indicator column names; defaults to ``col``.

    Returns:
        A new DataFrame: ``data`` without ``col``, followed by the indicator
        columns.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        # head(-n) means "everything except the last n", which would silently
        # keep the wrong set of categories instead of failing.
        raise ValueError(f"k must be non-negative, got {k}")
    if prefix is None:
        prefix = col

    column = data[col]
    top = column.value_counts().head(k).index
    # Build the bucketed labels as a separate Series so the caller's frame is
    # never written to (assigning back into data[col] mutated the input).
    # NOTE(review): values absent from `top` include missing ones, so NaN rows
    # presumably land in "OTHER" as well - confirm that is intended.
    bucketed = pd.Series(
        np.where(column.isin(top), column, "OTHER"),
        index=data.index,
    )
    dummies = pd.get_dummies(bucketed.astype("category"), prefix=prefix)
    return pd.concat([data.drop(columns=[col]), dummies], axis=1)


# 3) Apply encoding strategies
# Carriers get one indicator per distinct value; airports keep only the most
# frequent TOP_K_AIRPORTS codes and pool everything else under OTHER.
TOP_K_AIRPORTS = 30

if "OP_UNIQUE_CARRIER" in cat_cols:
    df = one_hot_all(df, "OP_UNIQUE_CARRIER", prefix="CARRIER")

for col in ("ORIGIN", "DEST"):
    if col not in cat_cols:
        continue
    df = one_hot_topk(df, col, k=TOP_K_AIRPORTS, prefix=col)

print("After encoding shape:", df.shape)


# 4) Quick EDA: airline distribution
# Sum each CARRIER_* indicator column to get a per-carrier row count, then
# chart the 15 largest and save the figure under VIZ_DIR.
# NOTE(review): the prefix match also picks up any pre-existing column whose
# name starts with "CARRIER_" (e.g. a delay field, if the dataset has one) -
# confirm no such column is present.
carrier_cols = [name for name in df.columns if name.startswith("CARRIER_")]
if carrier_cols:
    shares = df[carrier_cols].sum().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    shares.head(15).plot(kind="bar", ax=ax)
    ax.set_title("Top Airlines by Share (after encoding)")
    ax.set_ylabel("Row count (approx. flights)")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()

    # Create the output folder on demand so a fresh checkout can save figures.
    VIZ_DIR.mkdir(parents=True, exist_ok=True)
    viz_path = VIZ_DIR / "airline_distribution.png"
    fig.savefig(viz_path)
    plt.show()
    print("Saved visualization ->", viz_path)


# 5) Save for the next step
# Write the encoded frame (without the index column) to OUT_DIR, creating the
# folder first if it does not exist yet.
out_path = OUT_DIR.joinpath("after_encoding.csv")
OUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print("Saved encoded dataset ->", out_path)