In [1]:
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably where the `src` package imported below lives — confirm).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

In [5]:
from src.data import extract_data
from src.utils import init_hydra


# Build the Hydra config called "main", then extract dataset version "v7.0".
# Only the first element of the extract_data result is used in this notebook.
cfg = init_hydra("main")
extracted = extract_data("v7.0", cfg)
df, _ = extracted

In [6]:
# Class balance of the target column on the freshly loaded data.
df["Cancelled"].value_counts()

Cancelled
False    799398
True      16266
Name: count, dtype: int64

In [None]:
# Partition the rows by the value of the target column.
cancelled = df[df["Cancelled"].eq(True)]
on_time = df[df["Cancelled"].eq(False)]

In [None]:
# (rows, columns) of the cancelled and non-cancelled subsets.
cancelled.shape, on_time.shape

In [None]:
# Ratio of cancelled to non-cancelled rows, i.e. the share of `on_time` that
# has to be kept for both subsets to have the same size. Despite the name this
# is a fraction, not a percentage; the name is kept because a later cell reads
# it. The previous "* 100 ... / 100" round trip was a no-op that only added
# floating-point rounding.
representative_persent = len(cancelled) / len(on_time)
print(representative_persent)

In [None]:
# Undersample the non-cancelled rows down to the number of cancelled rows.
# Drawing an explicit row count with a fixed seed makes the balanced dataset
# reproducible, and re-running this cell re-draws the same number of rows
# instead of shrinking `on_time` a second time.
on_time = on_time.sample(n=len(cancelled), random_state=1)

In [None]:
import pandas as pd

# Stack the cancelled rows on top of the sampled non-cancelled rows; from here
# on `df` refers to this combined frame.
df = pd.concat(objs=(cancelled, on_time), axis="index")

In [None]:
# Size of the combined frame after concatenating both class subsets.
df.shape

In [None]:
import matplotlib.pyplot as plt
# Pie chart of the target distribution; autopct prints each slice's percentage.
class_counts = df["Cancelled"].value_counts()
class_counts.plot.pie(autopct='%1.1f%%')

# Render the figure.
plt.show()

In [None]:
# Columns available before feature selection.
df.columns

# 1. Pull features

In [None]:
from pandas import DataFrame


def pull_features(df: DataFrame, required: list[str]) -> DataFrame:
    """
    Extract only the required features from the dataframe.

    Args:
        df: Source dataframe; it is not modified.
        required: Names of the columns to keep.

    Returns:
        A new dataframe holding only the required columns, in the order in
        which they appear in ``df``.

    Raises:
        ValueError: If any required column is absent from ``df``. The message
            lists every missing column, not just the first one.
    """
    # Collect all missing names up front so one error reports them together.
    missing = [c for c in dict.fromkeys(required) if c not in df.columns]
    if missing:
        raise ValueError(
            "Dataframe lacks one or more of the required columns: "
            + ", ".join(str(c) for c in missing)
        )

    # A boolean mask keeps the original column order and copies only the
    # selected columns instead of duplicating the whole frame first.
    keep_mask = df.columns.isin(list(required))
    return df.loc[:, keep_mask].copy()

In [None]:
# Keep only the columns listed under `required` in the config.
required: list[str] = cfg.required
df = pull_features(df=df, required=required)

# 2. Drop NaNs

In [None]:
# Target distribution and remaining columns before NaN handling.
print(df["Cancelled"].value_counts(), df.columns, sep="\n")

In [None]:
# 2. Drop NaNs
# NOTE(review): axis=1 removes every *column* that contains at least one NaN
# (all rows are kept) — confirm that dropping whole features is intended.
df = df.dropna(axis="columns")
# Total number of NaN cells left in the frame.
df.isna().sum().sum()

In [None]:
# Target distribution and surviving columns after NaN handling.
print(df["Cancelled"].value_counts(), df.columns, sep="\n")

# 3. Fix and transform cyclic features

In [None]:
import hashlib
import numpy as np

from sklearn.pipeline import FunctionTransformer


def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""

    def _to_sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_to_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""

    def _to_cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_to_cosine)


def encode_cyclic_time_data(df: DataFrame, col: str, period: int) -> DataFrame:
    """
    Add sine and cosine encodings of a cyclic column to the dataframe.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written onto ``df``
    itself; the source column is kept.

    Args:
        df: Dataframe to extend; it is modified in place.
        col: Name of the column holding the cyclic value.
        period: Length of one full cycle, in the same unit as ``col``.

    Returns:
        The same dataframe object, with the two encoded columns added.

    Raises:
        ValueError: If ``col`` is not a column of ``df`` or ``period`` is not
            positive.
    """
    # Check that the column exists
    if col not in df.columns:
        raise ValueError(f"{col} is expected in the dataframe, but not found.")
    # A zero period would silently fill both columns with inf/NaN.
    if period <= 0:
        raise ValueError(f"period must be positive, got {period}.")

    # The mapping is stateless, so apply numpy directly and compute the phase
    # angle once for both encodings.
    angle = df[col] / period * 2 * np.pi
    df[col + "_sin"] = np.sin(angle)
    df[col + "_cos"] = np.cos(angle)

    return df


def fix_hhmm(df: DataFrame, col: str) -> tuple[DataFrame, str, str]:
    """
    Split a numeric hhmm column into separate hour and minute columns.

    The hour is the value integer-divided by 100 and the minute is the
    remainder. The source column is removed from ``df``.

    Args:
        df: Dataframe to transform; it is modified in place.
        col: Name of the hhmm column.

    Returns:
        A tuple ``(df, hour_column_name, minute_column_name)`` where the names
        are ``col + "HH"`` and ``col + "MM"``.
    """
    # Encoding hours and minutes
    colHH = col + "HH"
    colMM = col + "MM"
    # Vectorised arithmetic on the whole column instead of one Python call
    # per row.
    df[colHH] = df[col] // 100
    df[colMM] = df[col] % 100

    df.drop(columns=[col], inplace=True)
    return (df, colHH, colMM)

In [None]:
# Fix hhmm columns: split each configured column into hour/minute parts and
# encode them cyclically with periods of 24 and 60 respectively.
for c in cfg["hhmm"]:
    df, colHH, colMM = fix_hhmm(df, c)
    for part_col, part_period in ((colHH, 24), (colMM, 60)):
        df = encode_cyclic_time_data(df, part_col, part_period)

In [None]:
# Transform cyclic features: each config entry supplies the column name at
# index 0 and its period at index 1.
for tf in cfg["time_features"]:
    feature_name, feature_period = tf[0], tf[1]
    df = encode_cyclic_time_data(df, feature_name, feature_period)

# 4. Feature crossing

In [None]:
def feature_cross(df: DataFrame, col1: str, col2: str) -> DataFrame:
    """
    Return a copy of ``df`` extended with the per-group mean of ``col2``.

    Rows are grouped by ``col1``; every row receives the mean of ``col2``
    over its group in a new column named ``f"{col1}_{col2}Mean"``.

    Args:
        df: Source dataframe; it is not modified.
        col1: Column to group by.
        col2: Column whose group mean is attached.

    Returns:
        A new dataframe with the same rows, index and row order as ``df``
        plus the crossed column. The previous implementation bound the
        result to a local name and returned nothing, so callers never saw it.
    """
    crossed_col = f"{col1}_{col2}Mean"
    # transform("mean") broadcasts the group mean back onto the original
    # index, so no merge is needed and rows are neither dropped nor reordered.
    group_mean = df.groupby(col1)[col2].transform("mean")
    return df.assign(**{crossed_col: group_mean})


# NOTE(review): the value returned by feature_cross is not assigned here, so
# `df` carries no crossed columns after this cell. If crossed features are
# wanted, the result must be assigned here and the same step applied wherever
# evaluation data is prepared.
for group_col in ("DayofMonth", "Quarter"):
    feature_cross(df, group_col, "Distance")

In [None]:
# Check whether a crossed column with this name is present on `df`.
"DayofMonth_DistanceMean" in df.columns

# 5. One-hot encoding

In [None]:
# import pandas as pd

# df = pd.get_dummies(df, columns=df.columns[df.dtypes == 'object'])

In [None]:
# Number of columns currently in the frame.
df.shape[1]

# 6. Hashing

In [None]:
def hash_feature(df: DataFrame, col: str, num_buckets=1000):
    """
    Replace a string column with a stable hash bucket index.

    Each value is UTF-8 encoded, hashed with MD5 and reduced modulo
    ``num_buckets``, so the mapping is identical across runs and processes.

    Args:
        df: Dataframe to transform; it is modified in place.
        col: Name of the string column to hash.
        num_buckets: Number of buckets; results lie in ``[0, num_buckets)``.

    Returns:
        The same dataframe object with ``col`` replaced by bucket indices.

    Raises:
        ValueError: If ``num_buckets`` is not positive.
    """
    if num_buckets <= 0:
        raise ValueError(f"num_buckets must be positive, got {num_buckets}.")

    # Hash every distinct value once and look the rest up, rather than
    # recomputing MD5 for each row.
    bucket_of = {
        text: int(hashlib.md5(text.encode()).hexdigest(), 16) % num_buckets
        for text in df[col].unique()
    }
    df[col] = df[col].map(bucket_of)
    return df

In [None]:
# Hash every object-dtype column into 10 buckets. The column list is fixed
# before the loop starts, so columns converted along the way are not revisited.
object_columns = df.columns[df.dtypes == "object"]
for c in object_columns:
    df = hash_feature(df, c, 10)

In [None]:
# print(df.shape, dff.shape)

In [None]:
# df = pd.concat([df, dff.drop(['Cancelled'], axis=1)], axis=1)
# df

## Taking a random sample, for performance

In [None]:
# Separate the target column from the feature matrix.
y = df["Cancelled"]
X = df.drop(columns=["Cancelled"])

In [None]:
# Class balance of the target that goes into the train/test split.
y.value_counts()

# 7. Scale features

In [None]:
# NOT REQUIRED FOR XGBOOST

# from sklearn.discriminant_analysis import StandardScaler


# sc = StandardScaler()

# df = sc.fit_transform(X)
# df

In [None]:
from sklearn.model_selection import train_test_split


# Shuffled, stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=1
)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Seeded resamplers. Only `ros` is used in the cells shown here; `rus` is
# defined alongside it.
ros = RandomOverSampler(random_state=1)
rus = RandomUnderSampler(random_state=1)

In [None]:
# Per-class counts of the training target before and after random oversampling.
counts_before = np.bincount(y_train)
print(counts_before)

X_train, y_train = ros.fit_resample(X_train, y_train)

counts_after = np.bincount(y_train)
print(counts_after)

In [None]:
# How many training columns there are of each dtype.
X_train.dtypes.value_counts()

# Leaving only Top Gs

In [None]:
# Columns to keep on `df`.
top_gs = [
    "Quarter",
    "Marketing_Airline_Network",
    "DayofMonth",
    "Operated_or_Branded_Code_Share_Partners",
    "Airline",
    "Operating_Airline",
    "OriginWac",
    "DayofMonth_sin",
    "DayOfWeek",
    "DestWac",
    "CRSDepTimeHH",
]

# NOTE(review): X, y and the train/test split were already built from `df` in
# earlier cells, so trimming `df` here does not change what the models below
# are fitted on. "Cancelled" is not in top_gs, so the target column is removed
# from `df` as well — confirm this is intended.
columns_outside_top = [name for name in df.columns if name not in top_gs]
df.drop(columns=columns_outside_top, inplace=True)

In [None]:
# Inspect `df` after it has been restricted to the top_gs columns.
df

# Training the model

In [None]:
# Inspect the training features that the models below are fitted on.
X_train

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on plain arrays (`.values`), so the model is trained
# without column names.
rf = RandomForestClassifier(
    n_estimators=1500,
    max_depth=1500,
    random_state=42,
)
rf.fit(X_train.values, y_train.values)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the histogram tree method, fitted on the same
# arrays as the random forest.
xgb_params = dict(
    random_state=42,
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=1500,
    subsample=0.999,
    colsample_bytree=0.9999999,
    tree_method="hist",
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train.values, y_train.values)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
)
import seaborn as sn


# Evaluate the random forest on the held-out split.
y_pred = rf.predict(X_test.values)
# ROC AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of the positive class instead of the thresholded
# labels. `classes_` is sorted, so column 1 is the larger label (True).
y_score = rf.predict_proba(X_test.values)[:, 1]

# scikit-learn metrics take (y_true, y_pred) in that order.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 score: %.3f" % f1_score(y_test, y_pred, average="weighted"))
print("Recall: %.3f" % recall_score(y_test, y_pred, average="weighted"))
print("Precision: %.3f" % precision_score(y_test, y_pred, average="weighted"))
print("AUC Score: %.3f" % roc_auc_score(y_test, y_score, average="weighted"))

print("Classification report:\n", classification_report(y_test, y_pred))

# Confusion matrix shown as a share of all evaluated samples.
print("Confusion Matrix:")
conf_mat = confusion_matrix(y_test, y_pred)
sn.heatmap(conf_mat/np.sum(conf_mat), annot=True,  fmt='.2%')

In [None]:
import matplotlib.pyplot as plt

# Pair every column of X with the importance reported by the fitted XGBoost
# model, then rank the pairs from most to least important.
feature_importances = xgb.feature_importances_
features = X.columns
importance_scores = list(zip(features, feature_importances))
sorted_importance_scores = sorted(
    importance_scores, key=lambda pair: pair[1], reverse=True
)

# Split the ranked pairs into tick labels and bar heights.
ranked_names = [name for name, _ in sorted_importance_scores]
ranked_scores = [score for _, score in sorted_importance_scores]
bar_positions = range(len(sorted_importance_scores))

# Bar chart of the ranked importances, one bar per feature.
plt.figure(figsize=(12, 8))
plt.bar(bar_positions, ranked_scores)
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.title("Feature Importance")
plt.xlabel("Feature")
plt.ylabel("Importance Score")
plt.tight_layout()
plt.show()

In [None]:
# sorted_importance_scores = [s[0] for s in sorted_importance_scores if s[1] >= 0.02]
# (feature, importance) pairs, most important first.
sorted_importance_scores

In [None]:
from src.data import extract_data
from src.utils import init_hydra


# Rebuild the config and extract dataset version "v2.0"; `cfg` and `df` are
# overwritten, so from here on `df` refers to this second dataset.
cfg = init_hydra("main")
extracted = extract_data("v2.0", cfg)
df, _ = extracted

In [None]:
# Keep only the columns listed under `required` in the config.
required: list[str] = cfg.required
df = pull_features(df=df, required=required)

# NOTE(review): columns are dropped according to the NaNs found in *this*
# dataset, so the surviving columns may differ from those the models were
# trained on — verify before predicting.
df = df.dropna(axis="columns")
# Not displayed: this expression is not the last one in the cell.
df.isna().sum().sum()

import hashlib
import numpy as np

from sklearn.pipeline import FunctionTransformer


def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""

    def _to_sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_to_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""

    def _to_cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_to_cosine)


def encode_cyclic_time_data(df: DataFrame, col: str, period: int) -> DataFrame:
    """
    Add sine and cosine encodings of a cyclic column to the dataframe.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written onto ``df``
    itself; the source column is kept.

    Args:
        df: Dataframe to extend; it is modified in place.
        col: Name of the column holding the cyclic value.
        period: Length of one full cycle, in the same unit as ``col``.

    Returns:
        The same dataframe object, with the two encoded columns added.

    Raises:
        ValueError: If ``col`` is not a column of ``df`` or ``period`` is not
            positive.
    """
    # Check that the column exists
    if col not in df.columns:
        raise ValueError(f"{col} is expected in the dataframe, but not found.")
    # A zero period would silently fill both columns with inf/NaN.
    if period <= 0:
        raise ValueError(f"period must be positive, got {period}.")

    # The mapping is stateless, so apply numpy directly and compute the phase
    # angle once for both encodings.
    angle = df[col] / period * 2 * np.pi
    df[col + "_sin"] = np.sin(angle)
    df[col + "_cos"] = np.cos(angle)

    return df


def fix_hhmm(df: DataFrame, col: str) -> tuple[DataFrame, str, str]:
    """
    Split a numeric hhmm column into separate hour and minute columns.

    The hour is the value integer-divided by 100 and the minute is the
    remainder. The source column is removed from ``df``.

    Args:
        df: Dataframe to transform; it is modified in place.
        col: Name of the hhmm column.

    Returns:
        A tuple ``(df, hour_column_name, minute_column_name)`` where the names
        are ``col + "HH"`` and ``col + "MM"``.
    """
    # Encoding hours and minutes
    colHH = col + "HH"
    colMM = col + "MM"
    # Vectorised arithmetic on the whole column instead of one Python call
    # per row.
    df[colHH] = df[col] // 100
    df[colMM] = df[col] % 100

    df.drop(columns=[col], inplace=True)
    return (df, colHH, colMM)


# Fix hhmm columns: split each configured column into hour/minute parts and
# encode them cyclically with periods of 24 and 60 respectively.
for c in cfg["hhmm"]:
    df, colHH, colMM = fix_hhmm(df, c)
    for part_col, part_period in ((colHH, 24), (colMM, 60)):
        df = encode_cyclic_time_data(df, part_col, part_period)

# Transform cyclic features: each config entry supplies the column name at
# index 0 and its period at index 1.
for tf in cfg["time_features"]:
    feature_name, feature_period = tf[0], tf[1]
    df = encode_cyclic_time_data(df, feature_name, feature_period)

# Hash every object-dtype column into 10 buckets; the column list is fixed
# before the loop starts.
object_columns = df.columns[df.dtypes == "object"]
for c in object_columns:
    df = hash_feature(df, c, 10)
    
# top_gs = [
#     "Quarter",
#     "Marketing_Airline_Network",
#     "DayofMonth",
#     "Operated_or_Branded_Code_Share_Partners",
#     "Airline",
#     "Operating_Airline",
#     "OriginWac",
#     "DayofMonth_sin",
#     "DayOfWeek",
#     "DestWac",
#     "CRSDepTimeHH",
#     "Cancelled"
# ]

# df.drop(
#     list(set(df.columns) - set(top_gs)),
#     axis=1,
#     inplace=True,
# )
    
# Separate the target column from the feature matrix of the second dataset.
y = df["Cancelled"]
X = df.drop(columns=["Cancelled"])

In [None]:
# (rows, columns) of the evaluation feature matrix.
X.shape

In [None]:

# Evaluate the already-fitted random forest on the full second dataset.
y_pred = rf.predict(X.values)
# ROC AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of the positive class instead of the thresholded
# labels. `classes_` is sorted, so column 1 is the larger label (True).
y_score = rf.predict_proba(X.values)[:, 1]

# scikit-learn metrics take (y_true, y_pred) in that order.
print("Accuracy:", accuracy_score(y, y_pred))
print("F1 score: %.3f" % f1_score(y, y_pred, average="weighted"))
print("Recall: %.3f" % recall_score(y, y_pred, average="weighted"))
print("Precision: %.3f" % precision_score(y, y_pred, average="weighted"))
print("AUC Score: %.3f" % roc_auc_score(y, y_score, average="weighted"))

print("Classification report:\n", classification_report(y, y_pred))

# Confusion matrix shown as a share of all evaluated samples.
print("Confusion Matrix:")
conf_mat = confusion_matrix(y, y_pred)
sn.heatmap(conf_mat/np.sum(conf_mat), annot=True,  fmt='.2%')