### Import Libraries

In [14]:
# ==========================
# 1. Imports
# ==========================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# ==========================
# 2. Load dataset
# ==========================
df = pd.read_csv("preprocessed_data_flight.csv")

# ==========================
# 3. Drop columns we cannot use
# ==========================
# Drop all known date columns and ID columns
drop_cols = ['MEMBER_NO', 'FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ==========================
# 4. One-hot encode categorical columns
# ==========================
categorical_cols = ['GENDER', 'FFP_TIER', 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY']
existing_cats = [c for c in categorical_cols if c in df.columns]
df = pd.get_dummies(df, columns=existing_cats, drop_first=True)

# ==========================
# 5. Create target column
# ==========================
threshold = 100000
df['classification'] = ((df['SUM_YR_1'] + df['SUM_YR_2']) > threshold).astype(int)

# ==========================
# 6. Keep only numeric columns for X
# ==========================
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'classification']

X = df[numeric_cols]
y = df['classification']

# ==========================
# 7. Feature selection
# ==========================
def selectkbest_features(X, y, k=20):
    test = SelectKBest(score_func=chi2, k=k)
    X_new = test.fit_transform(X, y)
    return X_new

# ==========================
# 8. Split and scale
# ==========================
def split_and_scale(X, y, test_size=0.25, random_state=0):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# ==========================
# 9. Evaluation
# ==========================
def cm_prediction(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return acc, report, cm

# ==========================
# 10. Classifier functions
# ==========================
def logistic(X_train, y_train, X_test, y_test):
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def svm_linear(X_train, y_train, X_test, y_test):
    clf = SVC(kernel='linear', random_state=0)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def svm_nonlinear(X_train, y_train, X_test, y_test):
    clf = SVC(kernel='rbf', random_state=0)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def naive_bayes(X_train, y_train, X_test, y_test):
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def knn(X_train, y_train, X_test, y_test):
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def decision_tree(X_train, y_train, X_test, y_test):
    clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

def random_forest(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    clf.fit(X_train, y_train)
    acc, report, cm = cm_prediction(clf, X_test, y_test)
    return clf, acc, report, cm

# ==========================
# 11. Run pipeline
# ==========================
X_selected = selectkbest_features(X, y, k=10)
X_train, X_test, y_train, y_test = split_and_scale(X_selected, y)

models = {}
models['Logistic'] = logistic(X_train, y_train, X_test, y_test)
models['SVM_linear'] = svm_linear(X_train, y_train, X_test, y_test)
models['SVM_nonlinear'] = svm_nonlinear(X_train, y_train, X_test, y_test)
models['Naive_Bayes'] = naive_bayes(X_train, y_train, X_test, y_test)
models['KNN'] = knn(X_train, y_train, X_test, y_test)
models['Decision_Tree'] = decision_tree(X_train, y_train, X_test, y_test)
models['Random_Forest'] = random_forest(X_train, y_train, X_test, y_test)
# ==========================
# 12. Summary Table â€“ proper format
# ==========================

# Map the model keys to the column names you want
model_map = {
    'Logistic': 'Logistic',
    'SVM_linear': 'SVMl',
    'SVM_nonlinear': 'SVMnl',
    'KNN': 'KNN',
    'Naive_Bayes': 'Navie',
    'Decision_Tree': 'Decision',
    'Random_Forest': 'Random'
}

# Create the summary table in one line
summary = pd.DataFrame(
    [[models[k][1] for k in model_map.keys()]],  # take accuracy for each model
    columns=model_map.values(),
    index=['ChiSquare']
)

# Round to 2 decimals
summary = summary.round(2)

print(summary)

           Logistic  SVMl  SVMnl  KNN  Navie  Decision  Random
ChiSquare       1.0   1.0    1.0  1.0   0.97       1.0     1.0


In [18]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

# Load full dataset
df = pd.read_csv("preprocessed_data_flight.csv")

# Drop unnecessary columns
drop_cols = ['MEMBER_NO', 'FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Encode categorical columns
categorical_cols = ['GENDER', 'FFP_TIER', 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY']
existing_cats = [c for c in categorical_cols if c in df.columns]
df = pd.get_dummies(df, columns=existing_cats, drop_first=True)

# Create target column
threshold = 100000
df['classification'] = ((df['SUM_YR_1'] + df['SUM_YR_2']) > threshold).astype(int)

# Separate X and y
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'classification']

X = df[numeric_cols]
y = df['classification']

# Select top K features
k_features = 10
selector = SelectKBest(score_func=chi2, k=k_features)
selector.fit(X, y)
selected_features = X.columns[selector.get_support()].tolist()

# Keep only selected features + target
deployment_df = df[selected_features + ['classification']]

# Save CSV for deployment
deployment_df.to_csv("deployment_ready_flight.csv", index=False)
print("Clean deployment CSV saved as 'deployment_ready_flight.csv'")
print("Selected features:", selected_features)

Clean deployment CSV saved as 'deployment_ready_flight.csv'
Selected features: ['FLIGHT_COUNT', 'BP_SUM', 'SUM_YR_1', 'SUM_YR_2', 'SEG_KM_SUM', 'LAST_TO_END', 'MAX_INTERVAL', 'EXCHANGE_COUNT', 'Points_Sum', 'Point_NotFlight']
