In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# LOAD DATA
dataset = pd.read_csv("preprocessed_data_flight.csv")
df = dataset.copy()


# CREATE TARGET COLUMN

df['classification_yes'] = ((df['SUM_YR_1'] + df['SUM_YR_2']) > 
                            (df['SUM_YR_1'] + df['SUM_YR_2']).median()).astype(int)


# CONVERT DATES TO NUMERIC

date_cols = ['FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')  # convert to datetime
    df[col] = (df[col] - pd.Timestamp("1970-01-01")).dt.days  # convert to numeric days


# DROP NON-NUMERIC / ID COLUMNS

drop_cols = ['MEMBER_NO', 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY']
df = df.drop(columns=drop_cols)


# ONE-HOT ENCODE LOW-CARDINALITY CATEGORICALS

cat_cols = ['GENDER', 'FFP_TIER']
for col in cat_cols:
    df[col] = df[col].astype(str)
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


# FORCE ALL COLUMNS TO NUMERIC

df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)  # replace any remaining NaN with 0


# INDEPENDENT & DEPENDENT VARIABLES

indep_X = df.drop(columns=['classification_yes'])
dep_Y = df['classification_yes']

# =============================
# RFE FUNCTION
# =============================
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []

    log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
    RF = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    DT = DecisionTreeClassifier(criterion='gini', max_features='sqrt', random_state=0)
    svc_model = SVC(kernel='linear', random_state=0)

    rfemodellist = [log_model, svc_model, RF, DT]

    sc = StandardScaler()
    X_scaled = sc.fit_transform(indep_X)

    for model in rfemodellist:
        print("Running RFE for:", model)
        rfe = RFE(estimator=model, n_features_to_select=n)
        rfe.fit(X_scaled, dep_Y)
        rfelist.append(rfe.transform(X_scaled))

    return rfelist

# =============================
# SPLIT & SCALE FUNCTION
# =============================
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# =============================
# CONFUSION MATRIX FUNCTION
# =============================
def cm_prediction(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

# =============================
# MODEL FUNCTIONS
# =============================
def logistic(X_train, y_train, X_test, y_test):
    classifier = LogisticRegression(random_state=0, max_iter=1000)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def Navie(X_train, y_train, X_test, y_test):
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def knn(X_train, y_train, X_test, y_test):
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

# =============================
# RFE CLASSIFICATION RESULT FUNCTION
# =============================
def rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf): 
    rfedataframe = pd.DataFrame(
        index=['Logistic','SVC','Random','DecisionTree'],
        columns=['Logistic','SVMl','SVMnl','KNN','Navie','Decision','Random']
    )
    for number, idx in enumerate(rfedataframe.index):
        rfedataframe['Logistic'][idx] = acclog[number]       
        rfedataframe['SVMl'][idx] = accsvml[number]
        rfedataframe['SVMnl'][idx] = accsvmnl[number]
        rfedataframe['KNN'][idx] = accknn[number]
        rfedataframe['Navie'][idx] = accnav[number]
        rfedataframe['Decision'][idx] = accdes[number]
        rfedataframe['Random'][idx] = accrf[number]
    return rfedataframe

# =============================
# RUN RFE + MODELS
# =============================
rfelist = rfeFeature(indep_X, dep_Y, 7)

acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

for i in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)
    
    _, Accuracy, _, _, _, _ = logistic(X_train, y_train, X_test, y_test)
    acclog.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = svm_linear(X_train, y_train, X_test, y_test)
    accsvml.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = svm_NL(X_train, y_train, X_test, y_test)
    accsvmnl.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = knn(X_train, y_train, X_test, y_test)
    accknn.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = Navie(X_train, y_train, X_test, y_test)
    accnav.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = Decision(X_train, y_train, X_test, y_test)
    accdes.append(Accuracy)
    
    _, Accuracy, _, _, _, _ = random(X_train, y_train, X_test, y_test)
    accrf.append(Accuracy)

# =============================
# SHOW RESULTS
# =============================
result = rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
print(result)

Running RFE for: LogisticRegression(max_iter=1000)
Running RFE for: SVC(kernel='linear', random_state=0)
Running RFE for: RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
Running RFE for: DecisionTreeClassifier(max_features='sqrt', random_state=0)
              Logistic      SVMl     SVMnl       KNN     Navie  Decision  \
Logistic      0.994539  0.997714  0.992888  0.973265  0.904807  0.994412   
SVC           0.994666  0.997777  0.994285  0.979742  0.903347  0.993586   
Random        0.994856  0.998031  0.993777  0.979298  0.902013  0.993142   
DecisionTree  0.995428  0.997968  0.994856  0.983997  0.895536  0.993523   

                Random  
Logistic      0.990347  
SVC           0.988188  
Random        0.987744  
DecisionTree  0.986601  


In [4]:
print(result)

              Logistic      SVMl     SVMnl       KNN     Navie  Decision  \
Logistic      0.994539  0.997714  0.992888  0.973265  0.904807  0.994412   
SVC           0.994666  0.997777  0.994285  0.979742  0.903347  0.993586   
Random        0.994856  0.998031  0.993777  0.979298  0.902013  0.993142   
DecisionTree  0.995428  0.997968  0.994856  0.983997  0.895536  0.993523   

                Random  
Logistic      0.990347  
SVC           0.988188  
Random        0.987744  
DecisionTree  0.986601  


In [14]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# =============================
# PREPROCESS DATA (same as training)
# =============================
dataset = pd.read_csv("preprocessed_data_flight.csv")
df = dataset.copy()

# CREATE TARGET
df['classification_yes'] = ((df['SUM_YR_1'] + df['SUM_YR_2']) > 
                            (df['SUM_YR_1'] + df['SUM_YR_2']).median()).astype(int)

# CONVERT DATES
date_cols = ['FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = (df[col] - pd.Timestamp("1970-01-01")).dt.days

# DROP NON-NUMERIC / ID COLUMNS
drop_cols = ['MEMBER_NO', 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY']
df = df.drop(columns=drop_cols)

# ONE-HOT ENCODE
cat_cols = ['GENDER', 'FFP_TIER']
for col in cat_cols:
    df[col] = df[col].astype(str)
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# FORCE NUMERIC + fill NaN
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

# INDEPENDENT & DEPENDENT VARIABLES
X = df.drop(columns=['classification_yes'])
y = df['classification_yes']

# =============================
# SELECT TOP FEATURES USING RFE
# =============================
log_model = LogisticRegression(max_iter=1000)
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

rfe = RFE(estimator=log_model, n_features_to_select=7)
rfe.fit(X_scaled, y)

# Save selected features names
selected_features = X.columns[rfe.support_].tolist()
print("Selected Features:", selected_features)

# Keep only selected features for deployment
X_selected = X[selected_features]

# =============================
# TRAIN FINAL MODEL
# =============================
final_scaler = StandardScaler()
X_final_scaled = final_scaler.fit_transform(X_selected)

final_model = LogisticRegression(max_iter=1000)
final_model.fit(X_final_scaled, y)

# =============================
# SAVE FOR DEPLOYMENT
# =============================
with open("logistic_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(final_scaler, f)

with open("selected_features.pkl", "wb") as f:
    pickle.dump(selected_features, f)

print("Deployment files saved: logistic_model.pkl, scaler.pkl, selected_features.pkl")

Selected Features: ['FLIGHT_COUNT', 'SUM_YR_1', 'SUM_YR_2', 'SEG_KM_SUM', 'LAST_TO_END', 'avg_discount', 'Points_Sum']
Deployment files saved: logistic_model.pkl, scaler.pkl, selected_features.pkl


In [15]:
import pickle
import pandas as pd

# LOAD DEPLOYMENT FILES
with open("logistic_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("selected_features.pkl", "rb") as f:
    features = pickle.load(f)

# LOAD NEW DATA
new_data = pd.read_csv("preprocessed_data_flight.csv")

# PREPROCESS SAME WAY
date_cols = ['FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE']
for col in date_cols:
    new_data[col] = pd.to_datetime(new_data[col], errors='coerce')
    new_data[col] = (new_data[col] - pd.Timestamp("1970-01-01")).dt.days

drop_cols = ['MEMBER_NO', 'WORK_CITY', 'WORK_PROVINCE', 'WORK_COUNTRY']
new_data = new_data.drop(columns=drop_cols, errors='ignore')

cat_cols = ['GENDER', 'FFP_TIER']
for col in cat_cols:
    if col in new_data.columns:
        new_data[col] = new_data[col].astype(str)

new_data = pd.get_dummies(new_data, columns=cat_cols, drop_first=True)
new_data = new_data.apply(pd.to_numeric, errors='coerce').fillna(0)

# KEEP ONLY SELECTED FEATURES
X_new = new_data.reindex(columns=features, fill_value=0)

# SCALE AND PREDICT
X_new_scaled = scaler.transform(X_new)
predictions = model.predict(X_new_scaled)

print("Predictions:", predictions)

Predictions: [1 1 1 ... 0 0 0]
