# **Logistic Regression for Flight Delay Prediction**

## **Setup and Preprocessing**

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the data

In [2]:
# Relative path, resolved against the notebook's current working directory.
data_path = 'flight_data_new.csv'
# Read the whole CSV into a DataFrame; this cell does not handle a missing file.
data = pd.read_csv(data_path)

# Define the features and target variable

In [3]:
# Predictor columns fed to the classifier, in the order the model will see them.
# NOTE(review): DEP_DELAY is used as a predictor; if FLIGHT_STATUS is derived
# from the departure delay this leaks the target — confirm against the data prep.
feature_columns = [
    'DEP_DELAY',
    'TAXI_OUT',
    'TAXI_IN',
    'ACTUAL_ELAPSED_TIME',
    'DISTANCE',
    'WHEELS_OFF_elapse',
    'WHEELS_ON_elapse',
    'OP_CARRIER_Delta_Airlines',
    'MONTH_Nov',
]
X = data[feature_columns]

# Assuming FLIGHT_STATUS is the binary target (0 or 1).
y = data['FLIGHT_STATUS']

# Split the data into training and testing sets

In [4]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Model Training and Prediction**

# Initialize and train the logistic regression model

In [None]:
# The features are passed in unscaled (minutes, miles, 0/1 flags), which makes
# the solver slow to converge; scikit-learn's default budget of 100 iterations
# commonly ends in a ConvergenceWarning and a half-fitted model. Give the
# solver enough iterations to actually reach the optimum.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set

In [6]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X_test)

## **Evaluation**

# Evaluate the model

In [7]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
# Text table of per-class metrics; kept as a string and printed in the next cell.
report = classification_report(y_test, y_pred)

# Output the results

In [None]:
# Accuracy is displayed rounded to two decimal places.
print(f"Accuracy: {accuracy:.2f}")
# The report string already contains its own line breaks.
print("\nClassification Report:\n")
print(report)

## **Delay Probabilities**

# Calculate and display delay probabilities

In [None]:
# predict_proba yields one column per class; the second column is kept as the
# probability of a delay.
# NOTE(review): assumes the labels are 0/1 so that column index 1 is the
# "delayed" class — confirm against model.classes_.
class_probabilities = model.predict_proba(X_test)
delay_probabilities = class_probabilities[:, 1]
print("\nDelay Probabilities:\n", delay_probabilities)

In [None]:
import pandas as pd
from sortedcontainers import SortedDict

# Load the flight table; stop with a clear message if the CSV is missing.
data_path = 'flight_data_new.csv'  # CSV file path
try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"File not found: {data_path}")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions and is not meant for program code; it also reports success
    # (status 0). Raise SystemExit directly with a failing status instead.
    raise SystemExit(1) from None

# BalancedBST class definition
class BalancedBST:
    """Ordered multimap: each key maps to the list of items inserted under it.

    Storage is a ``SortedDict``, so keys are kept in sorted order and a
    contiguous key range can be walked without scanning every entry.
    """

    def __init__(self):
        # key -> list of every item inserted under that key, in insertion order
        self.tree = SortedDict()

    def insert(self, key, data):
        """Append *data* to the bucket for *key*, creating the bucket if needed."""
        self.tree.setdefault(key, []).append(data)

    def range_query(self, low, high):
        """Return all items whose key lies in ``[low, high]`` (both inclusive).

        Items come back in ascending key order; items sharing a key keep
        their insertion order.
        """
        return [
            item
            for key in self.tree.irange(low, high)
            for item in self.tree[key]
        ]

# Create the (initially empty) index; it is filled further down with one
# entry per flight row, keyed by that row's DEP_DELAY.
bst = BalancedBST()

# Menu number -> name of that airline's indicator column in the dataset
# (the column is compared against 1 when filtering flights).
airlines = {
    1: 'OP_CARRIER_American_Airlines',
    2: 'OP_CARRIER_Delta_Airlines',
    3: 'OP_CARRIER_Alaska_Airlines',
    # Was 'OP_CARRIER_Southwest _irlines': the typo produced a garbled menu
    # entry and a column name that follows no other entry's pattern.
    4: 'OP_CARRIER_Southwest_Airlines',
    5: 'OP_CARRIER_United_Airlines',
    # add more airlines as needed
}

# Insert DEP_DELAY data into the tree (using delay time as the key and data row as the value)
for _, row in data.iterrows():
    delay = row['DEP_DELAY']
    # A NaN delay has no defined ordering against other keys, so storing it
    # would corrupt the sorted index; such rows could never match a numeric
    # range query anyway, so they are left out.
    if pd.isna(delay):
        continue
    bst.insert(delay, row)

# Show the numbered airline menu, with column names turned into readable labels.
print("Available airlines and their corresponding numbers:")
for code in airlines:
    # Drop the column prefix and turn underscores into spaces for display.
    display_name = airlines[code].replace('OP_CARRIER_', '').replace('_', ' ')
    print(f"{code}: {display_name}")

# choose an airline
try:
    # Only the conversion can raise, so it is the only thing inside the try.
    airline_choice = int(input("\nEnter the number corresponding to the airline: "))
except ValueError:
    # Non-numeric entry: handled by the same "invalid" path as an unknown number.
    airline_choice = None

selected_airline = airlines.get(airline_choice)
if selected_airline is None:
    print("Invalid input! Please enter a valid airline number.")
    # `exit()` is the interactive helper from the `site` module and reports
    # success; raise SystemExit with a failing status instead.
    raise SystemExit(1)

print(f"You selected airline number {airline_choice}: {selected_airline.replace('OP_CARRIER_', '').replace('_', ' ')}")

# choose delay range
try:
    low_delay = int(input("Enter the minimum delay time (DEP_DELAY): "))
    high_delay = int(input("Enter the maximum delay time (DEP_DELAY): "))
except ValueError:
    print("Invalid input! Please enter numeric values.")
    # `exit()` is the interactive helper from the `site` module and reports
    # success; raise SystemExit with a failing status instead.
    raise SystemExit(1) from None

# An inverted range (minimum above maximum) cannot match any delay, so put
# the bounds in order rather than silently returning nothing.
if low_delay > high_delay:
    low_delay, high_delay = high_delay, low_delay

print(f"You selected a delay range between {low_delay} and {high_delay} minutes.")

# Range search and result filtering

# Columns reported for each matching flight, in display order.
summary_columns = (
    'TAXI_OUT',
    'TAXI_IN',
    'ACTUAL_ELAPSED_TIME',
    'DISTANCE',
    'WHEELS_OFF_elapse',
    'WHEELS_ON_elapse',
)

if not selected_airline:
    print("Invalid airline selection. Please try again.")
elif selected_airline not in data.columns:
    # Without this guard a missing indicator column would surface as a
    # KeyError raised from inside the filter below.
    print(f"Column '{selected_airline}' was not found in {data_path}.")
else:
    # Human-readable airline name, computed once for the messages below.
    airline_label = selected_airline.replace('OP_CARRIER_', '').replace('_', ' ')

    # search for data within the specified delay range
    results = bst.range_query(low_delay, high_delay)

    # keep only the flights flagged (== 1) for the selected airline
    filtered_results = [
        flight for flight in results if flight[selected_airline] == 1
    ]

    # print filtered data
    if filtered_results:
        print(f"\nFiltered data for {airline_label} with delay range {low_delay}-{high_delay} minutes:")
        # Show at most the first five matches; the range query returns rows
        # in ascending delay order.
        for flight in filtered_results[:5]:
            print({column: flight[column] for column in summary_columns})
    else:
        print("No data found for the selected criteria.")
