# **Logistic Regression for Flight Delay Prediction**

## **Setup and Preprocessing**

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the data

In [2]:
data_path = 'flight_data_new.csv'
data = pd.read_csv(data_path)

# Define the features and target variable

In [3]:
X = data[['DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'WHEELS_OFF_elapse', 'WHEELS_ON_elapse', 'OP_CARRIER_Delta_Airlines', 'MONTH_Nov']]
y = data['FLIGHT_STATUS']  # Assuming FLIGHT_STATUS is the target (0 or 1)

# Split the data into training and testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Model Training and Prediction**

# Initialize and train the logistic regression model

In [5]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Make predictions on the test set

In [6]:
y_pred = model.predict(X_test)

## **Evaluation**

# Evaluate the model

In [7]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the results

In [8]:
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n")
print(report)

Accuracy: 0.96

Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     65443
           1       0.91      0.83      0.87     11520

    accuracy                           0.96     76963
   macro avg       0.94      0.91      0.92     76963
weighted avg       0.96      0.96      0.96     76963



## **Delay Probabilities**

# Calculate and display delay probabilities

In [9]:
delay_probabilities = model.predict_proba(X_test)[:, 1]
print("\nDelay Probabilities:\n", delay_probabilities)


Delay Probabilities:
 [4.49559028e-04 3.36643811e-02 1.05896478e-03 ... 9.38759141e-01
 3.21901044e-02 6.44650099e-04]


In [None]:
import pandas as pd
from sortedcontainers import SortedDict

# 데이터 로드
data_path = 'flight_data_new.csv'  # CSV 파일 경로
try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"File not found: {data_path}")
    exit()

# BalancedBST 클래스 정의
class BalancedBST:
    def __init__(self):
        self.tree = SortedDict()

    def insert(self, key, data):
        if key in self.tree:
            self.tree[key].append(data)
        else:
            self.tree[key] = [data]

    def range_query(self, low, high):
        results = []
        for key in self.tree.irange(low, high):
            results.extend(self.tree[key])
        return results

# BalancedBST 객체 생성
bst = BalancedBST()

# 항공사 코드와 이름 매핑
airlines = {
    1: 'OP_CARRIER_American_Airlines',
    2: 'OP_CARRIER_Delta_Airlines',
    3: 'OP_CARRIER_Alaska_Airlines',
    4: 'OP_CARRIER_Southwest _irlines',
    5: 'OP_CARRIER_United_Airlines',
    # 필요한 경우 추가 항공사 추가
}

# DEP_DELAY 데이터를 트리에 삽입 (지연 시간을 키로, 데이터 행을 값으로 삽입)
for _, row in data.iterrows():
    bst.insert(row['DEP_DELAY'], row)

# 각 항공사와 번호 출력
print("Available airlines and their corresponding numbers:")
for code, name in airlines.items():
    print(f"{code}: {name.replace('OP_CARRIER_', '').replace('_', ' ')}")

# 항공사 선택
try:
    airline_choice = int(input("\nEnter the number corresponding to the airline: "))
    selected_airline = airlines.get(airline_choice)
    if not selected_airline:
        raise KeyError
    print(f"You selected airline number {airline_choice}: {selected_airline.replace('OP_CARRIER_', '').replace('_', ' ')}")
except (ValueError, KeyError):
    print("Invalid input! Please enter a valid airline number.")
    exit()

# 딜레이 시간 범위 선택
try:
    low_delay = int(input("Enter the minimum delay time (DEP_DELAY): "))
    high_delay = int(input("Enter the maximum delay time (DEP_DELAY): "))
    print(f"You selected a delay range between {low_delay} and {high_delay} minutes.")
except ValueError:
    print("Invalid input! Please enter numeric values.")
    exit()

# 범위 검색 및 결과 필터링
if selected_airline:
    # 범위 검색
    results = bst.range_query(low_delay, high_delay)

    # 선택한 항공사에 해당하는 데이터 필터링
    filtered_results = [
        result for result in results if result[selected_airline] == 1
    ]

    # 결과 출력
    if filtered_results:
        print(f"\nFiltered data for {selected_airline.replace('OP_CARRIER_', '').replace('_', ' ')} with delay range {low_delay}-{high_delay} minutes:")
        for flight in filtered_results[:5]:  # 상위 5개 출력
            print({
                'TAXI_OUT': flight['TAXI_OUT'],
                'TAXI_IN': flight['TAXI_IN'],
                'ACTUAL_ELAPSED_TIME': flight['ACTUAL_ELAPSED_TIME'],
                'DISTANCE': flight['DISTANCE'],
                'WHEELS_OFF_elapse': flight['WHEELS_OFF_elapse'],
                'WHEELS_ON_elapse': flight['WHEELS_ON_elapse']
            })
    else:
        print("No data found for the selected criteria.")
else:
    print("Invalid airline selection. Please try again.")


Available airlines and their corresponding numbers:
1: American Airlines
2: Delta Airlines
3: Alaska Airlines
4: Southwest  irlines
5: United Airlines
Invalid input! Please enter a valid airline number.
Invalid input! Please enter numeric values.


NameError: name 'selected_airline' is not defined

: 