# **Advanced Flight Data Management Using Algorithms and Data Structures**

## **Setup and Preprocessing**

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the data

In [2]:
data_path = 'flight_data_new.csv'
data = pd.read_csv(data_path)

# Define the features and target variable

In [3]:
X = data[['DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'WHEELS_OFF_elapse', 'WHEELS_ON_elapse', 'OP_CARRIER_Delta_Airlines', 'MONTH_Nov']]
y = data['FLIGHT_STATUS']  # Assuming FLIGHT_STATUS is the target (0 or 1)

# Split the data into training and testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## **Logistic Regression for Delay Prediction**

# Initialize and train the logistic regression model

In [5]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Make predictions on the test set

In [6]:
y_pred = model.predict(X_test)

# Evaluate the model

In [7]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the results

In [8]:
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n")
print(report)


Accuracy: 0.96

Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     65443
           1       0.91      0.83      0.87     11520

    accuracy                           0.96     76963
   macro avg       0.94      0.91      0.92     76963
weighted avg       0.96      0.96      0.96     76963



# Calculate and display delay probabilities

In [9]:
delay_probabilities = model.predict_proba(X_test)[:, 1]
print("\nDelay Probabilities:\n", delay_probabilities)


Delay Probabilities:
 [4.49559028e-04 3.36643811e-02 1.05896478e-03 ... 9.38759141e-01
 3.21901044e-02 6.44650099e-04]


## **Advanced Data Structures and Algorithms**

### **1. Trees**

# Example: Binary Search Tree for managing delay probabilities

In [10]:
class TreeNode:
    def __init__(self, key):
        self.left = None
        self.right = None
        self.val = key

class BinarySearchTree:
    def __init__(self):
        self.root = None

    def insert(self, root, key):
        if root is None:
            return TreeNode(key)
        if key < root.val:
            root.left = self.insert(root.left, key)
        else:
            root.right = self.insert(root.right, key)
        return root

    def inorder(self, root):
        if root:
            self.inorder(root.left)
            print(root.val, end=" ")
            self.inorder(root.right)


# Insert delay probabilities into a BST

In [11]:
bst = BinarySearchTree()
bst.root = None
for prob in delay_probabilities:
    bst.root = bst.insert(bst.root, prob)

print("\nInorder Traversal of Delay Probabilities:")
bst.inorder(bst.root)


Inorder Traversal of Delay Probabilities:
1.2052036178648323e-07 7.109885241253075e-07 7.850906276832842e-07 9.492778886309136e-07 1.1378619830875552e-06 1.404900693322376e-06 1.5479299990333726e-06 1.7584985403484723e-06 1.9464004755197113e-06 2.1986634535188747e-06 2.2933169259302175e-06 2.449697164221779e-06 2.895354676056149e-06 2.9097149177754253e-06 2.999738253996834e-06 3.022540942933428e-06 3.069004926506353e-06 3.294498633347974e-06 3.411912111095299e-06 3.519025354076052e-06 3.608750887431268e-06 3.699491206756292e-06 3.737241740560001e-06 3.833485176854299e-06 3.884152244294437e-06 3.908071900280679e-06 4.00658042039948e-06 4.025254557906152e-06 4.030735680535746e-06 4.064987786900342e-06 4.133257156515499e-06 4.310989848356286e-06 4.328778362650392e-06 4.402006900564723e-06 4.513294283804242e-06 4.565244870530716e-06 4.902492396122e-06 4.948650973380345e-06 5.005505049858985e-06 5.130384207459947e-06 5.1741107002249216e-06 5.2647206729125655e-06 5.319469401541214e-06 5.397

### **2. Priority Queues and Heaps**

In [12]:
import heapq

# Example: Using a heap to manage the top 10 highest delay probabilities

In [13]:
delay_heap = []
for prob in delay_probabilities:
    heapq.heappush(delay_heap, prob)
    if len(delay_heap) > 10:
        heapq.heappop(delay_heap)

print("\nTop 10 Highest Delay Probabilities:")
print(sorted(delay_heap, reverse=True))


Top 10 Highest Delay Probabilities:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


### **3. Maps and Hash Tables**

# Example: Using a dictionary to count occurrences of delay probability ranges

In [14]:
delay_counts = {}
for prob in delay_probabilities:
    range_key = f"{int(prob * 10)}0-{int(prob * 10) + 1}0%"
    delay_counts[range_key] = delay_counts.get(range_key, 0) + 1

print("\nDelay Probability Ranges:")
for key, count in delay_counts.items():
    print(f"{key}: {count}")



Delay Probability Ranges:
00-10%: 61342
90-100%: 6860
30-40%: 885
10-20%: 2277
50-60%: 645
40-50%: 687
100-110%: 957
80-90%: 821
20-30%: 1239
60-70%: 602
70-80%: 648


### **4. Sorting Algorithms**

# Example: Sorting delay probabilities

In [15]:
sorted_probabilities = sorted(delay_probabilities)
print("\nSorted Delay Probabilities:")
print(sorted_probabilities[:10], "...", sorted_probabilities[-10:])


Sorted Delay Probabilities:
[1.2052036178648323e-07, 7.109885241253075e-07, 7.850906276832842e-07, 9.492778886309136e-07, 1.1378619830875552e-06, 1.404900693322376e-06, 1.5479299990333726e-06, 1.7584985403484723e-06, 1.9464004755197113e-06, 2.1986634535188747e-06] ... [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


### **5. Graph Algorithms**

# Example: Representing flight routes as a graph and finding the shortest path

In [16]:
class Graph:
    def __init__(self):
        self.graph = {}

    def add_edge(self, u, v, weight):
        if u not in self.graph:
            self.graph[u] = []
        if v not in self.graph:  # Ensure all nodes are initialized
            self.graph[v] = []
        self.graph[u].append((v, weight))

    def shortest_path(self, start):
        import heapq
        distances = {node: float('inf') for node in self.graph}
        distances[start] = 0
        pq = [(0, start)]

        while pq:
            current_distance, current_node = heapq.heappop(pq)

            if current_distance > distances[current_node]:
                continue

            for neighbor, weight in self.graph[current_node]:
                distance = current_distance + weight

                if distance < distances[neighbor]:
                    distances[neighbor] = distance
                    heapq.heappush(pq, (distance, neighbor))

        return distances

# Example graph setup
flight_graph = Graph()
flight_graph.add_edge("A", "B", 100)
flight_graph.add_edge("A", "C", 300)
flight_graph.add_edge("B", "C", 200)
flight_graph.add_edge("C", "D", 100)

print("\nShortest Paths from A:")
print(flight_graph.shortest_path("A"))



Shortest Paths from A:
{'A': 0, 'B': 100, 'C': 300, 'D': 400}
