In [1]:
#imports

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans

In [3]:
# Reads flight data CSV files, calculates changes and acceleration values, labels flight phases (taxi, climb, cruise, descent), runs KMeans clustering, and saves the processed files to a new folder.


In [None]:
# Title: Chunked Phase Labeling from Combined CSV

# Description:
# Reads a large flight dataset in 1M-row chunks,
# applies altitude-based phase classification rules (taxi, climb, cruise, descent),
# and saves each processed chunk to a separate CSV for modular analysis.

In [None]:

chunk_size = 1_000_000
input_file = 'output/combined_all.csv'
output_folder = 'output/processed_chunks'
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): a later cell in this notebook deletes and rewrites
# 'output/combined_all.csv', i.e. this cell's input. Confirm that is intended
# before re-running the notebook top to bottom.

# Phase-rule thresholds (same values the rules used as inline literals).
# Units follow the ALT column -- presumably feet; TODO confirm.
SMOOTH_WINDOW = 5        # width (in rows) of the centered rolling mean
LOW_ALT_LIMIT = 1000     # steady rows below this smoothed ALT -> 'taxi'
HIGH_ALT_LIMIT = 10000   # steady rows at/above this smoothed ALT -> 'cruise'
RATE_LIMIT = 10          # |smoothed ALT change per row| below this is "steady"


def compute_phase_features(alt):
    """Derive the smoothing columns and the phase label from an ALT series.

    Parameters
    ----------
    alt : pd.Series
        The ALT column, in file order.

    Returns
    -------
    pd.DataFrame
        Same index as `alt`, with columns 'ALT_diff', 'ALT_diff_smooth',
        'ALT_smooth' and 'phase_refined' (in that order).
    """
    feats = pd.DataFrame(index=alt.index)
    feats['ALT_diff'] = alt.diff()
    feats['ALT_diff_smooth'] = (
        feats['ALT_diff'].rolling(window=SMOOTH_WINDOW, center=True, min_periods=1).mean()
    )
    feats['ALT_smooth'] = alt.rolling(window=SMOOTH_WINDOW, center=True, min_periods=1).mean()

    level = feats['ALT_smooth']
    rate = feats['ALT_diff_smooth']
    steady = rate.abs() < RATE_LIMIT

    # Rows matching no rule keep 'unknown' (e.g. climbing at/above
    # HIGH_ALT_LIMIT, or a rate of exactly +/-RATE_LIMIT).
    feats['phase_refined'] = 'unknown'
    feats.loc[(level < LOW_ALT_LIMIT) & steady, 'phase_refined'] = 'taxi'
    feats.loc[
        (level >= LOW_ALT_LIMIT) & (level < HIGH_ALT_LIMIT) & (rate > RATE_LIMIT),
        'phase_refined',
    ] = 'climb'
    feats.loc[(level >= HIGH_ALT_LIMIT) & steady, 'phase_refined'] = 'cruise'
    # Descent is assigned last and has no altitude condition.
    feats.loc[rate < -RATE_LIMIT, 'phase_refined'] = 'descent'
    return feats


# Pass 1: read only the ALT column and compute the derived columns over the
# whole file. Computing diff()/rolling() per chunk would restart them at every
# chunk edge (NaN ALT_diff on each chunk's first row, truncated smoothing
# windows on the rows next to the edge).
# NOTE(review): ALT is still treated as one continuous series; if the file
# holds several flights back to back, values at flight boundaries are still
# mixed -- a per-flight groupby would be needed to fix that.
alt_all = pd.read_csv(input_file, usecols=['ALT'])['ALT']
phase_features = compute_phase_features(alt_all)
del alt_all

# Pass 2: stream the full rows chunk by chunk, attach the precomputed columns
# by row position, and write each chunk to its own CSV.
chunk_number = 0
row_offset = 0

for chunk in pd.read_csv(input_file, chunksize=chunk_size):
    chunk_features = phase_features.iloc[row_offset:row_offset + len(chunk)]
    for column in phase_features.columns:
        # to_numpy() makes the assignment positional, independent of the index.
        chunk[column] = chunk_features[column].to_numpy()

    # Save chunk to disk
    output_file = os.path.join(output_folder, f'processed_chunk_{chunk_number}.csv')
    chunk.to_csv(output_file, index=False)
    print(f"✅ Saved {output_file} with shape {chunk.shape}")

    row_offset += len(chunk)
    chunk_number += 1


✅ Saved output/processed_chunks/processed_chunk_0.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_1.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_2.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_3.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_4.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_5.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_6.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_7.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_8.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_9.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_10.csv with shape (1000000, 196)
✅ Saved output/processed_chunks/processed_chunk_11.csv with shape (949455, 196)


In [None]:
# Title: Combine Processed CSV Chunks

# Description:
# Merges all CSV files from 'processed_chunks' into a single file,
# writing the header only once and appending subsequent data without headers.


In [None]:

processed_folder = 'output/processed_chunks'
output_file = 'output/combined_all.csv'

# NOTE(review): this path is also the input_file read by the chunked
# phase-labeling cell, so the removal below deletes that cell's input.
# Confirm this is intended.
# Remove output file if it exists (optional)
if os.path.exists(output_file):
    os.remove(output_file)

# Filter to CSV files BEFORE enumerating, so index 0 is always the first file
# actually written (otherwise a non-CSV first entry would skip the header).
# os.listdir returns names in arbitrary order; sorting by (length, name) puts
# names that differ only in an unpadded trailing number in numeric order
# (..._2.csv before ..._10.csv).
csv_files = sorted(
    (name for name in os.listdir(processed_folder) if name.endswith('.csv')),
    key=lambda name: (len(name), name),
)

for i, file in enumerate(csv_files):
    file_path = os.path.join(processed_folder, file)
    df = pd.read_csv(file_path)

    # Write the first file with header, others append without header
    if i == 0:
        df.to_csv(output_file, index=False)
    else:
        df.to_csv(output_file, mode='a', header=False, index=False)

if csv_files:
    print("✅ Combined file saved to disk safely.")
else:
    print(f"⚠️ No CSV files found in {processed_folder}; nothing was written.")


OSError: [Errno 28] No space left on device

In [None]:
# Title: Load and Combine Processed CSV Chunks into DataFrame

# Description:
# Reads all CSV files from 'processed_chunks' folder,
# concatenates them into a single in-memory DataFrame for analysis.

In [None]:

processed_folder = 'output/processed_chunks'
combined_chunks = []

# os.listdir returns names in arbitrary order, which would make the row order
# of the combined frame non-deterministic. Sorting by (length, name) puts
# names that differ only in an unpadded trailing number in numeric order
# (..._2.csv before ..._10.csv).
csv_files = sorted(
    (name for name in os.listdir(processed_folder) if name.endswith('.csv')),
    key=lambda name: (len(name), name),
)

for file in csv_files:
    file_path = os.path.join(processed_folder, file)
    df = pd.read_csv(file_path)
    combined_chunks.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
# pd.concat raises ValueError if no CSV files were found.
combined_df = pd.concat(combined_chunks, ignore_index=True)
print(f"✅ Combined DataFrame shape: {combined_df.shape}")


: 

In [None]:
# Count how many rows were assigned to each refined flight phase.
phase_counts = combined_df['phase_refined'].value_counts()
print(phase_counts)

