In [None]:
import pandas as pd
import glob
import os

# Path to the current directory containing the CSV files
folder_path = '.'  # Current directory

# Files written by this cell carry this prefix. They are excluded from the
# input list so that re-running the cell does not clean its own output again
# (which would produce cleaned_cleaned_*.csv files).
cleaned_prefix = 'cleaned_'
csv_files = [
    path
    for path in sorted(glob.glob(os.path.join(folder_path, '*.csv')))
    if not os.path.basename(path).startswith(cleaned_prefix)
]

# Parameters for filtering
delay_threshold = 120  # Delay threshold in minutes

# Columns parsed to datetime before the cleaned file is written
datetime_columns = [
    'FlightDate',
    'CRSDepTime_timezone',
    'DepTime_timezone',
    'CRSArrTime_timezone',
    'ArrTime_timezone',
]

# Delay-cause columns that are only checked when the file provides them
optional_delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay']


def remove_delay_outliers(df, threshold):
    """Return a new frame without the rows whose delay exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        One file of flight records. Must contain the columns listed in
        ``datetime_columns`` plus ``DepDelay`` and ``ArrDelay``. Files with
        2022 flights must also contain ``Marketing_Airline_Network``,
        ``Origin`` and ``Dest``.
    threshold : int or float
        Delay limit in minutes; rows strictly above it are removed.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with ``datetime_columns`` parsed to datetime
        (unparseable values become NaT). ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # assign() builds a new frame, so the caller's frame keeps its raw values.
    parsed = df.assign(**{
        column: pd.to_datetime(df[column], errors='coerce')
        for column in datetime_columns
    })

    # Departure or arrival delay above the threshold.
    late = (parsed['DepDelay'] > threshold) | (parsed['ArrDelay'] > threshold)

    # Rows to drop: late flights plus any large delay-cause value.
    drop_mask = late
    for column in optional_delay_columns:
        if column in parsed.columns:
            drop_mask = drop_mask | (parsed[column] > threshold)

    # 2022 Seattle-specific Alaska Airlines issue.
    # The year is tested per row instead of on the first row only, so an empty
    # file or an unparseable first date neither crashes nor skips this step.
    is_2022 = parsed['FlightDate'].dt.year == 2022
    if is_2022.any():
        alaska = parsed['Marketing_Airline_Network'] == 'AS'
        # The operating carrier is matched on its IATA code. The previous
        # comparison of DOT_ID_Operating_Airline with 'AS' could not match if
        # that column holds numeric DOT ids -- NOTE(review): confirm schema.
        if 'IATA_Code_Operating_Airline' in parsed.columns:
            alaska = alaska | (parsed['IATA_Code_Operating_Airline'] == 'AS')
        seattle = (parsed['Origin'] == 'SEA') | (parsed['Dest'] == 'SEA')
        # NOTE: with a single shared threshold these rows are already covered
        # by `late`; the step is kept explicit to document the event and so it
        # keeps working if it is ever given its own threshold.
        drop_mask = drop_mask | (is_2022 & alaska & seattle & late)

    return parsed[~drop_mask]


# Process each file separately
for file in csv_files:
    df = pd.read_csv(file, encoding='ISO-8859-1')

    # Parse the date/time columns and remove the delay outliers
    df = remove_delay_outliers(df, delay_threshold)

    # Define a new filename for the cleaned data
    base_name = os.path.basename(file)
    cleaned_filename = f"{cleaned_prefix}{base_name}"
    cleaned_filepath = os.path.join(folder_path, cleaned_filename)

    # Save the cleaned dataset for this file
    df.to_csv(cleaned_filepath, index=False)
    print(f"Cleaned data saved as {cleaned_filename}")


In [None]:
import pandas as pd
import glob
import os

folder_path = '.'

# Files written by this cell carry this prefix. They are excluded from the
# input list so that re-running the cell does not process its own output.
modified_prefix = 'modified_'
csv_files = [
    path
    for path in sorted(glob.glob(os.path.join(folder_path, '*.csv')))
    if not os.path.basename(path).startswith(modified_prefix)
]

# Source columns combined into the flight identifier
carrier_column = 'IATA_Code_Operating_Airline'
flight_number_column = 'Flight_Number_Operating_Airline'


def add_operating_flight_id(df, position=5):
    """Return a copy of ``df`` with an ``Operating_flight_id`` column.

    The identifier is the operating carrier code followed by the operating
    flight number, both converted to strings. The flight-number column is
    removed; the carrier column is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``IATA_Code_Operating_Airline`` and
        ``Flight_Number_Operating_Airline``.
    position : int, default 5
        Zero-based column position for the new column. It is clamped to the
        number of remaining columns so that narrow frames do not raise.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If either source column is missing.
    """
    flight_id = df[carrier_column].astype(str) + df[flight_number_column].astype(str)

    # drop() returns a new frame, so insert() below never touches the input.
    result = df.drop(columns=[flight_number_column])

    # Create the column directly at its final position under its final name.
    # The earlier version created 'flight_id' and then popped
    # 'Operating_flight_id', which raised KeyError on every file.
    result.insert(min(position, result.shape[1]), 'Operating_flight_id', flight_id)
    return result


# Process each file separately
for file in csv_files:
    df = pd.read_csv(file, encoding='ISO-8859-1')

    # Check for the columns that are actually read before building the id
    if carrier_column in df.columns and flight_number_column in df.columns:
        df = add_operating_flight_id(df)

        # Define a new filename for the modified data
        base_name = os.path.basename(file)
        modified_filename = f"{modified_prefix}{base_name}"
        modified_filepath = os.path.join(folder_path, modified_filename)

        # Save the modified dataset for this file
        df.to_csv(modified_filepath, index=False)
        print(f"Modified data with flight_id saved as {modified_filename}")
    else:
        print(f"Skipping {file}: Required columns not found")


In [None]:
# Every CSV in the current directory is handled, in sorted filename order,
# so the running row labels are assigned in a stable sequence.
folder_path = '.'
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Running row label: the first row of the first file is labelled 1
start_index = 1

for file in csv_files:
    df = pd.read_csv(file, encoding='ISO-8859-1')
    row_count = len(df)

    # Label this file's rows start_index .. start_index + row_count - 1
    df.index = pd.RangeIndex(start_index, start_index + row_count)

    # Discard the 'Unnamed: 0' column when the file has one (presumably an
    # unlabelled index column left over from an earlier save).
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # The output keeps the input's name, i.e. the file is rewritten in place
    indexed_filename = os.path.basename(file)
    indexed_filepath = os.path.join(folder_path, indexed_filename)

    # Write the frame together with its new row labels
    df.to_csv(indexed_filepath, index=True)
    print(f"Data with updated index saved as {indexed_filename}")

    # The next file continues where this one stopped
    start_index += row_count
