In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

np.random.seed(42)

# -----------------------------
# 1. ROUTES TABLE
# -----------------------------
routes = [
    ("DOH", "LHR", "Europe", 1),
    ("DOH", "JFK", "North America", 1),
    ("DOH", "NBO", "Africa", 1),
    ("DOH", "DEL", "Asia", 1),
    ("DOH", "BKK", "Asia", 1),
    ("DOH", "FRA", "Europe", 1),
    ("DOH", "DXB", "Middle East", 0),
    ("DOH", "CAI", "Africa", 1),
]

routes_df = pd.DataFrame(routes, columns=[
    "origin", "destination", "region", "hub_flag"
])
routes_df["route_id"] = range(1, len(routes_df) + 1)

# -----------------------------
# 2. FLIGHTS TABLE
# -----------------------------
aircrafts = {
    "A320": 150,
    "A330": 250,
    "B777": 300,
    "A350": 320
}

flight_records = []
flight_id = 1

start_date = datetime(2025, 1, 1)

for _, route in routes_df.iterrows():
    for i in range(180):  # 6 months of daily flights
        aircraft = random.choice(list(aircrafts.keys()))
        flight_records.append([
            flight_id,
            route["route_id"],
            start_date + timedelta(days=i),
            aircraft,
            aircrafts[aircraft],
            random.randint(3000, 12000)
        ])
        flight_id += 1

flights_df = pd.DataFrame(flight_records, columns=[
    "flight_id", "route_id", "flight_date",
    "aircraft_type", "capacity", "distance_km"
])

# -----------------------------
# 3. BOOKINGS TABLE
# -----------------------------
fare_classes = [
    ("F", "First", 5000),
    ("J", "Business", 3200),
    ("Y", "Economy", 900),
    ("M", "Economy", 600),
    ("L", "Economy", 350)
]

channels = ["Direct", "OTA", "GDS", "Corporate"]
pos_countries = ["Qatar", "UK", "USA", "Kenya", "India", "Thailand", "Germany"]

booking_records = []
booking_id = 1

for _, flight in flights_df.iterrows():
    num_bookings = random.randint(60, flight["capacity"])
    for _ in range(num_bookings):
        fare = random.choice(fare_classes)
        days_before = random.randint(1, 180)
        booking_records.append([
            booking_id,
            flight["flight_id"],
            flight["flight_date"] - timedelta(days=days_before),
            flight["flight_date"],
            fare[0],
            fare[1],
            round(fare[2] * random.uniform(0.85, 1.15), 2),
            random.randint(1, 3),
            random.choice(pos_countries),
            random.choice(channels)
        ])
        booking_id += 1

bookings_df = pd.DataFrame(booking_records, columns=[
    "booking_id", "flight_id", "booking_date",
    "travel_date", "fare_class", "cabin",
    "price", "passenger_count",
    "pos_country", "sales_channel"
])

# -----------------------------
# 4. REVENUE MANAGEMENT ACTIONS
# -----------------------------
actions = ["Price Increase", "Price Discount", "Capacity Increase", "Sales Campaign"]

rm_records = []

for route_id in routes_df["route_id"]:
    for _ in range(3):
        start = start_date + timedelta(days=random.randint(0, 120))
        rm_records.append([
            route_id,
            random.choice(actions),
            start,
            start + timedelta(days=random.randint(15, 45))
        ])

rm_df = pd.DataFrame(rm_records, columns=[
    "route_id", "action_type", "start_date", "end_date"
])
rm_df["action_id"] = range(1, len(rm_df) + 1)

# -----------------------------
# 5. COMPETITOR TABLE
# -----------------------------
competitors = ["Emirates", "Turkish Airlines", "Ethiopian Airlines"]

comp_records = []

for route_id in routes_df["route_id"]:
    for comp in competitors:
        comp_records.append([
            route_id,
            comp,
            round(random.uniform(400, 1200), 2),
            round(random.uniform(0.1, 0.5), 2)
        ])

competition_df = pd.DataFrame(comp_records, columns=[
    "route_id", "competitor",
    "avg_competitor_fare", "market_share"
])

# -----------------------------
# SAVE TO CSV
# -----------------------------
routes_df.to_csv("routes.csv", index=False)
flights_df.to_csv("flights.csv", index=False)
bookings_df.to_csv("bookings.csv", index=False)
rm_df.to_csv("revenue_management_actions.csv", index=False)
competition_df.to_csv("competition.csv", index=False)

print("‚úÖ Airline revenue datasets generated successfully!")


‚úÖ Airline revenue datasets generated successfully!


In [3]:
# -----------------------------
# 3. BOOKINGS TABLE (Adjusted to avoid SmallInt Overflow)
# -----------------------------
fare_classes = [
    ("F", "First", 5000),
    ("J", "Business", 3200),
    ("Y", "Economy", 900),
    ("M", "Economy", 600),
    ("L", "Economy", 350)
]

channels = ["Direct", "OTA", "GDS", "Corporate"]
pos_countries = ["Qatar", "UK", "USA", "Kenya", "India", "Thailand", "Germany"]

booking_records = []
booking_id = 1
max_smallint = 32000 # Staying safely under the 32,767 limit

# We iterate through flights but break if we reach the SQL SmallInt limit
for _, flight in flights_df.iterrows():
    if booking_id > max_smallint:
        break
        
    # Reduced number of bookings per flight to spread data across more flights
    # while staying under the ID limit
    num_bookings = random.randint(10, 25) 
    
    for _ in range(num_bookings):
        if booking_id > max_smallint:
            break
            
        fare = random.choice(fare_classes)
        days_before = random.randint(1, 180)
        booking_records.append([
            booking_id,
            flight["flight_id"],
            flight["flight_date"] - timedelta(days=days_before),
            flight["flight_date"],
            fare[0],
            fare[1],
            round(fare[2] * random.uniform(0.85, 1.15), 2),
            random.randint(1, 3),
            random.choice(pos_countries),
            random.choice(channels)
        ])
        booking_id += 1

bookings_df = pd.DataFrame(booking_records, columns=[
    "booking_id", "flight_id", "booking_date",
    "travel_date", "fare_class", "cabin",
    "price", "passenger_count",
    "pos_country", "sales_channel"
])

In [4]:
# Distribute bookings more evenly across all flights
for flight_idx, flight in flights_df.iterrows():
    if booking_id > max_smallint:
        break
    
    # Calculate proportional bookings per flight
    remaining_flights = len(flights_df) - flight_idx
    remaining_bookings = max_smallint - booking_id
    if remaining_flights > 0:
        avg_per_flight = remaining_bookings // remaining_flights
        num_bookings = random.randint(max(1, avg_per_flight-5), avg_per_flight+5)
    else:
        num_bookings = 0

In [5]:
import sys

booking_records = []
booking_id = 1
max_smallint = 32000
total_generated = 0

print("Generating bookings...")
print("Progress: ", end="")

for flight_idx, flight in flights_df.iterrows():
    if booking_id > max_smallint:
        break
        
    num_bookings = random.randint(10, 25)
    
    for _ in range(num_bookings):
        if booking_id > max_smallint:
            break
            
        # ... [booking generation code] ...
        
        booking_id += 1
        total_generated += 1
        
        # Simple progress indicator
        if total_generated % 500 == 0:
            print("‚ñà", end="", flush=True)

print(f"\nDone! Generated {total_generated:,} bookings.")

Generating bookings...
‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
Done! Generated 24,977 bookings.


In [6]:
# Simple example to run the bookings generator
def main():
    # Create sample flights data (replace with your actual data)
    flight_dates = [datetime(2024, 1, 1) + timedelta(days=i) for i in range(100)]
    flights_data = []
    for i, date in enumerate(flight_dates):
        flights_data.append({
            "flight_id": i+1,
            "flight_date": date,
            "flight_number": f"QR{i+1:03d}",
            "origin": "DOH",
            "destination": random.choice(["LHR", "JFK", "BKK", "DEL", "NBO"])
        })
    
    flights_df = pd.DataFrame(flights_data)
    
    # Generate bookings and save to CSV
    bookings_df = generate_bookings_data(
        flights_df=flights_df,
        max_bookings=5000,  # Smaller for testing
        output_csv='airline_bookings.csv'
    )
    
    # Optional: Save in additional formats
    save_bookings_to_file(bookings_df, 'bookings_backup', 'excel')
    
    print("\nüéâ All done! Files are ready for use.")

if __name__ == "__main__":
    from datetime import datetime
    main()

NameError: name 'generate_bookings_data' is not defined

In [7]:
# ============================================================================
# AIRLINE BOOKINGS DATA GENERATOR
# ============================================================================
# This script generates realistic airline booking data with:
# - Flight information
# - Passenger bookings
# - Progress tracking
# - CSV export functionality
# ============================================================================

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from tqdm import tqdm
import os
import sys

# ============================================================================
# 1. FLIGHTS DATA GENERATION
# ============================================================================
def generate_flights_data(num_flights=500, start_date="2024-01-01", end_date="2024-12-31"):
    """
    Generate sample flights data
    
    Parameters:
    -----------
    num_flights : int
        Number of flights to generate
    start_date : str
        Start date for flights (YYYY-MM-DD)
    end_date : str
        End date for flights (YYYY-MM-DD)
    """
    
    print("=" * 70)
    print("‚úàÔ∏è  GENERATING FLIGHTS DATA")
    print("=" * 70)
    
    # Airport codes and routes
    airports = {
        "DOH": {"name": "Hamad International Airport", "city": "Doha", "country": "Qatar"},
        "LHR": {"name": "Heathrow Airport", "city": "London", "country": "UK"},
        "JFK": {"name": "John F. Kennedy International", "city": "New York", "country": "USA"},
        "BKK": {"name": "Suvarnabhumi Airport", "city": "Bangkok", "country": "Thailand"},
        "DEL": {"name": "Indira Gandhi International", "city": "Delhi", "country": "India"},
        "NBO": {"name": "Jomo Kenyatta International", "city": "Nairobi", "country": "Kenya"},
        "FRA": {"name": "Frankfurt Airport", "city": "Frankfurt", "country": "Germany"},
        "DXB": {"name": "Dubai International", "city": "Dubai", "country": "UAE"},
        "SIN": {"name": "Changi Airport", "city": "Singapore", "country": "Singapore"},
        "SYD": {"name": "Sydney Airport", "city": "Sydney", "country": "Australia"}
    }
    
    # Popular routes from Doha
    routes = [
        ("DOH", "LHR"), ("DOH", "JFK"), ("DOH", "BKK"), ("DOH", "DEL"),
        ("DOH", "NBO"), ("DOH", "FRA"), ("DOH", "DXB"), ("DOH", "SIN"),
        ("LHR", "JFK"), ("BKK", "SIN"), ("FRA", "JFK"), ("DEL", "BKK")
    ]
    
    # Aircraft types with capacities
    aircraft = [
        ("B77W", "Boeing 777-300ER", 370),
        ("A359", "Airbus A350-900", 325),
        ("B789", "Boeing 787-9", 290),
        ("A333", "Airbus A330-300", 275),
        ("B738", "Boeing 737-800", 180)
    ]
    
    # Generate flight dates between start and end date
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    date_range = (end - start).days
    
    flights_data = []
    
    print(f"üìä Generating {num_flights:,} flights...")
    pbar = tqdm(total=num_flights, desc="Creating flights", unit="flights")
    
    for i in range(1, num_flights + 1):
        # Random flight date
        flight_date = start + timedelta(days=random.randint(0, date_range))
        
        # Random route
        origin, destination = random.choice(routes)
        
        # Random aircraft
        aircraft_code, aircraft_name, capacity = random.choice(aircraft)
        
        # Flight number (QR = Qatar Airways code)
        flight_number = f"QR{random.randint(100, 999)}"
        
        # Generate random departure time (between 00:00 and 23:30)
        departure_hour = random.randint(0, 23)
        departure_minute = random.choice([0, 15, 30, 45])
        departure_time = flight_date.replace(hour=departure_hour, minute=departure_minute)
        
        # Flight duration based on route (in hours)
        route_key = f"{origin}-{destination}"
        route_durations = {
            "DOH-LHR": 7, "DOH-JFK": 12, "DOH-BKK": 7, "DOH-DEL": 3,
            "DOH-NBO": 5, "DOH-FRA": 6, "DOH-DXB": 1, "DOH-SIN": 8,
            "LHR-JFK": 8, "BKK-SIN": 2, "FRA-JFK": 9, "DEL-BKK": 4
        }
        duration_hours = route_durations.get(route_key, random.randint(2, 12))
        
        # Arrival time
        arrival_time = departure_time + timedelta(hours=duration_hours)
        
        # Create flight record
        flights_data.append({
            "flight_id": i,
            "flight_number": flight_number,
            "flight_date": flight_date.date(),
            "departure_time": departure_time,
            "arrival_time": arrival_time,
            "origin": origin,
            "destination": destination,
            "aircraft_code": aircraft_code,
            "aircraft_name": aircraft_name,
            "capacity": capacity,
            "duration_hours": duration_hours,
            "airline": "Qatar Airways"
        })
        
        pbar.update(1)
    
    pbar.close()
    
    # Create DataFrame
    flights_df = pd.DataFrame(flights_data)
    
    # Convert datetime columns
    flights_df['flight_date'] = pd.to_datetime(flights_df['flight_date'])
    flights_df['departure_time'] = pd.to_datetime(flights_df['departure_time'])
    flights_df['arrival_time'] = pd.to_datetime(flights_df['arrival_time'])
    
    print(f"\n‚úÖ Generated {len(flights_df):,} flights")
    print(f"   Date range: {flights_df['flight_date'].min().date()} to {flights_df['flight_date'].max().date()}")
    print(f"   Routes: {flights_df['origin'].nunique()} origins ‚Üí {flights_df['destination'].nunique()} destinations")
    
    return flights_df

# ============================================================================
# 2. BOOKINGS DATA GENERATION
# ============================================================================
def generate_bookings_data(flights_df, max_bookings=32000, output_csv='bookings.csv'):
    """
    Generate realistic bookings data and save to CSV file
    
    Parameters:
    -----------
    flights_df : pandas DataFrame
        DataFrame containing flight information
    max_bookings : int
        Maximum number of bookings to generate (default: 32,000)
    output_csv : str
        Output CSV file name/path
    """
    
    print("\n" + "=" * 70)
    print("üìñ GENERATING BOOKINGS DATA")
    print("=" * 70)
    print(f"üöÄ Starting bookings data generation...")
    print(f"   Max bookings to generate: {max_bookings:,}")
    print(f"   Output file: {output_csv}")
    print("-" * 70)
    
    # Define fare classes with code, name, and base price
    fare_classes = [
        ("F", "First", 5000),
        ("J", "Business", 3200),
        ("W", "Premium Economy", 1500),
        ("Y", "Economy", 900),
        ("M", "Economy", 600),
        ("L", "Economy", 350)
    ]
    
    # Sales channels and point-of-sale countries
    channels = ["Direct", "OTA", "GDS", "Corporate", "Travel Agent"]
    pos_countries = ["Qatar", "UK", "USA", "Kenya", "India", "Thailand", 
                     "Germany", "UAE", "Australia", "Singapore", "France", "Japan"]
    
    # Passenger names for realistic data
    first_names = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer", 
                   "Michael", "Linda", "William", "Elizabeth", "David", "Susan",
                   "Richard", "Jessica", "Joseph", "Sarah", "Thomas", "Karen",
                   "Charles", "Nancy", "Ahmed", "Fatima", "Mohammed", "Aisha",
                   "Ali", "Zainab", "Hassan", "Mariam", "Omar", "Layla"]
    
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia",
                  "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez",
                  "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore",
                  "Jackson", "Martin", "Al-Sayed", "Al-Khalifa", "Al-Thani", "Khan",
                  "Patel", "Singh", "Chen", "Wang", "Tanaka", "Kim"]
    
    # Initialize storage
    booking_records = []
    passenger_records = []
    booking_id = 1
    passenger_id = 1
    
    # Calculate statistics for progress bar
    total_flights = len(flights_df)
    total_potential_bookings = min(max_bookings, total_flights * 30)
    
    print(f"üìä Processing {total_flights:,} flights...")
    print(f"üìà Estimated bookings: {total_potential_bookings:,}")
    
    # Create progress bar
    pbar = tqdm(total=total_potential_bookings, 
                desc="Generating bookings", 
                unit="bookings",
                bar_format='{l_bar}{bar:50}{r_bar}{bar:-50b}')
    
    # Generate bookings for each flight
    for _, flight in flights_df.iterrows():
        if booking_id > max_bookings:
            break
            
        # Random number of bookings per flight (based on capacity)
        capacity_utilization = random.uniform(0.7, 0.95)  # 70-95% full
        num_bookings = int(flight["capacity"] * capacity_utilization / random.uniform(1.5, 3.0))
        num_bookings = max(5, min(num_bookings, 50))  # Limit between 5 and 50
        
        for _ in range(num_bookings):
            if booking_id > max_bookings:
                break
                
            # Random fare class selection with weighted probabilities
            fare_weights = [0.05, 0.15, 0.10, 0.30, 0.25, 0.15]  # Higher probability for economy
            fare = random.choices(fare_classes, weights=fare_weights, k=1)[0]
            
            # Random booking date (1-180 days before flight, weighted toward closer dates)
            days_before_options = list(range(1, 181))
            # Weight: earlier bookings less likely than last-minute
            weights = [1/(i**0.7) for i in days_before_options]
            days_before = random.choices(days_before_options, weights=weights, k=1)[0]
            
            # Calculate booking date and time
            booking_date = flight["flight_date"] - timedelta(days=days_before)
            
            # Add random time to booking date
            booking_time = booking_date.replace(
                hour=random.randint(0, 23),
                minute=random.choice([0, 15, 30, 45])
            )
            
            # Price calculation with seasonal variation and booking time factor
            base_price = fare[2]
            
            # Seasonal factor (higher in summer and holidays)
            month = flight["flight_date"].month
            if month in [6, 7, 8, 12]:  # Summer and December
                season_factor = random.uniform(1.1, 1.3)
            elif month in [1, 2, 9]:  # Lower season
                season_factor = random.uniform(0.9, 1.0)
            else:
                season_factor = random.uniform(1.0, 1.1)
            
            # Booking time factor (last-minute bookings more expensive)
            if days_before <= 7:
                time_factor = random.uniform(1.2, 1.5)
            elif days_before <= 14:
                time_factor = random.uniform(1.1, 1.3)
            else:
                time_factor = random.uniform(0.85, 1.15)
            
            # Channel discount factor
            channel = random.choice(channels)
            channel_discounts = {
                "Direct": random.uniform(0.95, 1.05),  # No discount
                "OTA": random.uniform(0.85, 0.95),     # 5-15% discount
                "GDS": random.uniform(0.88, 0.98),     # 2-12% discount
                "Corporate": random.uniform(0.75, 0.85), # 15-25% discount
                "Travel Agent": random.uniform(0.90, 1.0) # 0-10% discount
            }
            channel_factor = channel_discounts[channel]
            
            # Calculate final price
            final_price = round(base_price * season_factor * time_factor * channel_factor, 2)
            
            # Random passenger count
            passenger_count = random.choices([1, 2, 3, 4], weights=[0.4, 0.35, 0.15, 0.1], k=1)[0]
            
            # POS country (biased toward origin/destination countries)
            origin_country = "Qatar" if flight["origin"] == "DOH" else "Other"
            destination_country_map = {
                "LHR": "UK", "JFK": "USA", "BKK": "Thailand", 
                "DEL": "India", "NBO": "Kenya", "FRA": "Germany"
            }
            destination_country = destination_country_map.get(flight["destination"], random.choice(pos_countries))
            
            # Weight POS country selection
            pos_country_choices = [destination_country, origin_country, random.choice(pos_countries)]
            pos_country = random.choices(pos_country_choices, weights=[0.5, 0.3, 0.2], k=1)[0]
            
            # Generate passenger records for this booking
            passengers_in_booking = []
            for pax_num in range(passenger_count):
                # Random passenger details
                gender = random.choice(["M", "F"])
                first_name = random.choice([n for n in first_names if 
                                           (gender == "M" and n not in ["Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Susan", "Jessica", "Sarah", "Karen", "Nancy", "Fatima", "Aisha", "Zainab", "Mariam", "Layla"]) or
                                           (gender == "F" and n not in ["James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph", "Thomas", "Charles", "Ahmed", "Mohammed", "Ali", "Hassan", "Omar"])])
                last_name = random.choice(last_names)
                dob = flight["flight_date"] - timedelta(days=random.randint(18*365, 70*365))
                
                # Generate random passport number
                passport = f"{random.choice(['A', 'B', 'C', 'P'])}{random.randint(100000, 999999)}"
                
                passenger_records.append({
                    "passenger_id": passenger_id,
                    "booking_id": booking_id,
                    "first_name": first_name,
                    "last_name": last_name,
                    "gender": gender,
                    "date_of_birth": dob.date(),
                    "passport_number": passport,
                    "nationality": pos_country,
                    "seat_number": f"{random.choice(['A', 'B', 'C', 'D', 'E', 'F'])}{random.randint(1, 40)}"
                })
                
                passengers_in_booking.append(f"{first_name} {last_name}")
                passenger_id += 1
            
            # Create booking record
            booking_records.append({
                "booking_id": booking_id,
                "flight_id": flight["flight_id"],
                "booking_datetime": booking_time,
                "travel_date": flight["flight_date"].date(),
                "fare_class": fare[0],
                "cabin": fare[1],
                "price": final_price,
                "passenger_count": passenger_count,
                "passenger_names": ", ".join(passengers_in_booking),
                "pos_country": pos_country,
                "sales_channel": channel,
                "payment_method": random.choice(["Credit Card", "Debit Card", "Bank Transfer", "Corporate Account"]),
                "booking_status": random.choices(["Confirmed", "Cancelled"], weights=[0.92, 0.08], k=1)[0],
                "cancellation_date": None if random.random() > 0.08 else booking_time + timedelta(days=random.randint(1, days_before-1))
            })
            
            booking_id += 1
            pbar.update(1)
            
            # Update progress bar description occasionally
            if booking_id % 1000 == 0:
                pbar.set_description(f"Creating bookings (ID: {booking_id:,})")
    
    # Close progress bar
    pbar.close()
    
    # Create DataFrames
    bookings_df = pd.DataFrame(booking_records)
    passengers_df = pd.DataFrame(passenger_records)
    
    # Convert date columns
    bookings_df['booking_datetime'] = pd.to_datetime(bookings_df['booking_datetime'])
    bookings_df['travel_date'] = pd.to_datetime(bookings_df['travel_date'])
    passengers_df['date_of_birth'] = pd.to_datetime(passengers_df['date_of_birth'])
    
    # Calculate cancellation rate
    cancelled_count = bookings_df[bookings_df['booking_status'] == 'Cancelled'].shape[0]
    cancellation_rate = (cancelled_count / len(bookings_df)) * 100 if len(bookings_df) > 0 else 0
    
    print(f"\n‚úÖ Generated {len(bookings_df):,} booking records")
    print(f"‚úÖ Generated {len(passengers_df):,} passenger records")
    print(f"üìä Cancellation rate: {cancellation_rate:.1f}% ({cancelled_count:,} cancelled bookings)")
    
    return bookings_df, passengers_df

# ============================================================================
# 3. CSV EXPORT FUNCTION
# ============================================================================
def export_to_csv(bookings_df, passengers_df=None, base_filename="airline_data", 
                  export_passengers=True, compress=False):
    """
    Export data to CSV files
    
    Parameters:
    -----------
    bookings_df : pandas DataFrame
        Bookings data
    passengers_df : pandas DataFrame, optional
        Passengers data
    base_filename : str
        Base name for output files
    export_passengers : bool
        Whether to export passengers data
    compress : bool
        Whether to compress the CSV files
    """
    
    print("\n" + "=" * 70)
    print("üíæ EXPORTING DATA TO CSV")
    print("=" * 70)
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Export bookings data
    bookings_filename = f"{base_filename}_bookings_{timestamp}.csv"
    if compress:
        bookings_filename += ".gz"
        compression = 'gzip'
    else:
        compression = None
    
    print(f"üìÅ Exporting bookings data to {bookings_filename}...")
    bookings_df.to_csv(bookings_filename, index=False, compression=compression)
    
    # Calculate file size
    bookings_size = os.path.getsize(bookings_filename)
    
    # Export passengers data if provided
    passengers_filename = None
    if export_passengers and passengers_df is not None:
        passengers_filename = f"{base_filename}_passengers_{timestamp}.csv"
        if compress:
            passengers_filename += ".gz"
        
        print(f"üìÅ Exporting passengers data to {passengers_filename}...")
        passengers_df.to_csv(passengers_filename, index=False, compression=compression)
        
        passengers_size = os.path.getsize(passengers_filename)
    
    print("\n‚úÖ Export complete!")
    print("-" * 70)
    
    # Format file sizes
    def format_size(size_bytes):
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.2f} {unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.2f} TB"
    
    print(f"üìä BOOKINGS FILE:")
    print(f"   Name: {bookings_filename}")
    print(f"   Size: {format_size(bookings_size)}")
    print(f"   Rows: {len(bookings_df):,}")
    print(f"   Columns: {len(bookings_df.columns)}")
    
    if export_passengers and passengers_df is not None:
        print(f"\nüìä PASSENGERS FILE:")
        print(f"   Name: {passengers_filename}")
        print(f"   Size: {format_size(passengers_size)}")
        print(f"   Rows: {len(passengers_df):,}")
        print(f"   Columns: {len(passengers_df.columns)}")
    
    print(f"\nüìÅ Files saved in: {os.path.abspath('.')}")
    
    return bookings_filename, passengers_filename

# ============================================================================
# 4. DATA ANALYSIS & SUMMARY
# ============================================================================
def analyze_data(bookings_df, passengers_df=None):
    """
    Analyze and display summary statistics of the generated data
    """
    
    print("\n" + "=" * 70)
    print("üìà DATA ANALYSIS & SUMMARY")
    print("=" * 70)
    
    # Bookings analysis
    print(f"\nüìñ BOOKINGS SUMMARY:")
    print(f"   Total bookings: {len(bookings_df):,}")
    print(f"   Date range: {bookings_df['travel_date'].min().date()} to {bookings_df['travel_date'].max().date()}")
    print(f"   Booking lead time (avg): {(bookings_df['travel_date'] - bookings_df['booking_datetime'].dt.date).mean().days:.1f} days")
    
    # Revenue analysis
    total_revenue = bookings_df[bookings_df['booking_status'] == 'Confirmed']['price'].sum()
    avg_price = bookings_df[bookings_df['booking_status'] == 'Confirmed']['price'].mean()
    
    print(f"\nüí∞ REVENUE ANALYSIS:")
    print(f"   Total revenue: ${total_revenue:,.2f}")
    print(f"   Average booking price: ${avg_price:.2f}")
    print(f"   Min price: ${bookings_df['price'].min():.2f}")
    print(f"   Max price: ${bookings_df['price'].max():.2f}")
    
    # Fare class distribution
    print(f"\nüé´ FARE CLASS DISTRIBUTION:")
    fare_dist = bookings_df['fare_class'].value_counts().sort_index()
    for fare_class, count in fare_dist.items():
        percentage = (count / len(bookings_df)) * 100
        cabin = bookings_df[bookings_df['fare_class'] == fare_class]['cabin'].iloc[0]
        avg_fare = bookings_df[bookings_df['fare_class'] == fare_class]['price'].mean()
        print(f"   {fare_class} ({cabin}): {count:,} bookings ({percentage:.1f}%) | Avg: ${avg_fare:.2f}")
    
    # Sales channel analysis
    print(f"\nüõí SALES CHANNEL ANALYSIS:")
    channel_dist = bookings_df['sales_channel'].value_counts()
    for channel, count in channel_dist.items():
        percentage = (count / len(bookings_df)) * 100
        avg_channel_price = bookings_df[bookings_df['sales_channel'] == channel]['price'].mean()
        print(f"   {channel}: {count:,} bookings ({percentage:.1f}%) | Avg price: ${avg_channel_price:.2f}")
    
    # Booking status
    print(f"\nüìä BOOKING STATUS:")
    status_dist = bookings_df['booking_status'].value_counts()
    for status, count in status_dist.items():
        percentage = (count / len(bookings_df)) * 100
        print(f"   {status}: {count:,} bookings ({percentage:.1f}%)")
    
    # POS country analysis
    print(f"\nüåç TOP 5 POS COUNTRIES:")
    pos_dist = bookings_df['pos_country'].value_counts().head()
    for country, count in pos_dist.items():
        percentage = (count / len(bookings_df)) * 100
        print(f"   {country}: {count:,} bookings ({percentage:.1f}%)")
    
    # Passengers analysis if available
    if passengers_df is not None:
        print(f"\nüë• PASSENGERS SUMMARY:")
        print(f"   Total passengers: {len(passengers_df):,}")
        print(f"   Unique nationalities: {passengers_df['nationality'].nunique()}")
        
        # Gender distribution
        gender_dist = passengers_df['gender'].value_counts()
        print(f"   Gender distribution:")
        for gender, count in gender_dist.items():
            percentage = (count / len(passengers_df)) * 100
            print(f"     {gender}: {count:,} passengers ({percentage:.1f}%)")
    
    # Monthly revenue trend
    print(f"\nüìÖ MONTHLY REVENUE TREND:")
    bookings_df['travel_month'] = bookings_df['travel_date'].dt.to_period('M')
    monthly_revenue = bookings_df[bookings_df['booking_status'] == 'Confirmed'].groupby('travel_month')['price'].sum()
    for month, revenue in monthly_revenue.items():
        print(f"   {month}: ${revenue:,.2f}")
    
    # Sample data preview
    print(f"\nüëÅÔ∏è  SAMPLE DATA (first 3 bookings):")
    print(bookings_df.head(3).to_string())
    
    if passengers_df is not None:
        print(f"\nüëÅÔ∏è  SAMPLE PASSENGERS (first 3):")
        print(passengers_df.head(3).to_string())

# ============================================================================
# 5. MAIN FUNCTION
# ============================================================================
def main():
    """
    Main execution function
    """
    
    print("=" * 70)
    print("üöÄ AIRLINE BOOKINGS DATA GENERATOR")
    print("=" * 70)
    print("This script generates realistic airline booking data.")
    print("=" * 70)
    
    # Configuration
    config = {
        "num_flights": 200,           # Number of flights to generate
        "max_bookings": 10000,        # Maximum bookings to generate
        "start_date": "2024-01-01",   # Start date for flights
        "end_date": "2024-06-30",     # End date for flights
        "base_filename": "airline",   # Base name for output files
        "export_passengers": True,    # Export passengers data
        "compress_csv": False         # Compress CSV files
    }
    
    print("\n‚öôÔ∏è  CONFIGURATION:")
    for key, value in config.items():
        print(f"   {key}: {value}")
    
    # Ask for confirmation
    print("\n" + "-" * 70)
    response = input("üëâ Press Enter to start generation or 'q' to quit: ")
    if response.lower() == 'q':
        print("Exiting...")
        return
    
    try:
        # Step 1: Generate flights data
        flights_df = generate_flights_data(
            num_flights=config["num_flights"],
            start_date=config["start_date"],
            end_date=config["end_date"]
        )
        
        # Step 2: Generate bookings data
        bookings_df, passengers_df = generate_bookings_data(
            flights_df=flights_df,
            max_bookings=config["max_bookings"],
            output_csv="temp_bookings.csv"  # Temporary file
        )
        
        # Step 3: Export to CSV
        bookings_file, passengers_file = export_to_csv(
            bookings_df=bookings_df,
            passengers_df=passengers_df if config["export_passengers"] else None,
            base_filename=config["base_filename"],
            export_passengers=config["export_passengers"],
            compress=config["compress_csv"]
        )
        
        # Step 4: Analyze data
        analyze_data(bookings_df, passengers_df if config["export_passengers"] else None)
        
        # Step 5: Optional - Save flights data
        save_flights = input("\nüëâ Save flights data as CSV? (y/n): ")
        if save_flights.lower() == 'y':
            flights_file = f"{config['base_filename']}_flights_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            flights_df.to_csv(flights_file, index=False)
            print(f"‚úÖ Flights data saved to: {flights_file}")
        
        print("\n" + "=" * 70)
        print("üéâ GENERATION COMPLETE!")
        print("=" * 70)
        print(f"üìÅ Files generated:")
        print(f"   ‚Ä¢ Bookings: {bookings_file}")
        if config["export_passengers"]:
            print(f"   ‚Ä¢ Passengers: {passengers_file}")
        print(f"\nüìä Total records generated:")
        print(f"   ‚Ä¢ Flights: {len(flights_df):,}")
        print(f"   ‚Ä¢ Bookings: {len(bookings_df):,}")
        if config["export_passengers"]:
            print(f"   ‚Ä¢ Passengers: {len(passengers_df):,}")
        print("=" * 70)
        
    except Exception as e:
        print(f"\n‚ùå ERROR: {e}")
        print("Generation failed. Please check your configuration.")
        import traceback
        traceback.print_exc()

# ============================================================================
# 6. COMMAND LINE INTERFACE
# ============================================================================
if __name__ == "__main__":
    # Check for command line arguments
    if len(sys.argv) > 1:
        # Simple command line interface
        import argparse
        
        parser = argparse.ArgumentParser(description='Generate airline bookings data')
        parser.add_argument('--flights', type=int, default=200, help='Number of flights to generate')
        parser.add_argument('--bookings', type=int, default=10000, help='Maximum bookings to generate')
        parser.add_argument('--start', type=str, default='2024-01-01', help='Start date (YYYY-MM-DD)')
        parser.add_argument('--end', type=str, default='2024-06-30', help='End date (YYYY-MM-DD)')
        parser.add_argument('--output', type=str, default='airline', help='Base output filename')
        parser.add_argument('--no-passengers', action='store_true', help='Skip passengers data generation')
        parser.add_argument('--compress', action='store_true', help='Compress output CSV files')
        
        args = parser.parse_args()
        
        # Run with command line arguments
        flights_df = generate_flights_data(
            num_flights=args.flights,
            start_date=args.start,
            end_date=args.end
        )
        
        bookings_df, passengers_df = generate_bookings_data(
            flights_df=flights_df,
            max_bookings=args.bookings,
            output_csv="temp_bookings.csv"
        )
        
        export_to_csv(
            bookings_df=bookings_df,
            passengers_df=passengers_df if not args.no_passengers else None,
            base_filename=args.output,
            export_passengers=not args.no_passengers,
            compress=args.compress
        )
        
        analyze_data(bookings_df, passengers_df if not args.no_passengers else None)
    else:
        # Run interactive mode
        main()

usage: ipykernel_launcher.py [-h] [--flights FLIGHTS] [--bookings BOOKINGS] [--start START] [--end END]
                             [--output OUTPUT] [--no-passengers] [--compress]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\superadmin\AppData\Roaming\jupyter\runtime\kernel-011d4f0e-6cd1-4e08-8961-88e5c67724a9.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
# ============================================================================
# AIRLINE BOOKINGS DATA GENERATOR
# ============================================================================
# This script generates realistic airline booking data with:
# - Flight information
# - Passenger bookings
# - Progress tracking
# - CSV export functionality
# ============================================================================

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from tqdm import tqdm
import os
import sys

# ============================================================================
# 1. FLIGHTS DATA GENERATION
# ============================================================================
def generate_flights_data(num_flights=500, start_date="2024-01-01", end_date="2024-12-31"):
    """
    Generate sample flights data
    
    Parameters:
    -----------
    num_flights : int
        Number of flights to generate
    start_date : str
        Start date for flights (YYYY-MM-DD)
    end_date : str
        End date for flights (YYYY-MM-DD)
    """
    
    print("=" * 70)
    print("‚úàÔ∏è  GENERATING FLIGHTS DATA")
    print("=" * 70)
    
    # Airport codes and routes
    airports = {
        "DOH": {"name": "Hamad International Airport", "city": "Doha", "country": "Qatar"},
        "LHR": {"name": "Heathrow Airport", "city": "London", "country": "UK"},
        "JFK": {"name": "John F. Kennedy International", "city": "New York", "country": "USA"},
        "BKK": {"name": "Suvarnabhumi Airport", "city": "Bangkok", "country": "Thailand"},
        "DEL": {"name": "Indira Gandhi International", "city": "Delhi", "country": "India"},
        "NBO": {"name": "Jomo Kenyatta International", "city": "Nairobi", "country": "Kenya"},
        "FRA": {"name": "Frankfurt Airport", "city": "Frankfurt", "country": "Germany"},
        "DXB": {"name": "Dubai International", "city": "Dubai", "country": "UAE"},
        "SIN": {"name": "Changi Airport", "city": "Singapore", "country": "Singapore"},
        "SYD": {"name": "Sydney Airport", "city": "Sydney", "country": "Australia"}
    }
    
    # Popular routes from Doha
    routes = [
        ("DOH", "LHR"), ("DOH", "JFK"), ("DOH", "BKK"), ("DOH", "DEL"),
        ("DOH", "NBO"), ("DOH", "FRA"), ("DOH", "DXB"), ("DOH", "SIN"),
        ("LHR", "JFK"), ("BKK", "SIN"), ("FRA", "JFK"), ("DEL", "BKK")
    ]
    
    # Aircraft types with capacities
    aircraft = [
        ("B77W", "Boeing 777-300ER", 370),
        ("A359", "Airbus A350-900", 325),
        ("B789", "Boeing 787-9", 290),
        ("A333", "Airbus A330-300", 275),
        ("B738", "Boeing 737-800", 180)
    ]
    
    # Generate flight dates between start and end date
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    date_range = (end - start).days
    
    flights_data = []
    
    print(f"üìä Generating {num_flights:,} flights...")
    pbar = tqdm(total=num_flights, desc="Creating flights", unit="flights")
    
    for i in range(1, num_flights + 1):
        # Random flight date
        flight_date = start + timedelta(days=random.randint(0, date_range))
        
        # Random route
        origin, destination = random.choice(routes)
        
        # Random aircraft
        aircraft_code, aircraft_name, capacity = random.choice(aircraft)
        
        # Flight number (QR = Qatar Airways code)
        flight_number = f"QR{random.randint(100, 999)}"
        
        # Generate random departure time (between 00:00 and 23:30)
        departure_hour = random.randint(0, 23)
        departure_minute = random.choice([0, 15, 30, 45])
        departure_time = flight_date.replace(hour=departure_hour, minute=departure_minute)
        
        # Flight duration based on route (in hours)
        route_key = f"{origin}-{destination}"
        route_durations = {
            "DOH-LHR": 7, "DOH-JFK": 12, "DOH-BKK": 7, "DOH-DEL": 3,
            "DOH-NBO": 5, "DOH-FRA": 6, "DOH-DXB": 1, "DOH-SIN": 8,
            "LHR-JFK": 8, "BKK-SIN": 2, "FRA-JFK": 9, "DEL-BKK": 4
        }
        duration_hours = route_durations.get(route_key, random.randint(2, 12))
        
        # Arrival time
        arrival_time = departure_time + timedelta(hours=duration_hours)
        
        # Create flight record
        flights_data.append({
            "flight_id": i,
            "flight_number": flight_number,
            "flight_date": flight_date.date(),
            "departure_time": departure_time,
            "arrival_time": arrival_time,
            "origin": origin,
            "destination": destination,
            "aircraft_code": aircraft_code,
            "aircraft_name": aircraft_name,
            "capacity": capacity,
            "duration_hours": duration_hours,
            "airline": "Qatar Airways"
        })
        
        pbar.update(1)
    
    pbar.close()
    
    # Create DataFrame
    flights_df = pd.DataFrame(flights_data)
    
    # Convert datetime columns
    flights_df['flight_date'] = pd.to_datetime(flights_df['flight_date'])
    flights_df['departure_time'] = pd.to_datetime(flights_df['departure_time'])
    flights_df['arrival_time'] = pd.to_datetime(flights_df['arrival_time'])
    
    print(f"\n‚úÖ Generated {len(flights_df):,} flights")
    print(f"   Date range: {flights_df['flight_date'].min().date()} to {flights_df['flight_date'].max().date()}")
    print(f"   Routes: {flights_df['origin'].nunique()} origins ‚Üí {flights_df['destination'].nunique()} destinations")
    
    return flights_df

# ============================================================================
# 2. BOOKINGS DATA GENERATION
# ============================================================================
def generate_bookings_data(flights_df, max_bookings=32000, output_csv='bookings.csv'):
    """
    Generate realistic bookings data and save to CSV file
    
    Parameters:
    -----------
    flights_df : pandas DataFrame
        DataFrame containing flight information
    max_bookings : int
        Maximum number of bookings to generate (default: 32,000)
    output_csv : str
        Output CSV file name/path
    """
    
    print("\n" + "=" * 70)
    print("üìñ GENERATING BOOKINGS DATA")
    print("=" * 70)
    print(f"üöÄ Starting bookings data generation...")
    print(f"   Max bookings to generate: {max_bookings:,}")
    print(f"   Output file: {output_csv}")
    print("-" * 70)
    
    # Define fare classes with code, name, and base price
    fare_classes = [
        ("F", "First", 5000),
        ("J", "Business", 3200),
        ("W", "Premium Economy", 1500),
        ("Y", "Economy", 900),
        ("M", "Economy", 600),
        ("L", "Economy", 350)
    ]
    
    # Sales channels and point-of-sale countries
    channels = ["Direct", "OTA", "GDS", "Corporate", "Travel Agent"]
    pos_countries = ["Qatar", "UK", "USA", "Kenya", "India", "Thailand", 
                     "Germany", "UAE", "Australia", "Singapore", "France", "Japan"]
    
    # Passenger names for realistic data
    first_names = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer", 
                   "Michael", "Linda", "William", "Elizabeth", "David", "Susan",
                   "Richard", "Jessica", "Joseph", "Sarah", "Thomas", "Karen",
                   "Charles", "Nancy", "Ahmed", "Fatima", "Mohammed", "Aisha",
                   "Ali", "Zainab", "Hassan", "Mariam", "Omar", "Layla"]
    
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia",
                  "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez",
                  "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore",
                  "Jackson", "Martin", "Al-Sayed", "Al-Khalifa", "Al-Thani", "Khan",
                  "Patel", "Singh", "Chen", "Wang", "Tanaka", "Kim"]
    
    # Initialize storage
    booking_records = []
    passenger_records = []
    booking_id = 1
    passenger_id = 1
    
    # Calculate statistics for progress bar
    total_flights = len(flights_df)
    total_potential_bookings = min(max_bookings, total_flights * 30)
    
    print(f"üìä Processing {total_flights:,} flights...")
    print(f"üìà Estimated bookings: {total_potential_bookings:,}")
    
    # Create progress bar
    pbar = tqdm(total=total_potential_bookings, 
                desc="Generating bookings", 
                unit="bookings",
                bar_format='{l_bar}{bar:50}{r_bar}{bar:-50b}')
    
    # Generate bookings for each flight
    for _, flight in flights_df.iterrows():
        if booking_id > max_bookings:
            break
            
        # Random number of bookings per flight (based on capacity)
        capacity_utilization = random.uniform(0.7, 0.95)  # 70-95% full
        num_bookings = int(flight["capacity"] * capacity_utilization / random.uniform(1.5, 3.0))
        num_bookings = max(5, min(num_bookings, 50))  # Limit between 5 and 50
        
        for _ in range(num_bookings):
            if booking_id > max_bookings:
                break
                
            # Random fare class selection with weighted probabilities
            fare_weights = [0.05, 0.15, 0.10, 0.30, 0.25, 0.15]  # Higher probability for economy
            fare = random.choices(fare_classes, weights=fare_weights, k=1)[0]
            
            # Random booking date (1-180 days before flight, weighted toward closer dates)
            days_before_options = list(range(1, 181))
            # Weight: earlier bookings less likely than last-minute
            weights = [1/(i**0.7) for i in days_before_options]
            days_before = random.choices(days_before_options, weights=weights, k=1)[0]
            
            # Calculate booking date and time
            booking_date = flight["flight_date"] - timedelta(days=days_before)
            
            # Add random time to booking date
            booking_time = booking_date.replace(
                hour=random.randint(0, 23),
                minute=random.choice([0, 15, 30, 45])
            )
            
            # Price calculation with seasonal variation and booking time factor
            base_price = fare[2]
            
            # Seasonal factor (higher in summer and holidays)
            month = flight["flight_date"].month
            if month in [6, 7, 8, 12]:  # Summer and December
                season_factor = random.uniform(1.1, 1.3)
            elif month in [1, 2, 9]:  # Lower season
                season_factor = random.uniform(0.9, 1.0)
            else:
                season_factor = random.uniform(1.0, 1.1)
            
            # Booking time factor (last-minute bookings more expensive)
            if days_before <= 7:
                time_factor = random.uniform(1.2, 1.5)
            elif days_before <= 14:
                time_factor = random.uniform(1.1, 1.3)
            else:
                time_factor = random.uniform(0.85, 1.15)
            
            # Channel discount factor
            channel = random.choice(channels)
            channel_discounts = {
                "Direct": random.uniform(0.95, 1.05),  # No discount
                "OTA": random.uniform(0.85, 0.95),     # 5-15% discount
                "GDS": random.uniform(0.88, 0.98),     # 2-12% discount
                "Corporate": random.uniform(0.75, 0.85), # 15-25% discount
                "Travel Agent": random.uniform(0.90, 1.0) # 0-10% discount
            }
            channel_factor = channel_discounts[channel]
            
            # Calculate final price
            final_price = round(base_price * season_factor * time_factor * channel_factor, 2)
            
            # Random passenger count
            passenger_count = random.choices([1, 2, 3, 4], weights=[0.4, 0.35, 0.15, 0.1], k=1)[0]
            
            # POS country (biased toward origin/destination countries)
            origin_country = "Qatar" if flight["origin"] == "DOH" else "Other"
            destination_country_map = {
                "LHR": "UK", "JFK": "USA", "BKK": "Thailand", 
                "DEL": "India", "NBO": "Kenya", "FRA": "Germany"
            }
            destination_country = destination_country_map.get(flight["destination"], random.choice(pos_countries))
            
            # Weight POS country selection
            pos_country_choices = [destination_country, origin_country, random.choice(pos_countries)]
            pos_country = random.choices(pos_country_choices, weights=[0.5, 0.3, 0.2], k=1)[0]
            
            # Generate passenger records for this booking
            passengers_in_booking = []
            for pax_num in range(passenger_count):
                # Random passenger details
                gender = random.choice(["M", "F"])
                first_name = random.choice([n for n in first_names if 
                                           (gender == "M" and n not in ["Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Susan", "Jessica", "Sarah", "Karen", "Nancy", "Fatima", "Aisha", "Zainab", "Mariam", "Layla"]) or
                                           (gender == "F" and n not in ["James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph", "Thomas", "Charles", "Ahmed", "Mohammed", "Ali", "Hassan", "Omar"])])
                last_name = random.choice(last_names)
                dob = flight["flight_date"] - timedelta(days=random.randint(18*365, 70*365))
                
                # Generate random passport number
                passport = f"{random.choice(['A', 'B', 'C', 'P'])}{random.randint(100000, 999999)}"
                
                passenger_records.append({
                    "passenger_id": passenger_id,
                    "booking_id": booking_id,
                    "first_name": first_name,
                    "last_name": last_name,
                    "gender": gender,
                    "date_of_birth": dob.date(),
                    "passport_number": passport,
                    "nationality": pos_country,
                    "seat_number": f"{random.choice(['A', 'B', 'C', 'D', 'E', 'F'])}{random.randint(1, 40)}"
                })
                
                passengers_in_booking.append(f"{first_name} {last_name}")
                passenger_id += 1
            
            # Create booking record
            booking_records.append({
                "booking_id": booking_id,
                "flight_id": flight["flight_id"],
                "booking_datetime": booking_time,
                "travel_date": flight["flight_date"].date(),
                "fare_class": fare[0],
                "cabin": fare[1],
                "price": final_price,
                "passenger_count": passenger_count,
                "passenger_names": ", ".join(passengers_in_booking),
                "pos_country": pos_country,
                "sales_channel": channel,
                "payment_method": random.choice(["Credit Card", "Debit Card", "Bank Transfer", "Corporate Account"]),
                "booking_status": random.choices(["Confirmed", "Cancelled"], weights=[0.92, 0.08], k=1)[0],
                "cancellation_date": None if random.random() > 0.08 else booking_time + timedelta(days=random.randint(1, days_before-1))
            })
            
            booking_id += 1
            pbar.update(1)
            
            # Update progress bar description occasionally
            if booking_id % 1000 == 0:
                pbar.set_description(f"Creating bookings (ID: {booking_id:,})")
    
    # Close progress bar
    pbar.close()
    
    # Create DataFrames
    bookings_df = pd.DataFrame(booking_records)
    passengers_df = pd.DataFrame(passenger_records)
    
    # Convert date columns
    bookings_df['booking_datetime'] = pd.to_datetime(bookings_df['booking_datetime'])
    bookings_df['travel_date'] = pd.to_datetime(bookings_df['travel_date'])
    passengers_df['date_of_birth'] = pd.to_datetime(passengers_df['date_of_birth'])
    
    # Calculate cancellation rate
    cancelled_count = bookings_df[bookings_df['booking_status'] == 'Cancelled'].shape[0]
    cancellation_rate = (cancelled_count / len(bookings_df)) * 100 if len(bookings_df) > 0 else 0
    
    print(f"\n‚úÖ Generated {len(bookings_df):,} booking records")
    print(f"‚úÖ Generated {len(passengers_df):,} passenger records")
    print(f"üìä Cancellation rate: {cancellation_rate:.1f}% ({cancelled_count:,} cancelled bookings)")
    
    return bookings_df, passengers_df

# ============================================================================
# 3. CSV EXPORT FUNCTION
# ============================================================================
def export_to_csv(bookings_df, passengers_df=None, base_filename="airline_data", 
                  export_passengers=True, compress=False):
    """
    Export data to CSV files
    
    Parameters:
    -----------
    bookings_df : pandas DataFrame
        Bookings data
    passengers_df : pandas DataFrame, optional
        Passengers data
    base_filename : str
        Base name for output files
    export_passengers : bool
        Whether to export passengers data
    compress : bool
        Whether to compress the CSV files
    """
    
    print("\n" + "=" * 70)
    print("üíæ EXPORTING DATA TO CSV")
    print("=" * 70)
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Export bookings data
    bookings_filename = f"{base_filename}_bookings_{timestamp}.csv"
    if compress:
        bookings_filename += ".gz"
        compression = 'gzip'
    else:
        compression = None
    
    print(f"üìÅ Exporting bookings data to {bookings_filename}...")
    bookings_df.to_csv(bookings_filename, index=False, compression=compression)
    
    # Calculate file size
    bookings_size = os.path.getsize(bookings_filename)
    
    # Export passengers data if provided
    passengers_filename = None
    if export_passengers and passengers_df is not None:
        passengers_filename = f"{base_filename}_passengers_{timestamp}.csv"
        if compress:
            passengers_filename += ".gz"
        
        print(f"üìÅ Exporting passengers data to {passengers_filename}...")
        passengers_df.to_csv(passengers_filename, index=False, compression=compression)
        
        passengers_size = os.path.getsize(passengers_filename)
    
    print("\n‚úÖ Export complete!")
    print("-" * 70)
    
    # Format file sizes
    def format_size(size_bytes):
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.2f} {unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.2f} TB"
    
    print(f"üìä BOOKINGS FILE:")
    print(f"   Name: {bookings_filename}")
    print(f"   Size: {format_size(bookings_size)}")
    print(f"   Rows: {len(bookings_df):,}")
    print(f"   Columns: {len(bookings_df.columns)}")
    
    if export_passengers and passengers_df is not None:
        print(f"\nüìä PASSENGERS FILE:")
        print(f"   Name: {passengers_filename}")
        print(f"   Size: {format_size(passengers_size)}")
        print(f"   Rows: {len(passengers_df):,}")
        print(f"   Columns: {len(passengers_df.columns)}")
    
    print(f"\nüìÅ Files saved in: {os.path.abspath('.')}")
    
    return bookings_filename, passengers_filename

# ============================================================================
# 4. DATA ANALYSIS & SUMMARY
# ============================================================================
def analyze_data(bookings_df, passengers_df=None):
    """
    Analyze and display summary statistics of the generated data
    """
    
    print("\n" + "=" * 70)
    print("üìà DATA ANALYSIS & SUMMARY")
    print("=" * 70)
    
    # Bookings analysis
    print(f"\nüìñ BOOKINGS SUMMARY:")
    print(f"   Total bookings: {len(bookings_df):,}")
    print(f"   Date range: {bookings_df['travel_date'].min().date()} to {bookings_df['travel_date'].max().date()}")
    print(f"   Booking lead time (avg): {(bookings_df['travel_date'] - bookings_df['booking_datetime'].dt.date).mean().days:.1f} days")
    
    # Revenue analysis
    total_revenue = bookings_df[bookings_df['booking_status'] == 'Confirmed']['price'].sum()
    avg_price = bookings_df[bookings_df['booking_status'] == 'Confirmed']['price'].mean()
    
    print(f"\nüí∞ REVENUE ANALYSIS:")
    print(f"   Total revenue: ${total_revenue:,.2f}")
    print(f"   Average booking price: ${avg_price:.2f}")
    print(f"   Min price: ${bookings_df['price'].min():.2f}")
    print(f"   Max price: ${bookings_df['price'].max():.2f}")
    
    # Fare class distribution
    print(f"\nüé´ FARE CLASS DISTRIBUTION:")
    fare_dist = bookings_df['fare_class'].value_counts().sort_index()
    for fare_class, count in fare_dist.items():
        percentage = (count / len(bookings_df)) * 100
        cabin = bookings_df[bookings_df['fare_class'] == fare_class]['cabin'].iloc[0]
        avg_fare = bookings_df[bookings_df['fare_class'] == fare_class]['price'].mean()
        print(f"   {fare_class} ({cabin}): {count:,} bookings ({percentage:.1f}%) | Avg: ${avg_fare:.2f}")
    
    # Sales channel analysis
    print(f"\nüõí SALES CHANNEL ANALYSIS:")
    channel_dist = bookings_df['sales_channel'].value_counts()
    for channel, count in channel_dist.items():
        percentage = (count / len(bookings_df)) * 100
        avg_channel_price = bookings_df[bookings_df['sales_channel'] == channel]['price'].mean()
        print(f"   {channel}: {count:,} bookings ({percentage:.1f}%) | Avg price: ${avg_channel_price:.2f}")
    
    # Booking status
    print(f"\nüìä BOOKING STATUS:")
    status_dist = bookings_df['booking_status'].value_counts()
    for status, count in status_dist.items():
        percentage = (count / len(bookings_df)) * 100
        print(f"   {status}: {count:,} bookings ({percentage:.1f}%)")
    
    # POS country analysis
    print(f"\nüåç TOP 5 POS COUNTRIES:")
    pos_dist = bookings_df['pos_country'].value_counts().head()
    for country, count in pos_dist.items():
        percentage = (count / len(bookings_df)) * 100
        print(f"   {country}: {count:,} bookings ({percentage:.1f}%)")
    
    # Passengers analysis if available
    if passengers_df is not None:
        print(f"\nüë• PASSENGERS SUMMARY:")
        print(f"   Total passengers: {len(passengers_df):,}")
        print(f"   Unique nationalities: {passengers_df['nationality'].nunique()}")
        
        # Gender distribution
        gender_dist = passengers_df['gender'].value_counts()
        print(f"   Gender distribution:")
        for gender, count in gender_dist.items():
            percentage = (count / len(passengers_df)) * 100
            print(f"     {gender}: {count:,} passengers ({percentage:.1f}%)")
    
    # Monthly revenue trend
    print(f"\nüìÖ MONTHLY REVENUE TREND:")
    bookings_df['travel_month'] = bookings_df['travel_date'].dt.to_period('M')
    monthly_revenue = bookings_df[bookings_df['booking_status'] == 'Confirmed'].groupby('travel_month')['price'].sum()
    for month, revenue in monthly_revenue.items():
        print(f"   {month}: ${revenue:,.2f}")
    
    # Sample data preview
    print(f"\nüëÅÔ∏è  SAMPLE DATA (first 3 bookings):")
    print(bookings_df.head(3).to_string())
    
    if passengers_df is not None:
        print(f"\nüëÅÔ∏è  SAMPLE PASSENGERS (first 3):")
        print(passengers_df.head(3).to_string())

# ============================================================================
# 5. MAIN FUNCTION
# ============================================================================
def main():
    """
    Main execution function
    """
    
    print("=" * 70)
    print("üöÄ AIRLINE BOOKINGS DATA GENERATOR")
    print("=" * 70)
    print("This script generates realistic airline booking data.")
    print("=" * 70)
    
    # Configuration
    config = {
        "num_flights": 200,           # Number of flights to generate
        "max_bookings": 10000,        # Maximum bookings to generate
        "start_date": "2024-01-01",   # Start date for flights
        "end_date": "2024-06-30",     # End date for flights
        "base_filename": "airline",   # Base name for output files
        "export_passengers": True,    # Export passengers data
        "compress_csv": False         # Compress CSV files
    }
    
    print("\n‚öôÔ∏è  CONFIGURATION:")
    for key, value in config.items():
        print(f"   {key}: {value}")
    
    # Ask for confirmation
    print("\n" + "-" * 70)
    response = input("üëâ Press Enter to start generation or 'q' to quit: ")
    if response.lower() == 'q':
        print("Exiting...")
        return
    
    try:
        # Step 1: Generate flights data
        flights_df = generate_flights_data(
            num_flights=config["num_flights"],
            start_date=config["start_date"],
            end_date=config["end_date"]
        )
        
        # Step 2: Generate bookings data
        bookings_df, passengers_df = generate_bookings_data(
            flights_df=flights_df,
            max_bookings=config["max_bookings"],
            output_csv="temp_bookings.csv"  # Temporary file
        )
        
        # Step 3: Export to CSV
        bookings_file, passengers_file = export_to_csv(
            bookings_df=bookings_df,
            passengers_df=passengers_df if config["export_passengers"] else None,
            base_filename=config["base_filename"],
            export_passengers=config["export_passengers"],
            compress=config["compress_csv"]
        )
        
        # Step 4: Analyze data
        analyze_data(bookings_df, passengers_df if config["export_passengers"] else None)
        
        # Step 5: Optional - Save flights data
        save_flights = input("\nüëâ Save flights data as CSV? (y/n): ")
        if save_flights.lower() == 'y':
            flights_file = f"{config['base_filename']}_flights_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            flights_df.to_csv(flights_file, index=False)
            print(f"‚úÖ Flights data saved to: {flights_file}")
        
        print("\n" + "=" * 70)
        print("üéâ GENERATION COMPLETE!")
        print("=" * 70)
        print(f"üìÅ Files generated:")
        print(f"   ‚Ä¢ Bookings: {bookings_file}")
        if config["export_passengers"]:
            print(f"   ‚Ä¢ Passengers: {passengers_file}")
        print(f"\nüìä Total records generated:")
        print(f"   ‚Ä¢ Flights: {len(flights_df):,}")
        print(f"   ‚Ä¢ Bookings: {len(bookings_df):,}")
        if config["export_passengers"]:
            print(f"   ‚Ä¢ Passengers: {len(passengers_df):,}")
        print("=" * 70)
        
    except Exception as e:
        print(f"\n‚ùå ERROR: {e}")
        print("Generation failed. Please check your configuration.")
        import traceback
        traceback.print_exc()

# ============================================================================
# 6. COMMAND LINE INTERFACE
# ============================================================================
if __name__ == "__main__":
    # Check for command line arguments
    if len(sys.argv) > 1:
        # Simple command line interface
        import argparse
        
        parser = argparse.ArgumentParser(description='Generate airline bookings data')
        parser.add_argument('--flights', type=int, default=200, help='Number of flights to generate')
        parser.add_argument('--bookings', type=int, default=10000, help='Maximum bookings to generate')
        parser.add_argument('--start', type=str, default='2024-01-01', help='Start date (YYYY-MM-DD)')
        parser.add_argument('--end', type=str, default='2024-06-30', help='End date (YYYY-MM-DD)')
        parser.add_argument('--output', type=str, default='airline', help='Base output filename')
        parser.add_argument('--no-passengers', action='store_true', help='Skip passengers data generation')
        parser.add_argument('--compress', action='store_true', help='Compress output CSV files')
        
        args = parser.parse_args()
        
        # Run with command line arguments
        flights_df = generate_flights_data(
            num_flights=args.flights,
            start_date=args.start,
            end_date=args.end
        )
        
        bookings_df, passengers_df = generate_bookings_data(
            flights_df=flights_df,
            max_bookings=args.bookings,
            output_csv="temp_bookings.csv"
        )
        
        export_to_csv(
            bookings_df=bookings_df,
            passengers_df=passengers_df if not args.no_passengers else None,
            base_filename=args.output,
            export_passengers=not args.no_passengers,
            compress=args.compress
        )
        
        analyze_data(bookings_df, passengers_df if not args.no_passengers else None)
    else:
        # Run interactive mode
        main()

usage: ipykernel_launcher.py [-h] [--flights FLIGHTS] [--bookings BOOKINGS] [--start START] [--end END]
                             [--output OUTPUT] [--no-passengers] [--compress]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\superadmin\AppData\Roaming\jupyter\runtime\kernel-011d4f0e-6cd1-4e08-8961-88e5c67724a9.json


SystemExit: 2

In [9]:
# %% CELL 1: Imports and Setup
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from tqdm import tqdm
import os

print("‚úÖ Imports successful!")

# %% CELL 2: Configuration
config = {
    "num_flights": 100,
    "max_bookings": 5000,
    "start_date": "2024-01-01",
    "end_date": "2024-03-31",
    "output_file": "my_bookings.csv"
}

print("‚öôÔ∏è  Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

# %% CELL 3: Generate Flights
print("\n‚úàÔ∏è  Generating flights...")

# Generate flights data
flights_data = []
start_date = datetime.strptime(config["start_date"], "%Y-%m-%d")
end_date = datetime.strptime(config["end_date"], "%Y-%m-%d")
date_range = (end_date - start_date).days

for i in range(1, config["num_flights"] + 1):
    flight_date = start_date + timedelta(days=random.randint(0, date_range))
    flights_data.append({
        "flight_id": i,
        "flight_date": flight_date,
        "origin": random.choice(["DOH", "LHR", "JFK", "BKK"]),
        "destination": random.choice(["LHR", "JFK", "BKK", "DEL"]),
        "capacity": random.choice([180, 275, 290, 370])
    })

flights_df = pd.DataFrame(flights_data)
print(f"‚úÖ Generated {len(flights_df)} flights")

# %% CELL 4: Generate Bookings
print("\nüìñ Generating bookings...")

fare_classes = [("F", "First", 5000), ("J", "Business", 3200), 
                ("Y", "Economy", 900), ("M", "Economy", 600), ("L", "Economy", 350)]
channels = ["Direct", "OTA", "GDS", "Corporate"]
countries = ["Qatar", "UK", "USA", "India", "Germany"]

booking_records = []
booking_id = 1

pbar = tqdm(total=min(config["max_bookings"], config["num_flights"] * 25))

for _, flight in flights_df.iterrows():
    if booking_id > config["max_bookings"]:
        break
        
    num_bookings = random.randint(5, 20)
    
    for _ in range(num_bookings):
        if booking_id > config["max_bookings"]:
            break
            
        fare = random.choice(fare_classes)
        days_before = random.randint(1, 90)
        
        booking_records.append([
            booking_id,
            flight["flight_id"],
            flight["flight_date"] - timedelta(days=days_before),
            flight["flight_date"],
            fare[0],
            fare[1],
            round(fare[2] * random.uniform(0.85, 1.15), 2),
            random.randint(1, 3),
            random.choice(countries),
            random.choice(channels)
        ])
        
        booking_id += 1
        pbar.update(1)

pbar.close()

# Create DataFrame
bookings_df = pd.DataFrame(booking_records, columns=[
    "booking_id", "flight_id", "booking_date", "travel_date",
    "fare_class", "cabin", "price", "passenger_count",
    "pos_country", "sales_channel"
])

print(f"‚úÖ Generated {len(bookings_df)} bookings")

# %% CELL 5: Save to CSV
print(f"\nüíæ Saving to {config['output_file']}...")
bookings_df.to_csv(config["output_file"], index=False)

print(f"‚úÖ File saved: {config['output_file']}")
print(f"üìä File size: {os.path.getsize(config['output_file']) / 1024:.1f} KB")

# Show preview
print("\nüëÅÔ∏è  Data preview:")
display(bookings_df.head())

# Show statistics
print("\nüìà Summary statistics:")
print(f"   Total bookings: {len(bookings_df):,}")
print(f"   Total revenue: ${bookings_df['price'].sum():,.2f}")
print(f"   Average price: ${bookings_df['price'].mean():.2f}")
print(f"   Date range: {bookings_df['booking_date'].min().date()} to {bookings_df['booking_date'].max().date()}")

‚úÖ Imports successful!
‚öôÔ∏è  Configuration:
  num_flights: 100
  max_bookings: 5000
  start_date: 2024-01-01
  end_date: 2024-03-31
  output_file: my_bookings.csv

‚úàÔ∏è  Generating flights...
‚úÖ Generated 100 flights

üìñ Generating bookings...


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé                                    | 1275/2500 [00:00<00:00, 18237.10it/s]

‚úÖ Generated 1275 bookings

üíæ Saving to my_bookings.csv...
‚úÖ File saved: my_bookings.csv
üìä File size: 76.0 KB

üëÅÔ∏è  Data preview:





Unnamed: 0,booking_id,flight_id,booking_date,travel_date,fare_class,cabin,price,passenger_count,pos_country,sales_channel
0,1,1,2024-01-08,2024-01-13,L,Economy,361.18,2,India,GDS
1,2,1,2023-12-20,2024-01-13,J,Business,3635.54,2,Qatar,GDS
2,3,1,2023-12-06,2024-01-13,J,Business,3170.07,2,India,GDS
3,4,1,2024-01-08,2024-01-13,J,Business,3012.8,3,USA,GDS
4,5,1,2023-11-11,2024-01-13,F,First,5187.62,3,India,GDS



üìà Summary statistics:
   Total bookings: 1,275
   Total revenue: $2,645,377.81
   Average price: $2074.81
   Date range: 2023-10-05 to 2024-03-30
