In [15]:
import pandas as pd
import numpy as np
import pytz
from datetime import datetime

In [16]:
# Load data
df = pd.read_csv('fully_wrangled_falcon_launches_v4.csv')
df['date_and_time_utc'] = pd.to_datetime(df['date_and_time_utc'])

In [17]:
# Sort by date for temporal features
df = df.sort_values('date_and_time_utc').reset_index(drop=True)

In [18]:
# 1. days_since_last_launch
df['days_since_last_launch'] = df['date_and_time_utc'].diff().dt.total_seconds() / (24 * 3600)
df['days_since_last_launch'] = df['days_since_last_launch'].fillna(0).astype('float32')

In [19]:
# 2. is_starlink
df['is_starlink'] = (
    df['payload'].str.contains('Starlink', case=False, na=False) |
    ((df['customer'] == 'Spacex') & (df['orbit'].isin(['Leo', 'Sso'])))
).astype('bool')

In [20]:
# 3. booster_reuse_count
df['booster_reuse_count'] = df.groupby('version_booster').cumcount().astype('int32')

In [21]:
# 4. launch_site_encoded
site_mapping = {
    'Cape Canaveral,Slc‑40': 'CC_SLC40',
    'Vandenberg,Slc‑4E': 'V_SLC4E',
    'Kennedy,Lc‑39A': 'K_LC39A'
}
df['launch_site_encoded'] = df['launch_site'].map(site_mapping).astype('category')

In [22]:
# 5. payload_mass_category
bins = [0, 5000, 15000, float('inf')]
labels = ['Light', 'Medium', 'Heavy']
df['payload_mass_category'] = pd.cut(
    df['payload_mass'], bins=bins, labels=labels, include_lowest=True
).astype('category')

In [23]:
# 6. is_crew_mission
df['is_crew_mission'] = (
    df['payload'].str.contains('Crew|Ax-', case=False, na=False) |
    (df['customer'].str.contains('Nasa\(Cts\)|Axiom Space', case=False, na=False) &
     (df['orbit'] == 'Leo(Iss)'))
).astype('bool')

In [24]:
# 7. orbit_type_encoded
orbit_mapping = {
    'Leo': 'Low_Earth', 'Sso': 'Low_Earth', 'Polarleo': 'Low_Earth',
    'Gto': 'Geostationary', 'Geo': 'Geostationary',
    'Meo': 'Medium_Earth',
    'Tli': 'Deep_Space', 'Heliocentric': 'Deep_Space', 'Sun–Earth L2Injection': 'Deep_Space'
}
df['orbit_type_encoded'] = df['orbit'].map(orbit_mapping).astype('category')

In [26]:
# 8. is_night_launch 
def is_night_launch(row):
    utc_time = row['date_and_time_utc']
    # Localize to UTC first
    utc_time = utc_time.tz_localize('UTC')
    if row['launch_site'] in ['Cape Canaveral,Slc‑40', 'Kennedy,Lc‑39A']:
        local_time = utc_time.astimezone(pytz.timezone('America/New_York'))
    else:  # Vandenberg
        local_time = utc_time.astimezone(pytz.timezone('America/Los_Angeles'))
    return 18 <= local_time.hour or local_time.hour < 6

df['is_night_launch'] = df.apply(is_night_launch, axis=1).astype('bool')

In [27]:
# 9. cumulative_launch_count
df['cumulative_launch_count'] = np.arange(1, len(df) + 1).astype('int32')

In [28]:
# 10. customer_type
def get_customer_type(customer):
    if customer == 'Spacex':
        return 'Internal'
    if customer in ['Nasa(Cts)', 'Nasa(Crs)', 'Nasa(Lsp)', 'Nasa(Ccp)', 'Ussf', 'Nro', 'Sda']:
        return 'Government'
    return 'Commercial'

df['customer_type'] = df['customer'].apply(get_customer_type).astype('category')

In [29]:
# 11. launch_success_rate_rolling
df['launch_success_rate_rolling'] = (
    df['launch_outcome'].eq('Success').shift(1).rolling(window=10, min_periods=1).mean()
).fillna(1.0).astype('float32')

In [30]:
# 12. booster_landing_success_rate
def compute_booster_landing_rate(group):
    return group.eq('Success').shift(1).expanding().mean().fillna(1.0)

df['booster_landing_success_rate'] = df.groupby('version_booster')['booster_landing'].apply(
    compute_booster_landing_rate
).astype('float32')

In [31]:
# Validate features
print("Feature Validation:")
for col in ['days_since_last_launch', 'booster_reuse_count', 'cumulative_launch_count',
            'launch_success_rate_rolling', 'booster_landing_success_rate']:
    print(f"\n{col} Summary:")
    print(df[col].describe())
for col in ['is_starlink', 'is_crew_mission', 'is_night_launch', 'launch_site_encoded',
            'payload_mass_category', 'orbit_type_encoded', 'customer_type']:
    print(f"\n{col} Counts:")
    print(df[col].value_counts())

Feature Validation:

days_since_last_launch Summary:
count    288.000000
mean       3.048999
std        2.259413
min        0.000000
25%        1.231944
50%        2.871875
75%        4.274653
max       15.131945
Name: days_since_last_launch, dtype: float64

booster_reuse_count Summary:
count    288.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: booster_reuse_count, dtype: float64

cumulative_launch_count Summary:
count    288.000000
mean     144.500000
std       83.282651
min        1.000000
25%       72.750000
50%      144.500000
75%      216.250000
max      288.000000
Name: cumulative_launch_count, dtype: float64

launch_success_rate_rolling Summary:
count    288.000000
mean       0.996528
std        0.018339
min        0.900000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: launch_success_rate_rolling, dtype: float64

booster_landing_success_rate Summary:
count    288.0
mean    

In [32]:
# Optimize data types
dtypes = {
    'days_since_last_launch': 'float32',
    'is_starlink': 'bool',
    'booster_reuse_count': 'int32',
    'launch_site_encoded': 'category',
    'payload_mass_category': 'category',
    'is_crew_mission': 'bool',
    'orbit_type_encoded': 'category',
    'is_night_launch': 'bool',
    'cumulative_launch_count': 'int32',
    'customer_type': 'category',
    'launch_success_rate_rolling': 'float32',
    'booster_landing_success_rate': 'float32'
}
df = df.astype(dtypes)

In [33]:
# Save updated dataset
df.to_csv('fully_wrangled_falcon_launches_v6.csv', index=False)
print("\nUpdated dataset saved to 'fully_wrangled_falcon_launches_v6.csv'")


Updated dataset saved to 'fully_wrangled_falcon_launches_v6.csv'
