# Third Notebook: Data Preparation

## Step 1: Load Data 
**And creating the file** ***learning_data.csv***


**1. Load the Dataset.**<br>


In [192]:
import pandas as pd

# Read the raw crash records; adjust the relative path if the CSV lives elsewhere.
data_path = '../data/updated_Airplane_Crashes.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the leading rows to see which columns are available.
print(data.head(5))

# Count the missing entries in each column.
print(data.isnull().sum())

        Date   Time                            Location  \
0  9/17/1908  17:18            Fort Myer, Virginia, USA   
1   9/7/1909  00:00             Juvisy-sur-Orge, France   
2  7/12/1912   6:30      Atlantic City, New Jersey, USA   
3   8/6/1913  00:00  Victoria, British Columbia, Canada   
4   9/9/1913  18:30                  Over the North Sea   

                 Operator Flight #          Route                 AC Type  \
0    Military - U.S. Army      NaN  Demonstration        Wright Flyer III   
1                     NaN      NaN       Air show          Wright Byplane   
2    Military - U.S. Navy      NaN    Test flight               Dirigible   
3                 Private      NaN            NaN        Curtiss seaplane   
4  Military - German Navy      NaN            NaN  Zeppelin L-1 (airship)   

  Registration cn/ln  Aboard  Aboard Passangers  Aboard Crew  Fatalities  \
0          NaN     1     2.0                1.0          1.0         1.0   
1          SC1   NaN     1.0  

## Step 2: Encode Time into Vectors

**1. Define the categorize_time_vectorized function.**<br>
**2. Apply it to the Time column to generate Time_Vector.**

In [193]:
# Function to encode time into vectors
def categorize_time_vectorized(time_str):
    """Categorize a clock time into a one-hot encoded time-of-day vector.

    The vector layout is ``[Morning, Afternoon, Evening, Night, Unknown]``:

    * Morning   -- 05:00 to 11:59
    * Afternoon -- 12:00 to 16:59
    * Evening   -- 17:00 to 20:59
    * Night     -- 21:00 to 04:59
    * Unknown   -- missing, non-string, unparseable or out-of-range values,
      and the literal ``"00:00"``, which this function treats as "no time".

    Args:
        time_str: Time as a string whose part before the first ``:`` is the
            hour (e.g. ``"17:18"``, ``"6:30"``); may be null.

    Returns:
        A new list of five ints with exactly one element set to 1.
    """
    # Anything that is not a real string cannot be parsed (and has no .strip()).
    if pd.isnull(time_str) or not isinstance(time_str, str):
        return [0, 0, 0, 0, 1]  # Unknown
    if time_str.strip() == "00:00":
        return [0, 0, 0, 0, 1]  # Unknown

    try:
        hour = int(time_str.split(':')[0])
    except ValueError:
        return [0, 0, 0, 0, 1]  # Unknown

    # Reject impossible hours (e.g. "114:20", or "1730" written without a
    # colon) instead of letting them fall through to the Night bucket.
    if not 0 <= hour <= 23:
        return [0, 0, 0, 0, 1]  # Unknown

    if 5 <= hour < 12:
        return [1, 0, 0, 0, 0]  # Morning
    if 12 <= hour < 17:
        return [0, 1, 0, 0, 0]  # Afternoon
    if 17 <= hour < 21:
        return [0, 0, 1, 0, 0]  # Evening
    return [0, 0, 0, 1, 0]  # Night

# Encode every crash time as its time-of-day one-hot vector.
data['Time_Vector'] = data['Time'].map(categorize_time_vectorized)

# Show the raw time next to its encoding for the first rows.
print(data[['Time', 'Time_Vector']].head(5))

    Time      Time_Vector
0  17:18  [0, 0, 1, 0, 0]
1  00:00  [0, 0, 0, 0, 1]
2   6:30  [1, 0, 0, 0, 0]
3  00:00  [0, 0, 0, 0, 1]
4  18:30  [0, 0, 1, 0, 0]


## Step 3: Simplify Location

**1. Extract the country from the Location column (assuming the country is after the last comma).**

In [194]:
# Simplify Location to retain only the country
def extract_country(location):
    """Return the text after the last comma of a location string.

    Null or non-string input yields ``'Unknown'``. A string without any comma
    is returned whole (whitespace-trimmed).
    """
    unusable = pd.isnull(location) or not isinstance(location, str)
    if unusable:
        return 'Unknown'
    # Only the final comma-separated piece matters; discard everything before it.
    *_, last_piece = location.split(',')
    return last_piece.strip()

# Overwrite Location in place with just its trailing (country) component.
data['Location'] = data['Location'].map(extract_country)

# Define a function to assign regions based on the provided locations
# Aliases rewritten to one canonical spelling. They are applied as substring
# replacements AFTER title-casing, so both sides must be written in title case
# ('Usa', not 'USA') or the region lookup below can never match them.
_LOCATION_ALIASES = (
    ('United States Of America', 'Usa'),
    ('United States', 'Usa'),
    ('U.S.A.', 'Usa'),
    ('United Kingdom', 'Uk'),
    ('Ussr', 'Russia'),
    ('Democratic Republic Of The Congo', 'Congo'),
    ('Republic Of Djibouti', 'Djibouti'),
)

# Exact-match members of each region, in lookup order. Every entry is written
# in title case because it is compared against a title-cased location.
_REGION_MEMBERS = {
    'America': frozenset([
        'Usa', 'Canada', 'Mexico', 'Brazil', 'Argentina', 'Colombia', 'Venezuela', 'Puerto Rico', 'Chile', 'Ontario',
        'Cuba', 'Ecuador', 'Peru', 'Panama', 'Paraguay', 'Guyana', 'Suriname', 'French Guyana', 'Trinidad', 'Bolivia',
        'Uruguay', 'Gulf Of Mexico', 'South Dakota', 'California', 'Tennessee', 'Florida', 'New York', 'Mississippi',
    ]),
    'Europe': frozenset([
        'France', 'Germany', 'Uk', 'England', 'Spain', 'Italy', 'Netherlands', 'Belgium', 'Sweden', 'Switzerland',
        'Portugal', 'Greece', 'Poland', 'Norway', 'Denmark', 'Finland', 'Austria', 'Czech Republic', 'Hungary',
        'Bulgaria', 'Croatia', 'Romania', 'Ukraine', 'Slovenia', 'Latvia', 'Lithuania', 'Estonia', 'Ireland',
        'Luxembourg', 'Monaco', 'Iceland', 'Kosovo', 'Malta', 'Czechoslovakia', 'Wales', 'Scotland',
    ]),
    'Asia': frozenset([
        'China', 'India', 'Japan', 'South Korea', 'North Korea', 'Indonesia', 'Pakistan', 'Bangladesh', 'Vietnam',
        'Philippines', 'Thailand', 'Myanmar', 'Malaysia', 'Sri Lanka', 'Nepal', 'Cambodia', 'Laos', 'Singapore',
        'Afghanistan', 'Tajikistan', 'Kazakhstan', 'Kyrgyzstan', 'Turkmenistan', 'Armenia', 'Azerbaijan', 'Georgia',
        'Russia', 'Iran', 'Iraq', 'Syria', 'Jordan', 'Israel', 'Lebanon', 'Turkey', 'Saudi Arabia', 'Kuwait', 'Qatar',
        'Uae', 'Oman', 'Yemen',
    ]),
    'Africa': frozenset([
        # 'Morroco' is kept on purpose: it catches that misspelling in the data.
        'Morroco', 'South Africa', 'Egypt', 'Nigeria', 'Kenya', 'Ethiopia', 'Morocco', 'Algeria', 'Tunisia', 'Uganda',
        'Angola', 'Ghana', 'Cameroon', 'Mozambique', 'Zambia', 'Zimbabwe', 'Mali', 'Malawi', 'Senegal', 'Somalia',
        'Rwanda', 'Congo', 'Gabon', 'Liberia', 'Sierra Leone', 'Benin', 'Botswana', 'Mauritius', 'Namibia',
        'Burkina Faso', 'Niger', 'Togo', 'Central African Republic', 'Ivory Coast', 'Burundi', 'Gambia', 'Lesotho',
        # Target of the 'Republic Of Djibouti' alias; without it the alias led nowhere.
        'Djibouti',
    ]),
    'Oceania': frozenset([
        'Australia', 'New Zealand', 'Papua New Guinea', 'Fiji', 'Solomon Islands', 'Vanuatu', 'Samoa', 'Tonga',
        'Kiribati', 'Tuvalu', 'Nauru', 'Marshall Islands', 'Palau', 'Micronesia',
    ]),
}

# Substrings that mark a location as not attributable to a country (open water,
# "near"/"off" descriptions). Matched case-sensitively against the title-cased
# location, so 'Sea' matches the word "Sea" but not the middle of "Overseas".
_UNLOCATED_KEYWORDS = (
    'Ocean', 'Sea', 'Off', 'Near', 'Islands', 'Coast', 'Channel', 'Mediterranean', 'Atlantic', 'Indian Ocean',
    'Gulf Of Oman', 'Gulf Of Finland', 'Gulf Of Sirte', 'Gulf Of Mexico', 'Gulf Of Karkinitsky', 'Over',
    'Near Hong Kong', 'Off Gibraltar', 'Over The Pacific Ocean',
)


def map_location_to_region(location):
    """Map a location (normally a country name) to its region label.

    The location is trimmed, title-cased and passed through a small alias
    table, then looked up by exact match in the region tables (America,
    Europe, Asia, Africa, Oceania, in that order). If no region matches, the
    location is checked for "unlocated" keywords such as ``Ocean`` or ``Near``.

    Args:
        location: Location string; may be null or a non-string value.

    Returns:
        One of ``'America'``, ``'Europe'``, ``'Asia'``, ``'Africa'``,
        ``'Oceania'``, ``'Unknown'`` (null/non-string input or an unlocated
        keyword) or ``'Other'`` (nothing matched).
    """
    if pd.isnull(location) or not isinstance(location, str):
        return 'Unknown'

    # Normalize capitalization first; the alias and region tables rely on it.
    normalized = location.strip().title()
    for alias, canonical in _LOCATION_ALIASES:
        normalized = normalized.replace(alias, canonical)

    # Exact region membership wins over the keyword heuristic below, so e.g.
    # 'Ivory Coast' is Africa even though it contains 'Coast'.
    for region, members in _REGION_MEMBERS.items():
        if normalized in members:
            return region

    if any(keyword in normalized for keyword in _UNLOCATED_KEYWORDS):
        return 'Unknown'
    return 'Other'  # For unclassified locations

# Derive a coarse Region label from each (already simplified) Location.
data['Region'] = data['Location'].map(map_location_to_region)

# Step 1: Collect the distinct region labels. Their order here fixes the slot
# order of the one-hot vectors built from `unique_regions`; `unique()` reports
# values in order of first appearance, so the layout follows the data.
unique_regions = data['Region'].unique()
print(f"Unique Regions: {unique_regions}")

# Step 2: Function to encode region into a one-hot vector
def encode_region_to_vector(region):
    """One-hot encode a region against the module-level `unique_regions`.

    The result has one slot per entry of `unique_regions`; the slot of the
    matching entry is 1 and all others are 0. A region that is not present
    in `unique_regions` produces an all-zero vector.
    """
    labels = list(unique_regions)
    # Position of the hot slot, or None when the region is not a known label.
    hot_slot = labels.index(region) if region in unique_regions else None
    return [1 if slot == hot_slot else 0 for slot in range(len(unique_regions))]

# Step 3: Attach the one-hot encoding of each row's region.
data['Region_Vector'] = data['Region'].map(encode_region_to_vector)

# Show the label beside its vector for the first rows.
print(data[['Region', 'Region_Vector']].head(5))


Unique Regions: ['America' 'Europe' 'Unknown' 'Other' 'Oceania' 'Asia' 'Africa']
    Region          Region_Vector
0  America  [1, 0, 0, 0, 0, 0, 0]
1   Europe  [0, 1, 0, 0, 0, 0, 0]
2  America  [1, 0, 0, 0, 0, 0, 0]
3  America  [1, 0, 0, 0, 0, 0, 0]
4  Unknown  [0, 0, 1, 0, 0, 0, 0]


## Step 4: Categorize Operator

**1. Define the classify_operator function based on the provided categories.**<br>
**2. Apply it to the Operator column.**

In [195]:
# Keyword table for operator classification. Dictionary order is the matching
# priority: the first category with any keyword found in the operator name wins.
categories = {
    'Military': [
        'Military', 'Air Force', 'Navy', 'Army', 'Marine Corps', 'Defense', 'NATO',
    ],
    'Private': [
        'Private', 'Charter', 'Taxi', 'Club', 'Skydiving', 'Service', 'Inc.',
        'Helicopter', 'Helicopters', 'Company',
    ],
    'Commercial': [
        'Aerolift', 'Aviastar', 'Rwandair', 'Wizzair', 'Airlink', 'Azzi', 'Alaska',
        'Filair', 'Airlines', 'Airways', 'Lifeflight', 'Aéropro', 'Aeroflot',
        'Aeroplane', 'UTAir', 'Flydubai', 'Emirates', 'Aviation', 'Lineas', 'Air',
        'Lines', 'Aircraft', 'Transport', 'Cargo', 'Express', 'AirAsia',
        'Transasia', 'Airway', 'Airways',
    ],
    'Government': ['Government', 'Police', 'Border', 'State'],
    'Other': [
        'Historical', 'Research', 'Union', 'Society', 'Mission', 'Educational',
    ],
    'Unknown': [],  # No keywords: reached only through the fallback below
}


def classify_operator(operator):
    """Return the first category whose keyword occurs in the operator name.

    Matching is a case-insensitive substring test on ``str(operator)``, tried
    in the order the categories are declared. Operators that match nothing
    (including missing values) are labelled ``'Unknown'``.
    """
    operator_text = str(operator).lower()
    for label, keyword_list in categories.items():
        for keyword in keyword_list:
            if keyword.lower() in operator_text:
                return label
    return 'Unknown'

# Label every operator with its category.
data['Category'] = data['Operator'].map(classify_operator)

# Show the operator beside its assigned category for the first rows.
print(data[['Operator', 'Category']].head(5))

# Slot order of the operator one-hot vector.
operator_categories = [
    'Military',
    'Private',
    'Commercial',
    'Government',
    'Other',
    'Unknown',
]


def encode_operator_category(category):
    """One-hot encode an operator category.

    Returns a list with one slot per entry of `operator_categories`; the slot
    of the given category is 1. A category that is not listed produces an
    all-zero vector.
    """
    try:
        hot_slot = operator_categories.index(category)
    except ValueError:
        hot_slot = None  # Unlisted category: no slot is set
    return [1 if slot == hot_slot else 0 for slot in range(len(operator_categories))]


# Attach the one-hot encoding of each operator category.
data['Category_Vector'] = data['Category'].map(encode_operator_category)

# Show the category beside its vector for the first rows.
print(data[['Category', 'Category_Vector']].head(5))


                 Operator  Category
0    Military - U.S. Army  Military
1                     NaN   Unknown
2    Military - U.S. Navy  Military
3                 Private   Private
4  Military - German Navy  Military
   Category     Category_Vector
0  Military  [1, 0, 0, 0, 0, 0]
1   Unknown  [0, 0, 0, 0, 0, 1]
2  Military  [1, 0, 0, 0, 0, 0]
3   Private  [0, 1, 0, 0, 0, 0]
4  Military  [1, 0, 0, 0, 0, 0]


## Step 5: Calculate Survives

**1. Add the Survives column as 1 if there were survivors, otherwise 0.**

In [196]:
# Survives is 1 when more people were aboard than died, otherwise 0.
# NOTE(review): a row with a missing Aboard or Fatalities value compares as
# False and therefore also gets 0 -- confirm that this is intended.
data['Survives'] = data['Aboard'].sub(data['Fatalities']).gt(0).astype(int)

# Show the inputs beside the derived flag for the first rows.
print(data[['Aboard', 'Fatalities', 'Survives']].head(5))

   Aboard  Fatalities  Survives
0     2.0         1.0         1
1     1.0         1.0         0
2     5.0         5.0         0
3     1.0         1.0         0
4    20.0        14.0         1


##  Step 6: Simplify Summary

**1. Assign each Summary a single category, chosen by the first predefined keyword group that matches its text.**

In [201]:
# Keyword table for summary classification. Dictionary order is the matching
# priority, and keywords are matched as plain substrings of the lower-cased
# summary (so 'land' also matches "landing" or "island").
detailed_keywords = {
    'Visibility Issues': [
        'visibility', 'low-visibility', 'snowstorm', 'fog', 'cloud',
    ],
    'Weather-Related': [
        'weather', 'storm', 'thunderstorm', 'snowstorm', 'lightning',
    ],
    'Mechanical Problems': [
        'engine', 'fire', 'control', 'explosion', 'structural failure',
    ],
    'Emergency Landing': ['emergency landing', 'land', 'ditched'],
    'Human Factors': ['training flight', 'test', 'demonstration'],
    'Unknown Causes': [
        'unknown reasons', 'unknown circumstances', 'cause unknown',
    ],
    'First or Second War': ['shot down'],
}


def classify_summary(summary):
    """Return the first category whose keyword occurs in the summary text.

    A null summary is labelled ``'Unknown Causes'``; a summary that matches no
    keyword is labelled ``'Other'``.
    """
    if pd.isnull(summary):
        return 'Unknown Causes'
    text = summary.lower()
    matching_labels = (
        label
        for label, needles in detailed_keywords.items()
        if any(needle in text for needle in needles)
    )
    # First hit in declaration order, or the fallback label when nothing matches.
    return next(matching_labels, 'Other')

# Label every summary with its category.
data['Summary_Category'] = data['Summary'].map(classify_summary)

# Show summary and category side by side for every row except the last 40.
print(data[['Summary', 'Summary_Category']].iloc[:-40])

# Ordered category labels (the keys of `detailed_keywords`) that define the
# slots of the summary one-hot vector. This deliberately does NOT reuse the
# name `categories`: that name holds the operator keyword dictionary read by
# `classify_operator`, and rebinding it to a list here would break any re-run
# of the operator classification cell.
summary_categories = list(detailed_keywords.keys())

# Function to encode `Summary_Category` into a one-hot vector
def encode_summary_to_vector(summary_category):
    """Encode a summary category into a one-hot vector.

    Args:
        summary_category: Category label as produced by `classify_summary`.

    Returns:
        A list with one slot per entry of `summary_categories`; the slot of
        the given category is 1. Labels that are not in the list -- notably
        the ``'Other'`` fallback of `classify_summary` -- produce an all-zero
        vector.
    """
    vector = [0] * len(summary_categories)  # Initialize vector with zeros
    if summary_category in summary_categories:
        vector[summary_categories.index(summary_category)] = 1  # Set 1 at the category's slot
    return vector

# Attach the one-hot encoding of each summary category.
data['Summary_Vector'] = data['Summary_Category'].map(encode_summary_to_vector)

# Show the category beside its vector for the first rows.
print(data[['Summary_Category', 'Summary_Vector']].head(5))


                                                Summary     Summary_Category
0     During a demonstration flight, a U.S. Army fly...  Mechanical Problems
1     Eugene Lefebvre was the first pilot to ever be...  Mechanical Problems
2     First U.S. dirigible Akron exploded just offsh...        Human Factors
3     The first fatal airplane accident in Canada oc...      Weather-Related
4     The airship flew into a thunderstorm and encou...      Weather-Related
...                                                 ...                  ...
4953  Shortly after takeoff from runway 29, the left...  Mechanical Problems
4954  The vintage aircraft crashed onto Piz Segnas m...                Other
4955  The aircraft was approaching for a landing at ...    Emergency Landing
4956  The airliner crashed into the Jakarta Sea, 13 ...  Mechanical Problems
4957  After taking off and reaching FL200, the crew ...                Other

[4958 rows x 2 columns]
      Summary_Category         Summary_Vector
0  Me

## Step 7: Keep Required Columns

**1. Select only the necessary columns and include the Time_Vector.**

In [199]:
# Columns carried into the learning dataset. The *_Vector columns each hold a
# Python list (one-hot encoding) per row rather than expanded 0/1 columns.
columns_to_keep = [
    'Date',             # Crash date, unchanged from the source file
    'Region_Vector',    # One-hot region derived from Location
    'Aboard',           # Numeric: people aboard
    'Fatalities',       # Numeric: deaths
    'Survives',         # 1 if anyone aboard survived, else 0
    'Time_Vector',      # One-hot time of day
    'Category_Vector',  # One-hot operator category
    'Summary_Vector',   # One-hot summary category
]

# Restrict the working frame to exactly these columns, in this order.
final_data = data[columns_to_keep]

# Preview the result.
print(final_data.head(5))


        Date          Region_Vector  Aboard  Fatalities  Survives  \
0  9/17/1908  [1, 0, 0, 0, 0, 0, 0]     2.0         1.0         1   
1   9/7/1909  [0, 1, 0, 0, 0, 0, 0]     1.0         1.0         0   
2  7/12/1912  [1, 0, 0, 0, 0, 0, 0]     5.0         5.0         0   
3   8/6/1913  [1, 0, 0, 0, 0, 0, 0]     1.0         1.0         0   
4   9/9/1913  [0, 0, 1, 0, 0, 0, 0]    20.0        14.0         1   

       Time_Vector     Category_Vector         Summary_Vector  
0  [0, 0, 1, 0, 0]  [1, 0, 0, 0, 0, 0]  [0, 0, 1, 0, 0, 0, 0]  
1  [0, 0, 0, 0, 1]  [0, 0, 0, 0, 0, 1]  [0, 0, 1, 0, 0, 0, 0]  
2  [1, 0, 0, 0, 0]  [1, 0, 0, 0, 0, 0]  [0, 0, 0, 0, 1, 0, 0]  
3  [0, 0, 0, 0, 1]  [0, 1, 0, 0, 0, 0]  [0, 1, 0, 0, 0, 0, 0]  
4  [0, 0, 1, 0, 0]  [1, 0, 0, 0, 0, 0]  [0, 1, 0, 0, 0, 0, 0]  


## Step 8: Save to CSV

**1. Save the final prepared dataset to a new CSV file.**

In [200]:
# Write the prepared frame to disk without the index column; adjust the
# relative path if the output should go elsewhere.
# NOTE(review): the list-valued *_Vector columns are presumably written as
# their string form, so the consumer has to parse them back -- verify.
output_file = '../data/learning_data.csv'
final_data.to_csv(path_or_buf=output_file, index=False)
print(f"Prepared data saved to {output_file}")

Prepared data saved to ../data/learning_data.csv


# Data preparation is now complete, so we can move on to model training.
**You can find the model training in the file** ***model-training.ipynb***