# Import and Initialize

In [625]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
from fuzzywuzzy import fuzz

# Location of the raw crash dataset.
# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook has to edit it (a path relative to the repository would be more portable).
file_path = '/Users/haydnjones/Documents/GitHub/flight-crashes/Plane Crashes.csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
df = pd.read_csv(file_path)

# Cleaning

In [628]:
# --- Text normalisation -------------------------------------------------------
# Lower-case every string cell; non-string values pass through unchanged.
# A column-wise Series.map is used instead of DataFrame.applymap, which newer
# pandas releases deprecate in favour of DataFrame.map.
df = df.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)

# Column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop rows that are entirely empty, then mark every remaining gap as "unknown".
df = df.dropna(how='all').fillna("unknown")

# --- Numeric head-count columns ----------------------------------------------
# The fill above puts the string "unknown" into these columns, so each one is
# converted back to numbers here.
# NOTE(review): "unknown" is mapped to 0.0, i.e. a missing count is treated as
# zero in the totals below; any other unparseable value becomes NaN via
# errors='coerce'.
count_columns = [
    'crew_on_board',
    'pax_on_board',
    'crew_fatalities',
    'pax_fatalities',
    'other_fatalities',
]
for count_column in count_columns:
    df[count_column] = pd.to_numeric(
        df[count_column].replace('unknown', '0.0'), errors='coerce'
    )

# --- Derived totals -----------------------------------------------------------
# Plain "+" is kept on purpose: a NaN in any component propagates to the total.
df['total_on_board'] = df['crew_on_board'] + df['pax_on_board']
df['total_fatalities'] = (
    df['crew_fatalities'] + df['pax_fatalities'] + df['other_fatalities']
)

# --- Analysis frame -----------------------------------------------------------
# Columns (and their order) carried into the cleaned frame. Selecting them
# directly also leaves out the per-group count columns and msn / yom / flight_no.,
# so no separate drop calls are needed.
order = [
    "date", "time", "aircraft", "operator", "flight_phase",
    "flight_type", "crash_site", "schedule", "crash_location",
    "country", "region", "circumstances", "crash_cause",
    "total_on_board", "total_fatalities", "survivors"
]

# .copy() gives an independent frame, so later column assignments on df_cleaned
# do not write to a slice of df.
df_cleaned = df[order].copy()

In [630]:
# Flight types that are kept; every matching row is relabelled with one common
# category below. Each entry needs its own trailing comma: without one, Python
# joins adjacent string literals (e.g. "ferry" "cargo" -> "ferrycargo") and
# neither value would match.
passenger_types = [
    "scheduled revenue flight",
    "charter/taxi (non scheduled revenue flight)",
    "private",
    "executive/corporate/business",
    "ferry",
    "cargo",
    "positioning",
    "ambulance",
    "topographic",
    "geographical / geophysical / scientific",
    "illegal (smuggling)",
    "spraying (agricultural)",
    "humanitarian",
    "meteorological / weather",
    "fire fighting",
]

# Keep only rows whose flight type is in the list and collapse them to the single
# label 'passenger flights'; all other rows are discarded.
# NOTE: re-running this cell on its own output removes every row, because the
# collapsed label is not itself a member of passenger_types.
is_listed_type = df_cleaned['flight_type'].isin(passenger_types)
df_cleaned = df_cleaned.loc[is_listed_type].assign(flight_type='passenger flights')

In [632]:
# Earliest calendar year (inclusive) kept in the cleaned frame.
threshold_year = 1950

# Parse the date column once, store the parsed values back in the frame, and keep
# only the rows whose year is at or after the threshold.
parsed_dates = pd.to_datetime(df_cleaned['date'])
on_or_after_threshold = parsed_dates.dt.year >= threshold_year
df_cleaned = df_cleaned.assign(date=parsed_dates).loc[on_or_after_threshold]

In [634]:
# Show the cleaned frame as the cell output (the rendered table elides the middle rows).
df_cleaned

Unnamed: 0,date,time,aircraft,operator,flight_phase,flight_type,crash_site,schedule,crash_location,country,region,circumstances,crash_cause,total_on_board,total_fatalities,survivors
9071,1950-01-03,unknown,avro 652 anson,united air services - tanzania,landing (descent or approach),passenger flights,"plain, valley",unknown,kazimzumbwi pwani region,tanzania,africa,"while approaching dar es-salaam, the twin engi...",unknown,2.0,2.0,no
9075,1950-01-07,unknown,boeing 247,lineas aéreas guerrero oaxaca - lagosa,takeoff (climb),passenger flights,"plain, valley",unknown,mexico city federal district of mexico city,mexico,central america,"few minutes after takeoff from mexico city, th...",unknown,0.0,0.0,unknown
9081,1950-01-14,unknown,de havilland dh.89 dragon rapide,new zealand national airways,parking,passenger flights,airport (less than 10 km from airport),rotorua – hamilton,rotorua bay of plenty regional council,new zealand,oceania,the aircraft was parked at rotorua airport and...,technical failure,5.0,0.0,yes
9082,1950-01-18,unknown,douglas dc-3,trans asiatic airlines - taa,takeoff (climb),passenger flights,airport (less than 10 km from airport),unknown,yangon yangon region,myanmar,asia,an undercarriage failed during takeoff roll. t...,technical failure,0.0,0.0,yes
9086,1950-01-22,unknown,de havilland dh.104 dove,central african airways - caa,landing (descent or approach),passenger flights,airport (less than 10 km from airport),unknown,livingstone southern,zambia,africa,crashed on final approach to livingstone airpo...,unknown,0.0,0.0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28529,2022-05-11,unknown,de havilland dhc-6 twin otter,caverton helicopters,flight,passenger flights,"plain, valley",yaoundé – dompta – belabo,nanga eboko centre,cameroon,africa,the twin engine airplane departed yaoundé at 1...,unknown,11.0,11.0,no
28530,2022-05-12,8h 4m 0s,airbus a319,tibet airlines,takeoff (climb),passenger flights,airport (less than 10 km from airport),chongqing – nyingchi,chongqing-jiangbei sichuan,china,asia,the airplane was departing chongqing-jiangbei ...,unknown,122.0,0.0,yes
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,flight,passenger flights,"plain, valley",celaya - durango,durango durango,mexico,central america,while approaching durango airport on a flight ...,unknown,2.0,2.0,no
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,landing (descent or approach),passenger flights,airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,the single engine airplane departed yakutat on...,unknown,4.0,0.0,yes
