# Import and Initialize

In [45]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
from fuzzywuzzy import fuzz

# NOTE(review): absolute, machine-specific path; point this at your local copy
# of the CSV before running the notebook on another machine.
file_path = "/Users/peterwilliams/Downloads/PlaneCrashes/PlaneCrashes.csv"

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv(file_path)

# Cleaning

In [47]:
# Lowercase every string cell; non-string values pass through untouched.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases.
df = df.apply(lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x))

# Normalise the headers to lower snake_case.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop rows that are empty in every column, then label the remaining gaps.
df = df.dropna(how='all').fillna("unknown")

# The head-count columns must be numeric so they can be summed later.
# NOTE: an 'unknown' count is treated as 0, so totals built from these columns
# are lower bounds; any other non-numeric text becomes NaN (errors='coerce').
count_columns = [
    'crew_on_board',
    'pax_on_board',
    'crew_fatalities',
    'pax_fatalities',
    'other_fatalities',
]
for count_column in count_columns:
    df[count_column] = pd.to_numeric(
        df[count_column].replace('unknown', '0.0'), errors='coerce'
    )

In [48]:
# Remove three columns that the visible part of the notebook never reads again.
unused_columns = ['msn', 'yom', 'flight_no.']
df = df.drop(columns=unused_columns)

In [49]:
# Add two summary columns:
#   total_on_board   = crew + passengers
#   total_fatalities = crew + passenger + 'other' fatalities
# Plain `+` is used on purpose: a NaN in any operand makes the total NaN.
df = df.assign(
    total_on_board=lambda frame: frame['crew_on_board'] + frame['pax_on_board'],
    total_fatalities=lambda frame: (
        frame['crew_fatalities'] + frame['pax_fatalities'] + frame['other_fatalities']
    ),
)

In [50]:
# Build the analysis frame: same rows as df, minus the five per-group count
# columns. df itself is left unchanged.
per_group_counts = [
    'crew_on_board',
    'crew_fatalities',
    'pax_on_board',
    'pax_fatalities',
    'other_fatalities',
]
df_cleaned = df.drop(columns=per_group_counts)

In [51]:
# Display df_cleaned right after the per-group count columns were dropped
# (pandas abbreviates the middle rows with "...").
df_cleaned

Unnamed: 0,date,time,aircraft,operator,registration,flight_phase,flight_type,survivors,crash_site,schedule,crash_location,country,region,total_fatalities,circumstances,crash_cause,total_on_board
0,1918-05-02,unknown,de havilland dh.4,united states signal corps - ussc,as-32084,takeoff (climb),test,no,airport (less than 10 km from airport),dayton - dayton,dayton-mccook field ohio,united states of america,north america,2.0,the single engine airplane departed dayton-mcc...,technical failure,2.0
1,1918-06-08,unknown,handley page v/1500,handley page aircraft company ltd,e4104,takeoff (climb),test,yes,airport (less than 10 km from airport),cricklewood - cricklewood,cricklewood london metropolis,united kingdom,europe,5.0,"assembled at cricklewood airfield in may 1918,...",technical failure,6.0
2,1918-06-11,unknown,avro 504,royal air force - raf,a8544,flight,training,yes,"plain, valley",abukir - abukir,abukir (abu qir) alexandria,egypt,africa,1.0,the single engine aircraft was completing a lo...,unknown,2.0
3,1918-06-19,unknown,de havilland dh.4,united states signal corps - ussc,as-32098,flight,military,no,airport (less than 10 km from airport),wright patterson afb-wright patterson afb,wright-patterson afb (dayton) ohio,united states of america,north america,1.0,"lt. frank stuart patterson, son and nephew of ...",technical failure,1.0
4,1918-06-24,unknown,breguet 14,french air force - armée de l'air,as-4130,landing (descent or approach),military,yes,unknown,unknown,france all france,france,europe,0.0,the aircraft crashed iupon landing somewhere i...,unknown,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28531,2022-05-20,unknown,embraer emb-110 bandeirante,sales serviços aéreos,pt-shn,landing (descent or approach),cargo,yes,airport (less than 10 km from airport),jundiaí – eldorado do sul,eldorado do sul rio grande do sul,brazil,south america,0.0,the airplane departed jundiaí on a cargo fligh...,unknown,2.0
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,n66cg,flight,private,no,"plain, valley",celaya - durango,durango durango,mexico,central america,2.0,while approaching durango airport on a flight ...,unknown,2.0
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,n703th,landing (descent or approach),charter/taxi (non scheduled revenue flight),yes,airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,0.0,the single engine airplane departed yakutat on...,unknown,4.0
28534,2022-05-29,10h 7m 0s,de havilland dhc-6 twin otter,tara air,9n-aet,flight,scheduled revenue flight,no,mountains,pokhara – jomsom,shikha dhawalagiri,nepal,asia,22.0,the twin engine airplane departed pokhara city...,human factor,22.0


In [52]:
# Column layout used to rearrange df_cleaned, one name per line.
order = [
    "date",
    "time",
    "aircraft",
    "operator",
    "flight_phase",
    "flight_type",
    "crash_site",
    "schedule",
    "crash_location",
    "country",
    "region",
    "circumstances",
    "crash_cause",
    "total_on_board",
    "total_fatalities",
    "survivors",
]

In [53]:
# Keep only the columns named in `order`, in that sequence; a name missing
# from the frame raises KeyError.
df_cleaned = df_cleaned.loc[:, order]

In [54]:
# Display df_cleaned again to check the column order after the reindexing by `order`.
df_cleaned

Unnamed: 0,date,time,aircraft,operator,flight_phase,flight_type,crash_site,schedule,crash_location,country,region,circumstances,crash_cause,total_on_board,total_fatalities,survivors
0,1918-05-02,unknown,de havilland dh.4,united states signal corps - ussc,takeoff (climb),test,airport (less than 10 km from airport),dayton - dayton,dayton-mccook field ohio,united states of america,north america,the single engine airplane departed dayton-mcc...,technical failure,2.0,2.0,no
1,1918-06-08,unknown,handley page v/1500,handley page aircraft company ltd,takeoff (climb),test,airport (less than 10 km from airport),cricklewood - cricklewood,cricklewood london metropolis,united kingdom,europe,"assembled at cricklewood airfield in may 1918,...",technical failure,6.0,5.0,yes
2,1918-06-11,unknown,avro 504,royal air force - raf,flight,training,"plain, valley",abukir - abukir,abukir (abu qir) alexandria,egypt,africa,the single engine aircraft was completing a lo...,unknown,2.0,1.0,yes
3,1918-06-19,unknown,de havilland dh.4,united states signal corps - ussc,flight,military,airport (less than 10 km from airport),wright patterson afb-wright patterson afb,wright-patterson afb (dayton) ohio,united states of america,north america,"lt. frank stuart patterson, son and nephew of ...",technical failure,1.0,1.0,no
4,1918-06-24,unknown,breguet 14,french air force - armée de l'air,landing (descent or approach),military,unknown,unknown,france all france,france,europe,the aircraft crashed iupon landing somewhere i...,unknown,0.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28531,2022-05-20,unknown,embraer emb-110 bandeirante,sales serviços aéreos,landing (descent or approach),cargo,airport (less than 10 km from airport),jundiaí – eldorado do sul,eldorado do sul rio grande do sul,brazil,south america,the airplane departed jundiaí on a cargo fligh...,unknown,2.0,0.0,yes
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,flight,private,"plain, valley",celaya - durango,durango durango,mexico,central america,while approaching durango airport on a flight ...,unknown,2.0,2.0,no
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,landing (descent or approach),charter/taxi (non scheduled revenue flight),airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,the single engine airplane departed yakutat on...,unknown,4.0,0.0,yes
28534,2022-05-29,10h 7m 0s,de havilland dhc-6 twin otter,tara air,flight,scheduled revenue flight,mountains,pokhara – jomsom,shikha dhawalagiri,nepal,asia,the twin engine airplane departed pokhara city...,human factor,22.0,22.0,no


# Certain routes have a higher risk factor of fatalities

In [56]:
# Display df_cleaned at the start of the route section.
# NOTE(review): the frame is not modified between the previous display and this
# one, so this cell repeats the same output.
df_cleaned

Unnamed: 0,date,time,aircraft,operator,flight_phase,flight_type,crash_site,schedule,crash_location,country,region,circumstances,crash_cause,total_on_board,total_fatalities,survivors
0,1918-05-02,unknown,de havilland dh.4,united states signal corps - ussc,takeoff (climb),test,airport (less than 10 km from airport),dayton - dayton,dayton-mccook field ohio,united states of america,north america,the single engine airplane departed dayton-mcc...,technical failure,2.0,2.0,no
1,1918-06-08,unknown,handley page v/1500,handley page aircraft company ltd,takeoff (climb),test,airport (less than 10 km from airport),cricklewood - cricklewood,cricklewood london metropolis,united kingdom,europe,"assembled at cricklewood airfield in may 1918,...",technical failure,6.0,5.0,yes
2,1918-06-11,unknown,avro 504,royal air force - raf,flight,training,"plain, valley",abukir - abukir,abukir (abu qir) alexandria,egypt,africa,the single engine aircraft was completing a lo...,unknown,2.0,1.0,yes
3,1918-06-19,unknown,de havilland dh.4,united states signal corps - ussc,flight,military,airport (less than 10 km from airport),wright patterson afb-wright patterson afb,wright-patterson afb (dayton) ohio,united states of america,north america,"lt. frank stuart patterson, son and nephew of ...",technical failure,1.0,1.0,no
4,1918-06-24,unknown,breguet 14,french air force - armée de l'air,landing (descent or approach),military,unknown,unknown,france all france,france,europe,the aircraft crashed iupon landing somewhere i...,unknown,0.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28531,2022-05-20,unknown,embraer emb-110 bandeirante,sales serviços aéreos,landing (descent or approach),cargo,airport (less than 10 km from airport),jundiaí – eldorado do sul,eldorado do sul rio grande do sul,brazil,south america,the airplane departed jundiaí on a cargo fligh...,unknown,2.0,0.0,yes
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,flight,private,"plain, valley",celaya - durango,durango durango,mexico,central america,while approaching durango airport on a flight ...,unknown,2.0,2.0,no
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,landing (descent or approach),charter/taxi (non scheduled revenue flight),airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,the single engine airplane departed yakutat on...,unknown,4.0,0.0,yes
28534,2022-05-29,10h 7m 0s,de havilland dhc-6 twin otter,tara air,flight,scheduled revenue flight,mountains,pokhara – jomsom,shikha dhawalagiri,nepal,asia,the twin engine airplane departed pokhara city...,human factor,22.0,22.0,no


In [78]:
# List the distinct labels found in the flight_type column.
df_cleaned.loc[:, "flight_type"].unique()

array(['test', 'training', 'military', 'delivery',
       'survey / patrol / reconnaissance', 'postal (mail)', 'ferry',
       'aerial photography', 'scheduled revenue flight', 'government',
       'private', 'charter/taxi (non scheduled revenue flight)',
       'unknown', 'cargo', 'positioning', 'demonstration', 'ambulance',
       'executive/corporate/business', 'topographic',
       'geographical / geophysical / scientific', 'cinematography',
       'illegal (smuggling)', 'spraying (agricultural)', 'refuelling',
       'humanitarian', 'bombing', 'supply', 'calibration',
       'meteorological / weather', 'fire fighting',
       'skydiving / paratroopers', 'aerobatic'], dtype=object)

In [80]:
# Number of distinct values in the schedule column (NaN excluded, which is
# the pandas default). The 'unknown' placeholder counts as one value.
df_cleaned["schedule"].nunique(dropna=True)

15657

In [None]:
# Rank the crashes from most to fewest fatalities.
# Every column name is lowercased during cleaning, so a 'Score' column cannot
# exist and sorting on it raises KeyError; sort on 'total_fatalities' instead.
# NOTE(review): if a per-route risk score was intended, compute that column
# first and sort on it here.
df_sorted = df.sort_values(by='total_fatalities', ascending=False)

# Show only the top rows with rich display rather than printing the whole frame.
df_sorted.head(10)

In [None]:
# TODO: count the number of routes