# Import and Initialize

In [21]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
from fuzzywuzzy import fuzz

# Location of the raw crash dataset.
# NOTE(review): this is an absolute path on one developer's machine, so the
# notebook will not run elsewhere as-is. Consider pointing it at a path
# relative to the repository (e.g. a DATA_DIR constant) once the layout is confirmed.
file_path = '/Users/haydnjones/Documents/GitHub/flight-crashes/Plane Crashes.csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv(file_path)

# Cleaning

In [24]:
# Cleaning pipeline: normalise text and headers, convert the head-count columns
# to numbers, derive totals, and build `df_cleaned` with the analysis columns.
# Every step re-binds its result instead of mutating in place, so the cell can
# be re-run without raising.


def _lower_if_str(value):
    """Return `value` lower-cased when it is a string, otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value


# Lower-case every string cell. Series.map is applied column by column because
# DataFrame.applymap is deprecated since pandas 2.1.
df = df.apply(lambda column: column.map(_lower_if_str))

# Headers: lower-case, with spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Drop rows that are entirely empty, then mark remaining gaps as "unknown".
df = df.dropna(how='all').fillna("unknown")

# Head-count columns arrive as text; convert them to numbers.
# NOTE(review): an "unknown" count is treated as 0, which makes missing data
# indistinguishable from a real zero in the totals below - confirm this is intended.
# Any other non-numeric text becomes NaN via errors='coerce'.
count_columns = [
    'crew_on_board',
    'pax_on_board',
    'crew_fatalities',
    'pax_fatalities',
    'other_fatalities',
]
for column in count_columns:
    df[column] = pd.to_numeric(df[column].replace('unknown', '0.0'), errors='coerce')

df['total_on_board'] = df['crew_on_board'] + df['pax_on_board']
df['total_fatalities'] = df['crew_fatalities'] + df['pax_fatalities'] + df['other_fatalities']

# Columns (and their order) kept for analysis. The per-category counts and the
# identifier columns are not listed, so they are left out of df_cleaned.
order = [
    "date", "time", "aircraft", "operator", "flight_phase",
    "flight_type", "crash_site", "schedule", "crash_location",
    "country", "region", "circumstances", "crash_cause",
    "total_on_board", "total_fatalities", "survivors"
]

# Explicit copy so later column assignments on df_cleaned do not write into a
# slice of df.
df_cleaned = df[order].copy()

# Remove the identifier columns from the working frame as well.
# errors='ignore' keeps this step from raising KeyError when the cell is re-run
# after the columns have already been removed.
df = df.drop(columns=['msn', 'yom', 'flight_no.'], errors='ignore')

In [26]:
# Flight types kept for the analysis; every matching row is relabelled
# 'passenger flights' below.
# NOTE(review): several entries (cargo, spraying, fire fighting, ...) are not
# passenger services - confirm the list matches what the label is meant to cover.
passenger_types = [
    "scheduled revenue flight",
    "charter/taxi (non scheduled revenue flight)",
    "private",
    "executive/corporate/business",
    "ferry",  # the comma here matters: without it Python joins "ferry" and "cargo" into "ferrycargo"
    "cargo",
    "positioning",
    "ambulance",
    "topographic",
    "geographical / geophysical / scientific",
    "illegal (smuggling)",
    "spraying (agricultural)",
    "humanitarian",
    "meteorological / weather",
    "fire fighting"
]

# Keep rows whose flight type is in the list. Rows that already carry the
# 'passenger flights' label are kept too, so re-running this cell does not
# empty the frame.
is_selected = df_cleaned['flight_type'].isin(passenger_types + ['passenger flights'])

# .copy() gives an independent frame, so the assignment below does not write
# into a slice of the previous df_cleaned.
df_cleaned = df_cleaned.loc[is_selected].copy()
df_cleaned['flight_type'] = 'passenger flights'

In [28]:
# Display the filtered frame (a bare last expression is rendered as the cell's output).
df_cleaned

Unnamed: 0,date,time,aircraft,operator,flight_phase,flight_type,crash_site,schedule,crash_location,country,region,circumstances,crash_cause,total_on_board,total_fatalities,survivors
31,1919-03-23,unknown,unnamed aircraft,private french,flight,passenger flights,"plain, valley",unknown,bordeaux gironde,france,europe,"in flight, the crew encountered engine problem...",technical failure,2.0,0.0,yes
32,1919-04-07,unknown,unnamed aircraft,private french,landing (descent or approach),passenger flights,"plain, valley",unknown,france all france,france,europe,crashed in unknow circumstances somewhere in f...,technical failure,0.0,0.0,yes
45,1919-05-19,12h 0m 0s,farman f.60 goliath,private french,landing (descent or approach),passenger flights,"plain, valley",unknown,belgium all belgium,belgium,europe,unknown,unknown,0.0,0.0,unknown
49,1919-05-31,unknown,blackburn r.t.1 kangaroo,grahame-white aviation,takeoff (climb),passenger flights,airport (less than 10 km from airport),unknown,hendon middlesex,united kingdom,europe,"shortly after takeoff, during initial climb, t...",technical failure,3.0,0.0,yes
51,1919-06-15,8h 40m 0s,vickers fb.27 vimy commercial,vickers-armstrongs ltd,landing (descent or approach),passenger flights,"plain, valley",saint john's - clifden,clifden connacht,ireland,europe,british aviators john alcock and arthur whitte...,unknown,2.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28529,2022-05-11,unknown,de havilland dhc-6 twin otter,caverton helicopters,flight,passenger flights,"plain, valley",yaoundé – dompta – belabo,nanga eboko centre,cameroon,africa,the twin engine airplane departed yaoundé at 1...,unknown,11.0,11.0,no
28530,2022-05-12,8h 4m 0s,airbus a319,tibet airlines,takeoff (climb),passenger flights,airport (less than 10 km from airport),chongqing – nyingchi,chongqing-jiangbei sichuan,china,asia,the airplane was departing chongqing-jiangbei ...,unknown,122.0,0.0,yes
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,flight,passenger flights,"plain, valley",celaya - durango,durango durango,mexico,central america,while approaching durango airport on a flight ...,unknown,2.0,2.0,no
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,landing (descent or approach),passenger flights,airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,the single engine airplane departed yakutat on...,unknown,4.0,0.0,yes


# Weather Conditions Hypothesis

In [30]:
# Work on an independent copy: a plain assignment would only alias df_cleaned,
# so any column added or changed on df_weather would also change df_cleaned.
df_weather = df_cleaned.copy()

In [32]:
# Display the frame used for the weather analysis (rendered as the cell's output).
df_weather

Unnamed: 0,date,time,aircraft,operator,flight_phase,flight_type,crash_site,schedule,crash_location,country,region,circumstances,crash_cause,total_on_board,total_fatalities,survivors
31,1919-03-23,unknown,unnamed aircraft,private french,flight,passenger flights,"plain, valley",unknown,bordeaux gironde,france,europe,"in flight, the crew encountered engine problem...",technical failure,2.0,0.0,yes
32,1919-04-07,unknown,unnamed aircraft,private french,landing (descent or approach),passenger flights,"plain, valley",unknown,france all france,france,europe,crashed in unknow circumstances somewhere in f...,technical failure,0.0,0.0,yes
45,1919-05-19,12h 0m 0s,farman f.60 goliath,private french,landing (descent or approach),passenger flights,"plain, valley",unknown,belgium all belgium,belgium,europe,unknown,unknown,0.0,0.0,unknown
49,1919-05-31,unknown,blackburn r.t.1 kangaroo,grahame-white aviation,takeoff (climb),passenger flights,airport (less than 10 km from airport),unknown,hendon middlesex,united kingdom,europe,"shortly after takeoff, during initial climb, t...",technical failure,3.0,0.0,yes
51,1919-06-15,8h 40m 0s,vickers fb.27 vimy commercial,vickers-armstrongs ltd,landing (descent or approach),passenger flights,"plain, valley",saint john's - clifden,clifden connacht,ireland,europe,british aviators john alcock and arthur whitte...,unknown,2.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28529,2022-05-11,unknown,de havilland dhc-6 twin otter,caverton helicopters,flight,passenger flights,"plain, valley",yaoundé – dompta – belabo,nanga eboko centre,cameroon,africa,the twin engine airplane departed yaoundé at 1...,unknown,11.0,11.0,no
28530,2022-05-12,8h 4m 0s,airbus a319,tibet airlines,takeoff (climb),passenger flights,airport (less than 10 km from airport),chongqing – nyingchi,chongqing-jiangbei sichuan,china,asia,the airplane was departing chongqing-jiangbei ...,unknown,122.0,0.0,yes
28532,2022-05-23,18h 29m 0s,piper pa-61 aerostar (ted smith 601),raul ignacion posada,flight,passenger flights,"plain, valley",celaya - durango,durango durango,mexico,central america,while approaching durango airport on a flight ...,unknown,2.0,2.0,no
28533,2022-05-24,15h 40m 0s,de havilland dhc-3 otter,yakutat coastal airlines,landing (descent or approach),passenger flights,airport (less than 10 km from airport),yakutat – dry bay,dry bay alaska,united states of america,north america,the single engine airplane departed yakutat on...,unknown,4.0,0.0,yes
