In [1]:
import pandas as pd
pd.set_option('display.precision', 10)
import numpy as np
np.set_printoptions(precision=10)
df_crashes=pd.read_csv("Airplane_crashes.csv", delimiter=';')

In [2]:
# Adjusting date columns to the correct data format
df_crashes["Date"]=pd.to_datetime(df_crashes["Date"],dayfirst=True)
df_crashes["Time"]=df_crashes["Time"].str.extract(r'(\d{1,2}:\d{2})')[0]
df_crashes["Time"]=pd.to_datetime(df_crashes["Time"],format='%H:%M', errors='coerce').dt.time

# Selecting rows where the number of people aboard is greater than 0
df_crashes = df_crashes[df_crashes['Aboard'] > 0]

# Selecting rows where both Aboard and Fatalities are not null
df_crashes = df_crashes.dropna(subset=['Aboard', 'Fatalities'])

df_crashes.reset_index(drop=True)



Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary
0,1908-09-17,17:18:00,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1,2.0,1.0,0.0,"During a demonstration flight, a U.S. Army fly..."
1,1912-07-12,06:30:00,"AtlantiCity, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...
2,1913-08-06,NaT,"Victoria, British Columbia, Canada",Private,-,,Curtiss seaplane,,,1.0,1.0,0.0,The first fatal airplane accident in Canada oc...
3,1913-09-09,18:30:00,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,14.0,0.0,The airship flew into a thunderstorm and encou...
4,1913-10-17,10:30:00,"Near Johannisthal, Germany",Military - German Navy,,,Zeppelin L-2 (airship),,,30.0,30.0,0.0,Hydrogen gas which was being vented was sucked...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,2009-05-20,06:30:00,"Near Madiun, Indonesia",Military - Indonesian Air Force,,Jakarta - Maduin,Lockheed C-130 Hercules,A-1325,1982,112.0,98.0,2.0,"While on approach, the military transport cras..."
5240,2009-05-26,NaT,"Near Isiro, DemocratiRepubliCongo",Service Air,,Goma - Isiro,Antonov An-26,9Q-CSA,5005,4.0,4.0,,The cargo plane crashed while on approach to I...
5241,2009-06-01,00:15:00,"AtlantiOcean, 570 miles northeast of Natal, Br...",Air France,447,Rio de Janeiro - Paris,Airbus A330-203,F-GZCP,660,228.0,228.0,0.0,The Airbus went missing over the AtlantiOcean ...
5242,2009-06-07,08:30:00,"Near Port Hope Simpson, Newfoundland, Canada",Strait Air,,Lourdes de BlanSablon - Port Hope Simpson,Britten-Norman BN-2A-27 Islander,C-FJJR,424,1.0,1.0,0.0,The air ambulance crashed into hills while att...


In [3]:
# Adding additional date columns for further calculations
df_crashes["Month"] = df_crashes["Date"].dt.month
df_crashes["Year"] = df_crashes["Date"].dt.year
df_crashes['Weekday'] = df_crashes['Date'].dt.day_name() 

In [4]:
# Adding additional columns with fatality and survival rates
df_crashes["Fatality_Rate"] = df_crashes["Fatalities"] / df_crashes["Aboard"]
df_crashes["Survival_Rate"] = 1 - df_crashes["Fatality_Rate"]

In [5]:
# Adding a 'Category' column that defines the severity of an accident (based on survival rate quantiles)
q_low = df_crashes['Survival_Rate'].quantile(0.25)
q_high = df_crashes['Survival_Rate'].quantile(0.75)

df_crashes['Category'] = df_crashes['Survival_Rate'].apply(
    lambda x: 'High Survival' if x >= q_high else ('High Fatality' if x <= q_low else 'Moderate')
)


In [6]:
# Adding an 'Impact Category' column that defines the severity of an accident (based on survival rate and number of fatalities)
def classify_impact(row):
    if row['Fatalities'] >= 100 and row['Survival_Rate'] < 0.1:
        return 'Mass Fatality'
    elif row['Fatalities'] >= 30 and row['Survival_Rate'] < 0.2:
        return 'Severe Crash'
    elif row['Fatalities'] > 0 and row['Survival_Rate'] >= 0.8:
        return 'Minor Fatality'
    elif row['Fatalities'] == 0:
        return 'All Survived'
    else:
        return 'Moderate Impact'

df_crashes['Impact_Category'] = df_crashes.apply(classify_impact, axis=1)
df_crashes.reset_index(drop=True)

Unnamed: 0,Date,Time,Location,Operator,Flight #,Route,Type,Registration,cn/In,Aboard,Fatalities,Ground,Summary,Month,Year,Weekday,Fatality_Rate,Survival_Rate,Category,Impact_Category
0,1908-09-17,17:18:00,"Fort Myer, Virginia",Military - U.S. Army,,Demonstration,Wright Flyer III,,1,2.0,1.0,0.0,"During a demonstration flight, a U.S. Army fly...",9,1908,Thursday,0.500,0.500,High Survival,Moderate Impact
1,1912-07-12,06:30:00,"AtlantiCity, New Jersey",Military - U.S. Navy,,Test flight,Dirigible,,,5.0,5.0,0.0,First U.S. dirigible Akron exploded just offsh...,7,1912,Friday,1.000,0.000,High Fatality,Moderate Impact
2,1913-08-06,NaT,"Victoria, British Columbia, Canada",Private,-,,Curtiss seaplane,,,1.0,1.0,0.0,The first fatal airplane accident in Canada oc...,8,1913,Wednesday,1.000,0.000,High Fatality,Moderate Impact
3,1913-09-09,18:30:00,Over the North Sea,Military - German Navy,,,Zeppelin L-1 (airship),,,20.0,14.0,0.0,The airship flew into a thunderstorm and encou...,9,1913,Tuesday,0.700,0.300,High Survival,Moderate Impact
4,1913-10-17,10:30:00,"Near Johannisthal, Germany",Military - German Navy,,,Zeppelin L-2 (airship),,,30.0,30.0,0.0,Hydrogen gas which was being vented was sucked...,10,1913,Friday,1.000,0.000,High Fatality,Severe Crash
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,2009-05-20,06:30:00,"Near Madiun, Indonesia",Military - Indonesian Air Force,,Jakarta - Maduin,Lockheed C-130 Hercules,A-1325,1982,112.0,98.0,2.0,"While on approach, the military transport cras...",5,2009,Wednesday,0.875,0.125,Moderate,Severe Crash
5240,2009-05-26,NaT,"Near Isiro, DemocratiRepubliCongo",Service Air,,Goma - Isiro,Antonov An-26,9Q-CSA,5005,4.0,4.0,,The cargo plane crashed while on approach to I...,5,2009,Tuesday,1.000,0.000,High Fatality,Moderate Impact
5241,2009-06-01,00:15:00,"AtlantiOcean, 570 miles northeast of Natal, Br...",Air France,447,Rio de Janeiro - Paris,Airbus A330-203,F-GZCP,660,228.0,228.0,0.0,The Airbus went missing over the AtlantiOcean ...,6,2009,Monday,1.000,0.000,High Fatality,Mass Fatality
5242,2009-06-07,08:30:00,"Near Port Hope Simpson, Newfoundland, Canada",Strait Air,,Lourdes de BlanSablon - Port Hope Simpson,Britten-Norman BN-2A-27 Islander,C-FJJR,424,1.0,1.0,0.0,The air ambulance crashed into hills while att...,6,2009,Sunday,1.000,0.000,High Fatality,Moderate Impact


In [7]:
# Creating a new dataset with data aggregated by year of the accident
annual = df_crashes.groupby('Year').agg({
    'Date': 'count',  # number of accidents
    'Aboard': 'sum',
    'Fatalities': 'sum'
}).rename(columns={'Date': 'Num_Accidents'})
annual['Fatality_Rate'] = annual['Fatalities'] / annual['Aboard']
annual['Survival_Rate'] = 1 - annual['Fatality_Rate']


In [8]:
# Adding a new column 'Accidents_Rolling' with the 5-year rolling average
annual['Accidents_Rolling'] = annual['Num_Accidents'].rolling(window=5).mean()

# Adding a new column 'Change' with the year-to-year difference in the number of accidents
annual['Change'] = annual['Num_Accidents'].diff()

# Adding a new column 'Change_Z' with the Z-score of the 'Change' column (how far each value is from the mean, in terms of standard deviations)
annual['Change_Z'] = (annual['Change'] - annual['Change'].mean()) / annual['Change'].std()

# Adding of a new column Num_Accidents_Z with Z-score calculation of the column Num_Accidents (how far a value is from the mean, in terms of standard deviations)
annual['Num_Accidents_Z']=(annual['Num_Accidents'] - annual['Num_Accidents'].mean()) / annual['Num_Accidents'].std()

annual.reset_index()




Unnamed: 0,Year,Num_Accidents,Aboard,Fatalities,Fatality_Rate,Survival_Rate,Accidents_Rolling,Change,Change_Z,Num_Accidents_Z
0,1908,1,2.0,1.0,0.5000000000,0.5000000000,,,,-1.9177378140
1,1912,1,5.0,5.0,1.0000000000,0.0000000000,,0.0,-0.0207088356,-1.9177378140
2,1913,3,51.0,45.0,0.8823529412,0.1176470588,,2.0,0.1539656909,-1.8446953322
3,1915,2,60.0,40.0,0.6666666667,0.3333333333,,-1.0,-0.1080460989,-1.8812165731
4,1916,5,109.0,108.0,0.9908256881,0.0091743119,2.4,3.0,0.2413029542,-1.7716528503
...,...,...,...,...,...,...,...,...,...,...
93,2005,51,2164.0,1306.0,0.6035120148,0.3964879852,63.6,-10.0,-0.8940814683,-0.0916757680
94,2006,49,1413.0,1136.0,0.8039631989,0.1960368011,59.4,-2.0,-0.1953833622,-0.1647182499
95,2007,54,1364.0,931.0,0.6825513196,0.3174486804,55.2,5.0,0.4159774807,0.0178879547
96,2008,62,1463.0,820.0,0.5604921394,0.4395078606,55.4,8.0,0.6779892705,0.3100578821


In [9]:
df_crashes.to_csv('crashes.csv')
annual.to_csv('annual.csv')