In [219]:
# Third-party libraries used throughout the notebook.
import numpy as np
import pandas as pd

# Use 10 digits of floating-point display precision in pandas and NumPy output.
pd.set_option('display.precision', 10)
np.set_printoptions(precision=10)

# Load the raw accident records; the source file is semicolon-separated.
df_crashes = pd.read_csv("Airplane_crashes.csv", sep=';')

In [None]:
# Convert the date and time columns to proper date/time values.
# Dates are parsed day-first.
df_crashes["Date"] = pd.to_datetime(df_crashes["Date"], dayfirst=True)
# Keep only the first H:MM / HH:MM pattern found in each Time string, then parse
# it; values that cannot be parsed are coerced to missing instead of raising.
df_crashes["Time"] = df_crashes["Time"].str.extract(r'(\d{1,2}:\d{2})')[0]
df_crashes["Time"] = pd.to_datetime(df_crashes["Time"], format='%H:%M', errors='coerce').dt.time

# Keep only rows with at least one person aboard.
df_crashes = df_crashes[df_crashes['Aboard'] > 0]
# Drop rows where Aboard or Fatalities is missing.
df_crashes = df_crashes.dropna(subset=['Aboard', 'Fatalities'])
# reset_index returns a new frame, so the result must be assigned; otherwise the
# filtered frame keeps the gappy row labels of the original file.
df_crashes = df_crashes.reset_index(drop=True)

# Preview the cleaned data.
df_crashes.head()



In [221]:
# Derive calendar features from the accident date for later grouping:
# numeric month, numeric year and the weekday name.
crash_dates = df_crashes["Date"]
df_crashes["Month"] = crash_dates.dt.month
df_crashes["Year"] = crash_dates.dt.year
df_crashes["Weekday"] = crash_dates.dt.day_name()

In [222]:
# Per-accident fatality rate (fatalities divided by people aboard) and its
# complement, the survival rate.
fatality_rate = df_crashes["Fatalities"] / df_crashes["Aboard"]
df_crashes["Fatality_Rate"] = fatality_rate
df_crashes["Survival_Rate"] = 1 - fatality_rate

In [223]:
# Add a Category column describing accident severity, based on where each
# accident's survival rate falls relative to the 25th and 75th percentiles of
# all survival rates in the data.
survival_rates = df_crashes['Survival_Rate']
q_low = survival_rates.quantile(0.25)
q_high = survival_rates.quantile(0.75)


def _survival_category(rate):
    """Return the severity label for a single survival rate.

    The upper-quartile check runs first, so a rate that satisfies both bounds
    (possible when the two quartiles coincide) is labelled 'High Survival'.
    """
    if rate >= q_high:
        return 'High Survival'
    if rate <= q_low:
        return 'High Fatality'
    return 'Moderate'


df_crashes['Category'] = survival_rates.apply(_survival_category)


In [None]:
# Row classifier for the Impact_Category column: severity is derived from the
# number of fatalities together with the survival rate.
def classify_impact(row):
    """Return the impact label for one accident record.

    Parameters
    ----------
    row : mapping
        Must provide 'Fatalities' and 'Survival_Rate'.

    Returns
    -------
    str
        'All Survived'    -- no fatalities
        'Mass Fatality'   -- at least 100 fatalities and survival rate below 0.1
        'Severe Crash'    -- at least 30 fatalities and survival rate below 0.2
        'Minor Fatality'  -- some fatalities and survival rate of 0.8 or more
        'Moderate Impact' -- everything else
    """
    deaths = row['Fatalities']
    survival = row['Survival_Rate']

    # Zero fatalities can never match a fatality-based tier, so settle it first.
    if deaths == 0:
        return 'All Survived'

    # (minimum fatalities, exclusive survival-rate ceiling, label), checked from
    # the most severe tier downwards.
    severe_tiers = (
        (100, 0.1, 'Mass Fatality'),
        (30, 0.2, 'Severe Crash'),
    )
    for min_deaths, survival_ceiling, label in severe_tiers:
        if deaths >= min_deaths and survival < survival_ceiling:
            return label

    if deaths > 0 and survival >= 0.8:
        return 'Minor Fatality'
    return 'Moderate Impact'

# Classify every accident row by row (axis=1 passes each row to the function).
df_crashes['Impact_Category'] = df_crashes.apply(classify_impact, axis=1)
# reset_index returns a new frame, so the result must be assigned for the
# renumbering (0..n-1) to take effect on df_crashes.
df_crashes = df_crashes.reset_index(drop=True)

# Preview the enriched data.
df_crashes.head()

In [225]:
# Build a per-year summary: number of accidents, total people aboard, total
# fatalities, and the fatality / survival rates derived from those totals.
annual = (
    df_crashes
    .groupby('Year')
    .agg(
        Num_Accidents=('Date', 'count'),  # number of accidents
        Aboard=('Aboard', 'sum'),
        Fatalities=('Fatalities', 'sum'),
    )
    .assign(
        Fatality_Rate=lambda yearly: yearly['Fatalities'] / yearly['Aboard'],
        Survival_Rate=lambda yearly: 1 - yearly['Fatality_Rate'],
    )
)


In [None]:
def _zscore(values):
    """Standardise values: distance from the mean in units of standard deviation."""
    return (values - values.mean()) / values.std()


accidents_per_year = annual['Num_Accidents']

# Accidents_Rolling: mean of the current row and the four rows before it (the
# first four rows have no full window and stay NaN).
# NOTE(review): this is a 5-year average only if no year is missing from `annual`.
annual['Accidents_Rolling'] = accidents_per_year.rolling(window=5).mean()

# Change: difference in the number of accidents from the previous row (year).
yearly_change = accidents_per_year.diff()
annual['Change'] = yearly_change

# Change_Z / Num_Accidents_Z: z-scores of the year-to-year change and of the
# accident count itself.
annual['Change_Z'] = _zscore(yearly_change)
annual['Num_Accidents_Z'] = _zscore(accidents_per_year)

# Display only: the result is not assigned, so `annual` keeps Year as its index.
annual.reset_index()




In [227]:
# Persist both result sets as CSV. Each frame's index is written as the first
# column (for `annual` that is the Year index).
df_crashes.to_csv(path_or_buf='crashes.csv')
annual.to_csv(path_or_buf='annual.csv')