Data source: https://www.planecrashinfo.com/index.html

In [3]:
import os
import sys
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [31]:
def get_html_source(url, timeout=30):
    """Download a page and return its HTML text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely, which
        would stall a whole scraping loop on one unresponsive page.

    Returns
    -------
    str or None
        The response body when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Same "no page" signal as a non-200 answer, so callers need a
        # single None check only.
        return None
    if response.status_code == 200:
        return response.text
    return None
    
def count_word_appearance(text, word):
    """Return the number of non-overlapping occurrences of ``word`` in ``text``.

    A ``text`` of ``None`` is treated as containing no occurrences.
    """
    return 0 if text is None else text.count(word)

def get_attribute(html):
    """Parse one detail page into a ``{label: value}`` dict.

    The first ``<table>`` of the page is read row by row: the first cell of
    a row is the field label, the second cell its value. Labels have
    newlines and spaces removed but keep their trailing colon
    (``"Date:"``, ``"ACType:"``, ...), because the rest of the notebook
    indexes the resulting columns by those exact names.

    Parameters
    ----------
    html : str or None
        HTML source of the page, or ``None`` if it could not be downloaded.

    Returns
    -------
    dict or None
        ``None`` when ``html`` is ``None``; otherwise the parsed fields
        (an empty dict when the page contains no table).
    """
    if html is None:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        # find() returns None when nothing matches: nothing to extract.
        return {}
    data = {}
    for row in table.find_all('tr'):
        cols = [ele.text.strip() for ele in row.find_all("td")]
        # A row needs a label cell and a value cell; shorter rows
        # (e.g. header-only rows) would otherwise raise IndexError.
        if len(cols) < 2:
            continue
        # Rows whose label is "-" are skipped.
        if cols[0] == "-":
            continue
        label = cols[0].replace("\n", "").replace(" ", "")
        data[label] = cols[1]
    return data

# Get dataset size

In [16]:
# Accumulate, over the yearly index pages 1920-2023, how often the two
# marker strings occur in the page source.
num_rows = 0
for year in range(1920, 2024):
    url = f"https://www.planecrashinfo.com/{year}/{year}.htm"
    source = get_html_source(url)
    # NOTE(review): "DCDCDC" / "WHITE" are presumably the alternating row
    # background colours of the index table, so each hit would stand for one
    # listed entry -- confirm against the page markup.
    marker_hits = [count_word_appearance(source, marker) for marker in ("DCDCDC", "WHITE")]
    num_rows += sum(marker_hits)

# Get data

In [32]:
# Scrape every detail page: the yearly index page tells how many detail
# pages ({year}-1.htm ... {year}-N.htm) exist for that year.
data = []
for year in range(1920, 2024):
    print(f"Start scraping year {year}")
    index_url = f"https://www.planecrashinfo.com/{year}/{year}.htm"
    index_source = get_html_source(index_url)
    # Per-year count kept under its own name so it does not overwrite the
    # dataset-wide total stored in `num_rows`.
    rows_in_year = (
        count_word_appearance(index_source, "DCDCDC")
        + count_word_appearance(index_source, "WHITE")
    )
    for idx in range(1, rows_in_year + 1):
        detail_url = f"https://www.planecrashinfo.com/{year}/{year}-{idx}.htm"
        record = get_attribute(get_html_source(detail_url))
        if record is None:
            # Download failed: None is not a valid record for the DataFrame
            # that is later built from this list, so report it and move on.
            print(f"Skipping {detail_url}: page unavailable")
            continue
        data.append(record)

In [61]:
# Turn the scraped records into a table: one row per scraped detail page,
# one column per field label.
df = pd.DataFrame(data)
# Export is left disabled; uncomment to save the raw scrape to disk.
# df.to_csv("data.csv", index=False)
# Show which field labels were found.
df.columns

Index(['Date:', 'Time:', 'Location:', 'Operator:', 'Flight#:', 'Route:',
       'ACType:', 'Registration:', 'cn/ln:', 'Aboard:', 'Fatalities:',
       'Ground:', 'Summary:'],
      dtype='object')

In [108]:
# Number of distinct (date, aircraft type, registration) combinations.
df[["Date:", "ACType:", "Registration:"]].drop_duplicates().shape

(5033, 3)

In [109]:
# Print every group of rows that share the same date, aircraft type and
# registration, i.e. the rows that collide on that three-column key.
df_group = df.groupby(["Date:","ACType:","Registration:"])
for _key, rows in df_group:
    if len(rows) > 1:
        print(rows)

                Date: Time:             Location:    Operator: Flight#:  \
142  January 22, 1928     ?  L'Hospitalet, Ariège  Aeropostale        ?   
143  January 22, 1928     ?      Tarragona, Spain  Aeropostale        ?   

                                                Route:     ACType:  \
142  Toulouse - Perpignan - Alicante - Casablanca -...  Breguet 14   
143       Casablanca - Alicante - Perpignan - Toulouse  Breguet 14   

    Registration: cn/ln:                     Aboard:  \
142        F-AHEQ      ?  1   (passengers:0  crew:1)   
143        F-AHEQ    240  2   (passengers:0  crew:2)   

                    Fatalities: Ground:  \
142  1   (passengers:0  crew:1)       0   
143  2   (passengers:0  crew:2)       0   

                                              Summary:  
142  Crashed after experiencing mechanical difficul...  
143                  Crashed in unknown circumstances.  


In [97]:
from itertools import combinations

def all_subsets(lst):
    """Return every non-empty subset of ``lst`` as a list of lists.

    Subsets are ordered by size (smallest first) and, within one size, in
    the order produced by ``itertools.combinations``.
    """
    subsets = []
    for size in range(1, len(lst) + 1):
        for subset in combinations(lst, size):
            subsets.append(list(subset))
    return subsets

# Report every column combination that identifies each row uniquely, i.e.
# that still has as many rows as the full table after dropping duplicates.
# The target is taken from the table itself rather than a hard-coded count,
# so the check stays valid when the scraped data changes.
total_rows = len(df)
for col in all_subsets(df.columns):
    unique_rows = df[col].drop_duplicates()
    if len(unique_rows) == total_rows:
        print(col, unique_rows.shape)

['Date:', 'Location:', 'Fatalities:'] (5034, 3)
['Date:', 'Location:', 'Summary:'] (5034, 3)
['Location:', 'Registration:', 'Summary:'] (5034, 3)
['Location:', 'Aboard:', 'Summary:'] (5034, 3)
['Location:', 'Fatalities:', 'Summary:'] (5034, 3)
['Date:', 'Time:', 'Location:', 'ACType:'] (5034, 4)
['Date:', 'Time:', 'Location:', 'Registration:'] (5034, 4)
['Date:', 'Time:', 'Location:', 'Fatalities:'] (5034, 4)
['Date:', 'Time:', 'Location:', 'Summary:'] (5034, 4)
['Date:', 'Time:', 'Operator:', 'Fatalities:'] (5034, 4)
['Date:', 'Time:', 'Route:', 'ACType:'] (5034, 4)
['Date:', 'Time:', 'ACType:', 'cn/ln:'] (5034, 4)
['Date:', 'Time:', 'ACType:', 'Aboard:'] (5034, 4)
['Date:', 'Time:', 'ACType:', 'Fatalities:'] (5034, 4)
['Date:', 'Time:', 'ACType:', 'Summary:'] (5034, 4)
['Date:', 'Time:', 'Registration:', 'Aboard:'] (5034, 4)
['Date:', 'Time:', 'Aboard:', 'Summary:'] (5034, 4)
['Date:', 'Time:', 'Fatalities:', 'Summary:'] (5034, 4)
['Date:', 'Location:', 'Operator:', 'ACType:'] (5034,

# GeoJSON

In [1]:
import json
import pandas as pd
import numpy as np
import os
import sys
import requests

# Print NumPy arrays in full: summarisation only starts above `threshold`
# elements, and sys.maxsize is effectively never reached.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Load the processed dataset and the country table.
# Both paths are relative to the directory the notebook is run from.
data = pd.read_csv("../../dataset/processed_data.csv")
identification_tag = pd.read_csv("../../dataset/country.csv")

In [39]:
# Subtract 100 from the first column of the first 2706 rows.
# NOTE(review): presumably repairs years that were parsed one century too
# late -- confirm; 2706 is a hard-coded row count tied to the current CSV.
# CAUTION: not idempotent -- the cell reads and writes the same slice, so
# every re-run subtracts another 100. Reload `data` before running it again.
data.iloc[:2706,0] = data.iloc[:2706, 0] - 100

In [41]:
# Inspect the same 2706-row slice that the previous cell modified.
data.iloc[:2706]

Unnamed: 0,year,month,day,country,state,city,hour,minute,operator,flight,...,cn_ln,ground,summary,route,totalAboard,passengerAboard,crewAboard,totalFatality,passengerFatality,crewFatality
0,1908,9,17,United States of America,Virginia,Fort Myer,17,18,Military - U.S. Army,?,...,1,0,"During a demonstration flight, a U.S. Army fly...",,2,1,1,1,1,0
1,1909,9,7,France,?,Juvisy-sur-Orge,0,0,?,?,...,?,0,Eugene Lefebvre was the first pilot to ever be...,,1,0,1,1,0,0
2,1912,7,12,United States of America,New Jersey,Atlantic City,6,30,Military - U.S. Navy,?,...,?,0,First U.S. dirigible Akron exploded just offsh...,,5,0,5,5,0,5
3,1913,8,6,Canada,British Columbia,Victoria,0,0,Private,?,...,?,0,The first fatal airplane accident in Canada oc...,,1,0,1,1,0,1
4,1913,9,9,?,?,?,18,30,Military - German Navy,?,...,?,0,The airship flew into a thunderstorm and encou...,,20,0,0,14,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2701,1973,12,17,Italy,?,Rome,13,0,Pan American World Airways,110,...,18838/412,0,Two phosphorus bombs were thrown into the airc...,"Rome,Beirut",177,167,10,30,29,1
2702,1973,12,21,Vietnam,?,Siagon,0,0,Military - Vietnamese Air Force,?,...,?,0,Ammunition exploded while the aircraft was on ...,,20,0,0,20,0,0
2703,1973,12,22,Morocco,?,Tetouan,22,10,Royal Air Maroc,?,...,069/84,0,"After missing the outer marker, the aircraft c...","Paris,Tanger",106,99,7,106,99,7
2704,1973,12,23,Ukraine,?,Lvov,22,8,Aeroflot,?,...,350202,0,Crashed after takeoff. Engine failure. A defec...,"Lviv,Kiev,Kuybyshev",17,11,6,17,11,6


In [5]:
# Download the GeoJSON file and collect the `name` property of every feature.
geo_url = "https://raw.githubusercontent.com/Kang-Min-Ku/infoVis/main/smallWorld.geo.json"
# A timeout keeps the cell from hanging on an unresponsive server.
geo_response = requests.get(geo_url, timeout=30)
# Fail with a clear HTTP error instead of handing an error page to the
# JSON parser.
geo_response.raise_for_status()
geo_data = json.loads(geo_response.text)
geo_country = [feature["properties"]["name"] for feature in geo_data["features"]]

In [55]:
# List each distinct value of the "country" column, one per line, in order
# of first appearance.
country_names = data["country"].unique()
for c in country_names:
    print(c)

United States of America
France
Canada
?
Germany
Belgium
Bulgaria
United Kingdom
Netherlands
Denmark
Italy
Colombia
Spain
Australia
China
Morocco
Romania
Russia
Hungary
Mauritania
Czechia
Algeria
Uruguay
Brazil
Greece
Mexico
Cuba
Iran
Madagascar
Argentina
New Zealand
Myanmar
South Africa
Thailand
Iraq
Chile
Lebanon
Bolivia
Slovenia
Ukraine
Peru
Syria
Pakistan
Egypt
Honduras
Switzerland
Panama
Poland
Trinidad and Tobago
Sweden
Norway
Croatia
Tajikistan
Jersey
Libya
Senegal
Gambia
Papua New Guinea
Sudan
Indonesia
India
Tanzania
Kenya
Japan
Mozambique
Ecuador
Malta
Estonia
Vietnam
Puerto Rico
Turkmenistan
Kazakhstan
Austria
Portugal
Suriname
Ireland
Serbia
Tuvalu
Bosnia and Herz.
Philippines
Bahamas
Nicaragua
Nigeria
Malaysia
Costa Rica
Sri Lanka
Venezuela
Iceland
Cameroon
Bahrain
Dominican Rep.
Isle of Man
Azerbaijan
Taiwan
Hong Kong
Oman
Turkey
South Korea
Angola
Marshall Is.
Bhutan
Central African Rep.
Liberia
Mali
North Macedonia
Guatemala
Congo
Chad
Bermuda
Jamaica
Georgia
North Kore

In [76]:
# `year_group_dict` first serves as an all-zero template (one entry per year
# 1908-2024) that every country gets its own copy of; afterwards it is
# filled with the number of distinct countries seen in each year.
year_group_dict = {k: 0 for k in range(1908, 2024 + 1)}
country_group_dict = {k: year_group_dict.copy() for k in data["country"].unique()}
for year, group_df in data.groupby("year"):
    # Distinct countries of this year together with their row counts.
    names, counts = np.unique(group_df["country"], return_counts=True)
    for name, count in zip(names, counts):
        # Stored as a plain int, matching what len() would give.
        country_group_dict[name][year] = int(count)
    year_group_dict[year] = len(names)
    print(year, len(names), (names, counts))

1908 1 (array(['United States of America'], dtype=object), array([1]))
1909 1 (array(['France'], dtype=object), array([1]))
1912 1 (array(['United States of America'], dtype=object), array([1]))
1913 3 (array(['?', 'Canada', 'Germany'], dtype=object), array([1, 1, 1]))
1915 2 (array(['Belgium', 'Germany'], dtype=object), array([1, 1]))
1916 3 (array(['Bulgaria', 'Germany', 'United Kingdom'], dtype=object), array([1, 1, 3]))
1917 7 (array(['?', 'Belgium', 'Denmark', 'France', 'Germany', 'Netherlands',
       'United Kingdom'], dtype=object), array([1, 1, 1, 1, 1, 1, 1]))
1918 4 (array(['?', 'Germany', 'Netherlands', 'United States of America'],
      dtype=object), array([1, 1, 1, 1]))
1919 3 (array(['Italy', 'United Kingdom', 'United States of America'],
      dtype=object), array([1, 3, 5]))
1920 5 (array(['Colombia', 'France', 'Spain', 'United Kingdom',
       'United States of America'], dtype=object), array([ 1,  1,  2,  2, 12]))
1921 5 (array(['?', 'Australia', 'France', 'United K

In [77]:
# Matrix with one row per country and one column per year. A cell holds the
# number of consecutive years with a non-zero count starting at that year
# (so a three-year streak reads 3, 2, 1), and 0 for years with a zero count.
years = list(year_group_dict.keys())
successive_accident_occur = np.zeros((len(country_group_dict), len(years)))
for row, country in enumerate(country_group_dict.keys()):
    yearly_counts = country_group_dict[country]
    streak = 0
    # Walk backwards so the streak length ahead of each year is already
    # known when that year is reached.
    for col in range(len(years) - 1, -1, -1):
        if yearly_counts[years[col]] != 0:
            streak += 1
        else:
            streak = 0
        successive_accident_occur[row, col] = streak

In [82]:
# Per-year record counts of the second country in the dict (position 1).
country_group_dict[list(country_group_dict.keys())[1]]

{1908: 0,
 1909: 1,
 1910: 0,
 1911: 0,
 1912: 0,
 1913: 0,
 1914: 0,
 1915: 0,
 1916: 0,
 1917: 1,
 1918: 0,
 1919: 0,
 1920: 1,
 1921: 2,
 1922: 3,
 1923: 2,
 1924: 1,
 1925: 3,
 1926: 0,
 1927: 3,
 1928: 5,
 1929: 1,
 1930: 2,
 1931: 0,
 1932: 0,
 1933: 1,
 1934: 2,
 1935: 3,
 1936: 4,
 1937: 2,
 1938: 3,
 1939: 2,
 1940: 1,
 1941: 1,
 1942: 0,
 1943: 2,
 1944: 4,
 1945: 4,
 1946: 6,
 1947: 4,
 1948: 5,
 1949: 2,
 1950: 4,
 1951: 3,
 1952: 1,
 1953: 2,
 1954: 2,
 1955: 0,
 1956: 3,
 1957: 1,
 1958: 2,
 1959: 1,
 1960: 0,
 1961: 1,
 1962: 5,
 1963: 3,
 1964: 0,
 1965: 0,
 1966: 0,
 1967: 1,
 1968: 2,
 1969: 0,
 1970: 1,
 1971: 2,
 1972: 1,
 1973: 4,
 1974: 1,
 1975: 0,
 1976: 0,
 1977: 1,
 1978: 0,
 1979: 1,
 1980: 1,
 1981: 1,
 1982: 0,
 1983: 0,
 1984: 0,
 1985: 0,
 1986: 0,
 1987: 1,
 1988: 3,
 1989: 2,
 1990: 0,
 1991: 0,
 1992: 1,
 1993: 1,
 1994: 1,
 1995: 1,
 1996: 1,
 1997: 0,
 1998: 1,
 1999: 0,
 2000: 2,
 2001: 1,
 2002: 0,
 2003: 1,
 2004: 0,
 2005: 0,
 2006: 0,
 2007: 1,


In [92]:
# All country labels, in the row order used by `successive_accident_occur`.
country_group_dict.keys()

dict_keys(['United States of America', 'France', 'Canada', '?', 'Germany', 'Belgium', 'Bulgaria', 'United Kingdom', 'Netherlands', 'Denmark', 'Italy', 'Colombia', 'Spain', 'Australia', 'China', 'Morocco', 'Romania', 'Russia', 'Hungary', 'Mauritania', 'Czechia', 'Algeria', 'Uruguay', 'Brazil', 'Greece', 'Mexico', 'Cuba', 'Iran', 'Madagascar', 'Argentina', 'New Zealand', 'Myanmar', 'South Africa', 'Thailand', 'Iraq', 'Chile', 'Lebanon', 'Bolivia', 'Slovenia', 'Ukraine', 'Peru', 'Syria', 'Pakistan', 'Egypt', 'Honduras', 'Switzerland', 'Panama', 'Poland', 'Trinidad and Tobago', 'Sweden', 'Norway', 'Croatia', 'Tajikistan', 'Jersey', 'Libya', 'Senegal', 'Gambia', 'Papua New Guinea', 'Sudan', 'Indonesia', 'India', 'Tanzania', 'Kenya', 'Japan', 'Mozambique', 'Ecuador', 'Malta', 'Estonia', 'Vietnam', 'Puerto Rico', 'Turkmenistan', 'Kazakhstan', 'Austria', 'Portugal', 'Suriname', 'Ireland', 'Serbia', 'Tuvalu', 'Bosnia and Herz.', 'Philippines', 'Bahamas', 'Nicaragua', 'Nigeria', 'Malaysia', 'Cos

In [93]:
# Save the matrix as CSV: one row per country, one column per year, plus a
# trailing "country" column holding the row labels.
country_labels = list(country_group_dict.keys())
df = pd.DataFrame(
    successive_accident_occur, columns=year_group_dict.keys()
).assign(country=country_labels)
df.to_csv("../../dataset/successive_accident_occur.csv", index=False)

In [67]:
# Per-year record counts for the United States.
country_group_dict["United States of America"]

{1920: 12,
 1921: 6,
 1922: 2,
 1923: 5,
 1924: 3,
 1925: 2,
 1926: 3,
 1927: 6,
 1928: 14,
 1929: 18,
 1930: 10,
 1931: 15,
 1932: 16,
 1933: 13,
 1934: 9,
 1935: 16,
 1936: 13,
 1937: 6,
 1938: 9,
 1939: 4,
 1940: 4,
 1941: 4,
 1942: 6,
 1943: 10,
 1944: 9,
 1945: 17,
 1946: 22,
 1947: 21,
 1948: 11,
 1949: 13,
 1950: 11,
 1951: 19,
 1952: 14,
 1953: 17,
 1954: 12,
 1955: 19,
 1956: 12,
 1957: 8,
 1958: 13,
 1959: 18,
 1960: 14,
 1961: 11,
 1962: 10,
 1963: 18,
 1964: 19,
 1965: 17,
 1966: 10,
 1967: 13,
 1968: 17,
 1969: 17,
 1970: 16,
 1971: 14,
 1972: 12,
 1973: 10,
 1974: 13,
 1975: 9,
 1976: 10,
 1977: 14,
 1978: 15,
 1979: 17,
 1980: 6,
 1981: 10,
 1982: 10,
 1983: 14,
 1984: 10,
 1985: 17,
 1986: 9,
 1987: 13,
 1988: 8,
 1989: 18,
 1990: 10,
 1991: 15,
 1992: 11,
 1993: 7,
 1994: 16,
 1995: 8,
 1996: 12,
 1997: 9,
 1998: 4,
 1999: 5,
 2000: 15,
 2001: 15,
 2002: 2,
 2003: 15,
 2004: 10,
 2005: 4,
 2006: 5,
 2007: 7,
 2008: 13,
 2009: 8,
 2010: 5,
 2011: 5,
 2012: 0,
 2013: 5,


In [3]:
# Number of records for each day of the month, restricted to January rows
# (month == 1) of all years.
january_rows = data[data["month"]==1]
for day, day_rows in january_rows.groupby("day"):
    print(day, len(day_rows))

1 10
2 18
3 11
4 6
5 11
6 17
7 9
8 17
9 18
10 15
11 13
12 12
13 25
14 22
15 19
16 15
17 15
18 19
19 11
20 19
21 16
22 18
23 11
24 10
25 16
26 14
27 14
28 13
29 12
30 13
31 17
