In [2]:
import pandas as pd # for dataframes
from matplotlib import pyplot as plt # for simple plots
import seaborn as sns # for slightly more complicated plots
import numpy as np # for numerical computing and linear algebra data structures and algorithms
from numba import jit # for JIT compilation facilties for improved efficiency

In [3]:
# first let us clean some of the data that we just collected
earthquakes = pd.read_csv('./data/earthquakes.csv')

In [4]:
earthquakes.head()

Unnamed: 0,title,magnitude,place,time,long,lat
0,"M 4.2 - 4 km NNE of Jayapura, Indonesia",4.2,"4 km NNE of Jayapura, Indonesia",1672617572846,140.7278,-2.4906
1,"M 3.6 - 103 km N of Suárez, Puerto Rico",3.62,"103 km N of Suárez, Puerto Rico",1672617413930,-65.7256,19.3601
2,"M -0.8 - 85 km NNW of Karluk, Alaska",-0.76,"85 km NNW of Karluk, Alaska",1672617277550,-155.180333,58.2275
3,"M 0.6 - 10km NW of The Geysers, CA",0.57,"10km NW of The Geysers, CA",1672617223560,-122.8415,38.8445
4,M 2.3 -,2.28,,1672617148580,-66.8565,17.884833


In [8]:
# access columns of pandas dataframe by indexing
earthquakes['title'].head()

0    M 4.2 - 4 km NNE of Jayapura, Indonesia
1    M 3.6 - 103 km N of Suárez, Puerto Rico
2       M -0.8 - 85 km NNW of Karluk, Alaska
3         M 0.6 - 10km NW of The Geysers, CA
4                                   M 2.3 - 
Name: title, dtype: object

In [9]:
earthquakes[['title', 'magnitude']].head()

Unnamed: 0,title,magnitude
0,"M 4.2 - 4 km NNE of Jayapura, Indonesia",4.2
1,"M 3.6 - 103 km N of Suárez, Puerto Rico",3.62
2,"M -0.8 - 85 km NNW of Karluk, Alaska",-0.76
3,"M 0.6 - 10km NW of The Geysers, CA",0.57
4,M 2.3 -,2.28


In [10]:
# loc allows access by row index - we can change the row index, but by default is row number
earthquakes.loc[0]

title        M 4.2 - 4 km NNE of Jayapura, Indonesia
magnitude                                        4.2
place                4 km NNE of Jayapura, Indonesia
time                                   1672617572846
long                                        140.7278
lat                                          -2.4906
Name: 0, dtype: object

In [11]:
earthquakes.loc[0, ['time', 'long']] # can get more arbitrary slices as need be

time    1672617572846
long         140.7278
Name: 0, dtype: object

In [12]:
earthquakes.loc[[0, 1], ['time', 'long']]

Unnamed: 0,time,long
0,1672617572846,140.7278
1,1672617413930,-65.7256


In [13]:
# can perform operations on all rows of column through vectorization
earthquakes['magnitude'] * 2

0      8.40
1      7.24
2     -1.52
3      1.14
4      4.56
       ... 
332    1.68
333    3.40
334    2.40
335    1.38
336    0.18
Name: magnitude, Length: 337, dtype: float64

In [14]:
# useful for filtering
above_5 = earthquakes['magnitude'] > 5
earthquakes_above_5 = earthquakes[above_5]

In [15]:
earthquakes_above_5

Unnamed: 0,title,magnitude,place,time,long,lat
83,"M 5.4 - 15km SE of Rio Dell, CA",5.35,"15km SE of Rio Dell, CA",1672598104510,-123.971,40.409
87,M 5.8 - southern East Pacific Rise,5.8,southern East Pacific Rise,1672597602715,-123.4337,-56.2046
88,"M 5.5 - 10 km E of Jayapura, Indonesia",5.5,"10 km E of Jayapura, Indonesia",1672597469990,140.8085,-2.5298
122,M 5.3 - south of the Fiji Islands,5.3,south of the Fiji Islands,1672590266828,178.5519,-25.1974
247,M 5.2 - South Sandwich Islands region,5.2,South Sandwich Islands region,1672556581555,-25.5621,-58.8341
301,"M 5.4 - Bougainville region, Papua New Guinea",5.4,"Bougainville region, Papua New Guinea",1672542976153,155.1723,-6.6469


In [5]:
# one of the fist things to do with a dataframe is to check the types
earthquakes.dtypes

title         object
magnitude    float64
place         object
time           int64
long         float64
lat          float64
dtype: object

In [6]:
# all of the types above seem appropriate, so have little to correct there. After checking types, it is useful to check the missing values
def produce_missing_report(df):
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})
    return missing_value_df

In [7]:
produce_missing_report(earthquakes)

Unnamed: 0,column_name,percent_missing
title,title,0.0
magnitude,magnitude,0.296736
place,place,6.52819
time,time,0.0
long,long,0.0
lat,lat,0.0


In [27]:
# whether we drop rows or columns with missing values is domain and problem dependent. Suppose that we want to analyse earthquake magnitudes within countries
# hence, rows without magnitude and places hold no value - we can drop them

missing_magnitude = earthquakes['magnitude'].isnull() # get rows with magnitude is null
missing_place = earthquakes['place'].isnull() # get rows with place is null
at_least_one_missing = missing_magnitude | missing_place
earthquakes_cleaned = earthquakes[~at_least_one_missing] # ~ operation is boolean negation

In [28]:
# suppose that we want to convert timestamp to actual time? Can use datetime function
from datetime import datetime
dt = datetime.fromtimestamp(1672598104510 / 1000)
print(dt)

2023-01-01 14:35:04.510000


In [29]:
dt.hour

14

In [30]:
def get_hour(timestamp):
    dt = datetime.fromtimestamp(timestamp / 1000)
    return dt.hour

In [31]:
# our get_hour function is not inherently vectorized, so we can apply to each row using apply method on dataframe
earthquakes_cleaned['hour'] = earthquakes_cleaned['time'].apply(get_hour)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earthquakes_cleaned['hour'] = earthquakes_cleaned['time'].apply(get_hour)


In [32]:
earthquakes_cleaned.head()

Unnamed: 0,title,magnitude,place,time,long,lat,hour
0,"M 4.2 - 4 km NNE of Jayapura, Indonesia",4.2,"4 km NNE of Jayapura, Indonesia",1672617572846,140.7278,-2.4906,19
1,"M 3.6 - 103 km N of Suárez, Puerto Rico",3.62,"103 km N of Suárez, Puerto Rico",1672617413930,-65.7256,19.3601,19
2,"M -0.8 - 85 km NNW of Karluk, Alaska",-0.76,"85 km NNW of Karluk, Alaska",1672617277550,-155.180333,58.2275,19
3,"M 0.6 - 10km NW of The Geysers, CA",0.57,"10km NW of The Geysers, CA",1672617223560,-122.8415,38.8445,19
5,"M 1.2 - 3km NNW of Fontana, CA",1.2,"3km NNW of Fontana, CA",1672616861690,-117.470833,34.121,19


In [39]:
# place gives our country + extra info - but we just want country
def extract_country(place):
    components = place.split(',')
    country = components[-1].strip()
    return country

earthquakes_cleaned['country'] = earthquakes_cleaned['place'].apply(extract_country)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earthquakes_cleaned['country'] = earthquakes_cleaned['place'].apply(extract_country)


In [40]:
earthquakes_cleaned

Unnamed: 0,title,magnitude,place,time,long,lat,hour,country
0,"M 4.2 - 4 km NNE of Jayapura, Indonesia",4.20,"4 km NNE of Jayapura, Indonesia",1672617572846,140.727800,-2.490600,19,Indonesia
1,"M 3.6 - 103 km N of Suárez, Puerto Rico",3.62,"103 km N of Suárez, Puerto Rico",1672617413930,-65.725600,19.360100,19,Puerto Rico
2,"M -0.8 - 85 km NNW of Karluk, Alaska",-0.76,"85 km NNW of Karluk, Alaska",1672617277550,-155.180333,58.227500,19,Alaska
3,"M 0.6 - 10km NW of The Geysers, CA",0.57,"10km NW of The Geysers, CA",1672617223560,-122.841500,38.844500,19,CA
5,"M 1.2 - 3km NNW of Fontana, CA",1.20,"3km NNW of Fontana, CA",1672616861690,-117.470833,34.121000,19,CA
...,...,...,...,...,...,...,...,...
332,"M 0.8 - 14km N of Warner Springs, CA",0.84,"14km N of Warner Springs, CA",1672533142630,-116.616833,33.408000,20,CA
333,"M 1.7 - 31 km NE of Paxson, Alaska",1.70,"31 km NE of Paxson, Alaska",1672532746311,-145.178900,63.297000,20,Alaska
334,"M 1.2 - 7 km NNW of Meadow Lakes, Alaska",1.20,"7 km NNW of Meadow Lakes, Alaska",1672532015730,-149.669200,61.685400,20,Alaska
335,"M 0.7 - 1km NNE of The Geysers, CA",0.69,"1km NNE of The Geysers, CA",1672531625430,-122.754500,38.783500,20,CA


In [41]:
earthquakes_cleaned['country'].value_counts() # useful to get counts of categorical variables

Alaska                            111
CA                                102
Hawaii                             24
Puerto Rico                        15
Indonesia                           6
Nevada                              5
Puerto Rico region                  4
Texas                               4
Alaska Peninsula                    3
Philippines                         3
New Mexico                          2
Washington                          2
Oklahoma                            2
Central Alaska                      2
Southern Alaska                     2
Papua New Guinea                    2
Utah                                1
Wyoming                             1
south of the Fiji Islands           1
California                          1
China                               1
Bay of Bengal                       1
Missouri                            1
South Sandwich Islands region       1
northwest of the Kuril Islands      1
Afghanistan                         1
Mexico      

In [42]:
# we notice that US states are listed, and inconsistently - depending on the analysis, the state listing is fine, but not the inconsistency. Alaska is coming up a lot
# there is a package called fuzzywuzzy that can help in much more complicated instances
countries = set(earthquakes_cleaned['country'])

In [43]:
countries

{'Afghanistan',
 'Alaska',
 'Alaska Peninsula',
 'Azores Islands region',
 'Bay of Bengal',
 'Bolivia',
 'CA',
 'California',
 'Canada',
 'Central Alaska',
 'China',
 'Dominican Republic',
 'Fiji region',
 'Hawaii',
 'Idaho',
 'Indonesia',
 'Japan',
 'Japan region',
 'Mexico',
 'Missouri',
 'Nevada',
 'New Mexico',
 'Northern Mariana Islands',
 'Oklahoma',
 'Pakistan',
 'Papua New Guinea',
 'Philippines',
 'Puerto Rico',
 'Puerto Rico region',
 'South Sandwich Islands region',
 'Southern Alaska',
 'Tajikistan',
 'Texas',
 'Utah',
 'Vanuatu',
 'Washington',
 'Wyoming',
 'northwest of the Kuril Islands',
 'south of the Fiji Islands',
 'southern East Pacific Rise',
 'western Texas'}

In [44]:
# if contains Alaska then map to Alaska
# map CA to California
# if region, map to country instead - remove the word region
# remove all words starting with common letters

def repair_countries(country):
    if 'Alaska' in country:
        return 'Alaska'
    if country == 'CA':
        return 'California'
    components = country.split()
    corrected = []
    for word in components:
        if word[0] != word[0].lower():
            corrected.append(word)
    return ' '.join(corrected)

earthquakes_cleaned['country'] = earthquakes_cleaned['country'].apply(repair_countries)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earthquakes_cleaned['country'] = earthquakes_cleaned['country'].apply(repair_countries)


In [45]:
earthquakes_cleaned

Unnamed: 0,title,magnitude,place,time,long,lat,hour,country
0,"M 4.2 - 4 km NNE of Jayapura, Indonesia",4.20,"4 km NNE of Jayapura, Indonesia",1672617572846,140.727800,-2.490600,19,Indonesia
1,"M 3.6 - 103 km N of Suárez, Puerto Rico",3.62,"103 km N of Suárez, Puerto Rico",1672617413930,-65.725600,19.360100,19,Puerto Rico
2,"M -0.8 - 85 km NNW of Karluk, Alaska",-0.76,"85 km NNW of Karluk, Alaska",1672617277550,-155.180333,58.227500,19,Alaska
3,"M 0.6 - 10km NW of The Geysers, CA",0.57,"10km NW of The Geysers, CA",1672617223560,-122.841500,38.844500,19,California
5,"M 1.2 - 3km NNW of Fontana, CA",1.20,"3km NNW of Fontana, CA",1672616861690,-117.470833,34.121000,19,California
...,...,...,...,...,...,...,...,...
332,"M 0.8 - 14km N of Warner Springs, CA",0.84,"14km N of Warner Springs, CA",1672533142630,-116.616833,33.408000,20,California
333,"M 1.7 - 31 km NE of Paxson, Alaska",1.70,"31 km NE of Paxson, Alaska",1672532746311,-145.178900,63.297000,20,Alaska
334,"M 1.2 - 7 km NNW of Meadow Lakes, Alaska",1.20,"7 km NNW of Meadow Lakes, Alaska",1672532015730,-149.669200,61.685400,20,Alaska
335,"M 0.7 - 1km NNE of The Geysers, CA",0.69,"1km NNE of The Geysers, CA",1672531625430,-122.754500,38.783500,20,California


In [46]:
earthquakes_cleaned['country'].value_counts()

Alaska                      118
California                  103
Hawaii                       24
Puerto Rico                  19
Indonesia                     6
Texas                         5
Nevada                        5
Philippines                   3
Oklahoma                      2
Japan                         2
Papua New Guinea              2
Washington                    2
New Mexico                    2
Missouri                      1
China                         1
South Sandwich Islands        1
Utah                          1
Fiji Islands                  1
Wyoming                       1
Afghanistan                   1
Bay Bengal                    1
Kuril Islands                 1
Mexico                        1
Vanuatu                       1
East Pacific Rise             1
Azores Islands                1
Northern Mariana Islands      1
Pakistan                      1
Bolivia                       1
Tajikistan                    1
Canada                        1
Fiji    

In [51]:
# now that the data is cleaned for our analysis, we can save it in a appropriate format
# we can use CSVs, but we can also use formats such as .feather
earthquakes_cleaned.reset_index().to_feather('./data/earthquakes_cleaned.feather')

In [72]:
landslides = pd.read_csv('./data/catalog.csv')

In [73]:
landslides.head(10)

Unnamed: 0,id,date,time,continent_code,country_name,country_code,state/province,population,city/town,distance,...,geolocation,hazard_type,landslide_type,landslide_size,trigger,storm_name,injuries,fatalities,source_name,source_link
0,34,3/2/07,Night,,United States,US,Virginia,16000,Cherry Hill,3.40765,...,"(38.600900000000003, -77.268199999999993)",Landslide,Landslide,Small,Rain,,,,NBC 4 news,http://www.nbc4.com/news/11186871/detail.html
1,42,3/22/07,,,United States,US,Ohio,17288,New Philadelphia,3.33522,...,"(40.517499999999998, -81.430499999999995)",Landslide,Landslide,Small,Rain,,,,Canton Rep.com,http://www.cantonrep.com/index.php?ID=345054&C...
2,56,4/6/07,,,United States,US,Pennsylvania,15930,Wilkinsburg,2.91977,...,"(40.4377, -79.915999999999997)",Landslide,Landslide,Small,Rain,,,,The Pittsburgh Channel.com,https://web.archive.org/web/20080423132842/htt...
3,59,4/14/07,,,Canada,CA,Quebec,42786,Châteauguay,2.98682,...,"(45.322600000000001, -73.777100000000004)",Landslide,Riverbank collapse,Small,Rain,,,,Le Soleil,http://www.hebdos.net/lsc/edition162007/articl...
4,61,4/15/07,,,United States,US,Kentucky,6903,Pikeville,5.66542,...,"(37.432499999999997, -82.493099999999998)",Landslide,Landslide,Small,Downpour,,,0.0,Matthew Crawford (KGS),
5,64,4/20/07,,,United States,US,Kentucky,6903,Pikeville,0.23715,...,"(37.481400000000001, -82.518600000000006)",Landslide,Landslide,Small,Rain,,,,Applalachain news-express,http://www.news-expressky.com/articles/2007/04...
6,67,4/24/07,,,United States,US,South Dakota,2540,Dakota Dunes,2.48033,...,"(42.494100000000003, -96.457599999999999)",Landslide,Landslide,Small,Rain,,,,Sioux City Journnal,http://www.siouxcityjournal.com/articles/2007/...
7,77,5/21/07,,SA,Colombia,CO,Risaralda,440118,Pereira,0.62022,...,"(4.8080999999999996, -75.694100000000006)",Landslide,Mudslide,Large,Rain,,,13.0,Reuters - AlertNet.org,http://www.reuters.com/news/video/videoStory?v...
8,105,6/27/07,,SA,Ecuador,EC,Zamora-Chinchipe,15276,Zamora,0.47714,...,"(-4.0650000000000004, -78.950999999999993)",Landslide,Landslide,Medium,Downpour,,,,Red Cross - Field reports,https://www-secure.ifrc.org/dmis/prepare/view_...
9,106,6/27/07,,SA,Ecuador,EC,Loja,117796,Loja,0.35649,...,"(-3.99, -79.204999999999998)",Landslide,Landslide,Medium,Downpour,,,,Red Cross - Field reports,https://www-secure.ifrc.org/dmis/prepare/view_...


In [74]:
landslides.dtypes

id                        int64
date                     object
time                     object
continent_code           object
country_name             object
country_code             object
state/province           object
population                int64
city/town                object
distance                float64
location_description     object
latitude                float64
longitude               float64
geolocation              object
hazard_type              object
landslide_type           object
landslide_size           object
trigger                  object
storm_name               object
injuries                float64
fatalities              float64
source_name              object
source_link              object
dtype: object

In [75]:
produce_missing_report(landslides)

Unnamed: 0,column_name,percent_missing
id,id,0.0
date,date,0.1772
time,time,62.847017
continent_code,continent_code,90.313054
country_name,country_name,0.0
country_code,country_code,0.0
state/province,state/province,0.059067
population,population,0.0
city/town,city/town,0.236267
distance,distance,0.059067


In [76]:
to_drop = ['storm_name', 'time', 'location_description']
landslides = landslides.drop(to_drop, axis=1)

In [77]:
# sometimes, we don't want to drop rows with missing, but to impute instead
# imputation can be done:
# statistically - using mode, median, mean, or regression (handling missing data like this is a VERY complicated topic)
# domain knowledge - use domain knowledge to decide on default value - here we can consider a NaN in fatalies as meaning 0 fatalities

landslides['fatalities'] = landslides['fatalities'].fillna(0)
landslides['injuries'] = landslides['injuries'].fillna(0)

In [78]:
landslides.head()

Unnamed: 0,id,date,continent_code,country_name,country_code,state/province,population,city/town,distance,latitude,longitude,geolocation,hazard_type,landslide_type,landslide_size,trigger,injuries,fatalities,source_name,source_link
0,34,3/2/07,,United States,US,Virginia,16000,Cherry Hill,3.40765,38.6009,-77.2682,"(38.600900000000003, -77.268199999999993)",Landslide,Landslide,Small,Rain,0.0,0.0,NBC 4 news,http://www.nbc4.com/news/11186871/detail.html
1,42,3/22/07,,United States,US,Ohio,17288,New Philadelphia,3.33522,40.5175,-81.4305,"(40.517499999999998, -81.430499999999995)",Landslide,Landslide,Small,Rain,0.0,0.0,Canton Rep.com,http://www.cantonrep.com/index.php?ID=345054&C...
2,56,4/6/07,,United States,US,Pennsylvania,15930,Wilkinsburg,2.91977,40.4377,-79.916,"(40.4377, -79.915999999999997)",Landslide,Landslide,Small,Rain,0.0,0.0,The Pittsburgh Channel.com,https://web.archive.org/web/20080423132842/htt...
3,59,4/14/07,,Canada,CA,Quebec,42786,Châteauguay,2.98682,45.3226,-73.7771,"(45.322600000000001, -73.777100000000004)",Landslide,Riverbank collapse,Small,Rain,0.0,0.0,Le Soleil,http://www.hebdos.net/lsc/edition162007/articl...
4,61,4/15/07,,United States,US,Kentucky,6903,Pikeville,5.66542,37.4325,-82.4931,"(37.432499999999997, -82.493099999999998)",Landslide,Landslide,Small,Downpour,0.0,0.0,Matthew Crawford (KGS),


In [79]:
landslides['continent_code'].value_counts()

SA    164
Name: continent_code, dtype: int64

In [80]:
landslides['country_name'].value_counts()

United States                       986
Colombia                             96
Guatemala                            79
Costa Rica                           75
Mexico                               73
Trinidad and Tobago                  63
Panama                               40
Honduras                             38
Jamaica                              33
Nicaragua                            31
Ecuador                              30
El Salvador                          22
Venezuela                            20
Haiti                                20
Dominican Republic                   15
Dominica                             14
Peru                                 14
Canada                               10
Saint Lucia                           8
Puerto Rico                           7
Brazil                                4
Saint Vincent and the Grenadines      4
Cuba                                  3
U.S. Virgin Islands                   2
Grenada                               2


In [81]:
landslides.loc[landslides['continent_code'].isna(), 'country_name'].value_counts()

United States                       986
Guatemala                            79
Costa Rica                           75
Mexico                               73
Trinidad and Tobago                  63
Panama                               40
Honduras                             38
Jamaica                              33
Nicaragua                            31
El Salvador                          22
Haiti                                20
Dominican Republic                   15
Dominica                             14
Canada                               10
Saint Lucia                           8
Puerto Rico                           7
Saint Vincent and the Grenadines      4
Cuba                                  3
U.S. Virgin Islands                   2
Grenada                               2
Bermuda                               2
Barbados                              1
Belize                                1
Name: country_name, dtype: int64

In [82]:
landslides['continent_code'] = landslides['continent_code'].fillna('NA')

In [83]:
landslides['date'] = pd.to_datetime(landslides['date'], format='%m/%d/%y')

In [84]:
landslides.head()

Unnamed: 0,id,date,continent_code,country_name,country_code,state/province,population,city/town,distance,latitude,longitude,geolocation,hazard_type,landslide_type,landslide_size,trigger,injuries,fatalities,source_name,source_link
0,34,2007-03-02,,United States,US,Virginia,16000,Cherry Hill,3.40765,38.6009,-77.2682,"(38.600900000000003, -77.268199999999993)",Landslide,Landslide,Small,Rain,0.0,0.0,NBC 4 news,http://www.nbc4.com/news/11186871/detail.html
1,42,2007-03-22,,United States,US,Ohio,17288,New Philadelphia,3.33522,40.5175,-81.4305,"(40.517499999999998, -81.430499999999995)",Landslide,Landslide,Small,Rain,0.0,0.0,Canton Rep.com,http://www.cantonrep.com/index.php?ID=345054&C...
2,56,2007-04-06,,United States,US,Pennsylvania,15930,Wilkinsburg,2.91977,40.4377,-79.916,"(40.4377, -79.915999999999997)",Landslide,Landslide,Small,Rain,0.0,0.0,The Pittsburgh Channel.com,https://web.archive.org/web/20080423132842/htt...
3,59,2007-04-14,,Canada,CA,Quebec,42786,Châteauguay,2.98682,45.3226,-73.7771,"(45.322600000000001, -73.777100000000004)",Landslide,Riverbank collapse,Small,Rain,0.0,0.0,Le Soleil,http://www.hebdos.net/lsc/edition162007/articl...
4,61,2007-04-15,,United States,US,Kentucky,6903,Pikeville,5.66542,37.4325,-82.4931,"(37.432499999999997, -82.493099999999998)",Landslide,Landslide,Small,Downpour,0.0,0.0,Matthew Crawford (KGS),


In [86]:
landslides['landslide_size'].value_counts()

Medium        965
Small         636
Large          68
Very_large     15
medium          3
small           3
large           2
Name: landslide_size, dtype: int64

In [87]:
landslides['trigger'].value_counts()

Downpour                   866
Rain                       446
Tropical cyclone           131
Continuous rain             95
Unknown                     88
unknown                     12
Snowfall snowmelt           11
Flooding                    10
Earthquake                   8
Mining digging               7
Freeze thaw                  6
Construction                 5
Other                        3
Dam embankment collapse      1
downpour                     1
Volcano                      1
Name: trigger, dtype: int64

In [88]:
landslides['hazard_type'].value_counts()

Landslide    1693
Name: hazard_type, dtype: int64

In [94]:
landslides['landslide_type'].value_counts()

Landslide             866
Mudslide              635
Rockfall               70
Complex                48
Debris flow            38
Other                  14
Riverbank collapse      5
Creep                   5
mudslide                4
Lahar                   2
Snow avalanche          2
Unknown                 1
Rockslide               1
landslide               1
Name: landslide_type, dtype: int64

In [95]:
#landslides.drop(['hazard_type'], axis=1, inplace=True)
landslides['trigger'] = landslides['trigger'].str.upper() # we normalise to upper case to ensure homogenity
landslides['landslide_size'] = landslides['landslide_size'].str.upper()
landslides['landslide_type'] = landslides['landslide_type'].str.upper()

In [96]:
landslides

Unnamed: 0,id,date,continent_code,country_name,country_code,state/province,population,city/town,distance,latitude,longitude,geolocation,landslide_type,landslide_size,trigger,injuries,fatalities,source_name,source_link
0,34,2007-03-02,,United States,US,Virginia,16000,Cherry Hill,3.40765,38.6009,-77.2682,"(38.600900000000003, -77.268199999999993)",LANDSLIDE,SMALL,RAIN,0.0,0.0,NBC 4 news,http://www.nbc4.com/news/11186871/detail.html
1,42,2007-03-22,,United States,US,Ohio,17288,New Philadelphia,3.33522,40.5175,-81.4305,"(40.517499999999998, -81.430499999999995)",LANDSLIDE,SMALL,RAIN,0.0,0.0,Canton Rep.com,http://www.cantonrep.com/index.php?ID=345054&C...
2,56,2007-04-06,,United States,US,Pennsylvania,15930,Wilkinsburg,2.91977,40.4377,-79.9160,"(40.4377, -79.915999999999997)",LANDSLIDE,SMALL,RAIN,0.0,0.0,The Pittsburgh Channel.com,https://web.archive.org/web/20080423132842/htt...
3,59,2007-04-14,,Canada,CA,Quebec,42786,Châteauguay,2.98682,45.3226,-73.7771,"(45.322600000000001, -73.777100000000004)",RIVERBANK COLLAPSE,SMALL,RAIN,0.0,0.0,Le Soleil,http://www.hebdos.net/lsc/edition162007/articl...
4,61,2007-04-15,,United States,US,Kentucky,6903,Pikeville,5.66542,37.4325,-82.4931,"(37.432499999999997, -82.493099999999998)",LANDSLIDE,SMALL,DOWNPOUR,0.0,0.0,Matthew Crawford (KGS),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,7535,2015-12-07,,United States,US,North Carolina,1646,Tryon,7.80866,35.2219,-82.3226,"(35.221899999999998, -82.322599999999994)",ROCKFALL,SMALL,,0.0,0.0,Tryon Daily Bulletin,http://www.tryondailybulletin.com/2016/02/26/u...
1689,7537,2016-02-22,,United States,US,West Virginia,51400,Charleston,6.84721,38.3987,-81.5848,"(38.398699999999998, -81.584800000000001)",MUDSLIDE,SMALL,UNKNOWN,0.0,0.0,Charleston Gazette,http://www.wvgazettemail.com/news/20160222/us-...
1690,7539,2016-02-23,,United States,US,West Virginia,2406,Welch,14.19735,37.4096,-81.4268,"(37.409599999999998, -81.4268)",LANDSLIDE,SMALL,RAIN,0.0,0.0,Bluefield Daily Telegraph,http://www.bdtonline.com/news/officials-cautio...
1691,7540,2016-02-26,,United States,US,West Virginia,1048,Athens,12.00678,37.5011,-81.1093,"(37.501100000000001, -81.109300000000005)",ROCKFALL,SMALL,UNKNOWN,0.0,0.0,Bluefield Daily Telegraph,http://www.bdtonline.com/news/rockslide-snarls...


In [97]:
landslides.reset_index().to_feather('./data/landslides_cleaned.feather')

In [108]:
mazda = pd.read_csv('./data/mazda_safety.csv')
toyota = pd.read_csv('./data/toyota_safety.csv')

In [109]:
mazda

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,86%,89%,67%,84%,Mazda BT-50,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
1,93%,87%,68%,74%,Mazda MX-30,5,Jan 2021 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
2,98%,89%,81%,76%,Mazda 3,5,Apr 2019 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
3,99%,88%,80%,76%,Mazda CX-30,5,Feb 2020 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
4,95%,91%,66%,73%,Mazda 6,5,Jun 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
5,96%,87%,72%,73%,Mazda CX-8,5,Jul 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
6,95%,80%,78%,59%,Mazda CX-5,5,Apr 2017 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
7,,,,,Mazda CX-9,5,Jul 2016 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
8,,,,,Mazda MX-5,5,Sep 2015 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
9,,,,,Mazda 2,0,Jan 2023 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...


In [110]:
toyota

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,85%,88%,87%,83%,Toyota Corolla Cross,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
1,89%,88%,81%,77%,Toyota Landcruiser,5,Jul 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
2,90%,88%,76%,82%,Toyota Kluger / Highlander,5,Mar 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
3,88%,87%,80%,83%,Toyota Mirai,5,Nov 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
4,86%,86%,78%,82%,Toyota Yaris Cross,5,Aug 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
5,86%,87%,78%,87%,Toyota Yaris,5,May 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
6,95%,84%,88%,78%,Toyota Fortuner,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
7,94%,88%,84%,79%,Toyota Granvia,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
8,94%,88%,84%,77%,Toyota Hiace,5,May 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
9,96%,87%,88%,78%,Toyota Hilux,5,Jul 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...


In [102]:
mazda.dtypes

adult_occupant_protection          object
child_occupant_protection          object
vulnerable_road_user_protection    object
safety_assist                      object
name                               object
num_stars                           int64
rating_year                        object
url                                object
dtype: object

In [103]:
toyota.dtypes

adult_occupant_protection          object
child_occupant_protection          object
vulnerable_road_user_protection    object
safety_assist                      object
name                               object
num_stars                           int64
rating_year                        object
url                                object
dtype: object

In [111]:
def clean_protection_rating(value):
    if type(value) not in [int, float] and len(value):
        return int(value[:-1])
    return None

def clean_ratings(df):
    columns = [
        'adult_occupant_protection',
        'child_occupant_protection',
        'vulnerable_road_user_protection',
        'safety_assist'
    ]
    for col in columns:
        df[col] = df[col].apply(clean_protection_rating)
    return df

mazda = clean_ratings(mazda)
toyota = clean_ratings(toyota)

In [112]:
mazda

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,86.0,89.0,67.0,84.0,Mazda BT-50,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
1,93.0,87.0,68.0,74.0,Mazda MX-30,5,Jan 2021 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
2,98.0,89.0,81.0,76.0,Mazda 3,5,Apr 2019 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
3,99.0,88.0,80.0,76.0,Mazda CX-30,5,Feb 2020 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
4,95.0,91.0,66.0,73.0,Mazda 6,5,Jun 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
5,96.0,87.0,72.0,73.0,Mazda CX-8,5,Jul 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
6,95.0,80.0,78.0,59.0,Mazda CX-5,5,Apr 2017 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
7,,,,,Mazda CX-9,5,Jul 2016 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
8,,,,,Mazda MX-5,5,Sep 2015 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
9,,,,,Mazda 2,0,Jan 2023 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...


In [113]:
toyota

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,85.0,88.0,87.0,83.0,Toyota Corolla Cross,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
1,89.0,88.0,81.0,77.0,Toyota Landcruiser,5,Jul 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
2,90.0,88.0,76.0,82.0,Toyota Kluger / Highlander,5,Mar 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
3,88.0,87.0,80.0,83.0,Toyota Mirai,5,Nov 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
4,86.0,86.0,78.0,82.0,Toyota Yaris Cross,5,Aug 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
5,86.0,87.0,78.0,87.0,Toyota Yaris,5,May 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
6,95.0,84.0,88.0,78.0,Toyota Fortuner,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
7,94.0,88.0,84.0,79.0,Toyota Granvia,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
8,94.0,88.0,84.0,77.0,Toyota Hiace,5,May 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
9,96.0,87.0,88.0,78.0,Toyota Hilux,5,Jul 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...


In [114]:
mazda = mazda[~mazda['adult_occupant_protection'].isna()]
toyota = toyota[~toyota['adult_occupant_protection'].isna()]

In [115]:
mazda

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,86.0,89.0,67.0,84.0,Mazda BT-50,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
1,93.0,87.0,68.0,74.0,Mazda MX-30,5,Jan 2021 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
2,98.0,89.0,81.0,76.0,Mazda 3,5,Apr 2019 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
3,99.0,88.0,80.0,76.0,Mazda CX-30,5,Feb 2020 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
4,95.0,91.0,66.0,73.0,Mazda 6,5,Jun 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
5,96.0,87.0,72.0,73.0,Mazda CX-8,5,Jul 2018 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...
6,95.0,80.0,78.0,59.0,Mazda CX-5,5,Apr 2017 - onwards,https://www.ancap.com.au/safety-ratings/mazda/...


In [116]:
toyota

Unnamed: 0,adult_occupant_protection,child_occupant_protection,vulnerable_road_user_protection,safety_assist,name,num_stars,rating_year,url
0,85.0,88.0,87.0,83.0,Toyota Corolla Cross,5,Jul 2022 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
1,89.0,88.0,81.0,77.0,Toyota Landcruiser,5,Jul 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
2,90.0,88.0,76.0,82.0,Toyota Kluger / Highlander,5,Mar 2021 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
3,88.0,87.0,80.0,83.0,Toyota Mirai,5,Nov 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
4,86.0,86.0,78.0,82.0,Toyota Yaris Cross,5,Aug 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
5,86.0,87.0,78.0,87.0,Toyota Yaris,5,May 2020 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
6,95.0,84.0,88.0,78.0,Toyota Fortuner,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
7,94.0,88.0,84.0,79.0,Toyota Granvia,5,Oct 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
8,94.0,88.0,84.0,77.0,Toyota Hiace,5,May 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...
9,96.0,87.0,88.0,78.0,Toyota Hilux,5,Jul 2019 - onwards,https://www.ancap.com.au/safety-ratings/toyota...


In [117]:
mazda.reset_index().to_csv('./data/mazda_cleaned.csv', index=False)
toyota.reset_index().to_csv('./data/toyota_cleaned.csv', index=False)