In [24]:
import pandas as pd

### Load Dataframe

In [25]:
# Read the source CSV and drop the strike/dip/rake columns in one chained step.
columns_to_drop = ["strike1", "dip1", "rake1", "strike2", "dip2", "rake2"]
df = pd.read_csv("./other_data/gempa_indonesia.csv").drop(columns=columns_to_drop)
df.head()

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark
0,2008/11/01,21:02:43.058,-9.18,119.06,10,4.9,Sumba Region - Indonesia
1,2008/11/01,20:58:50.248,-6.55,129.64,10,4.6,Banda Sea
2,2008/11/01,17:43:12.941,-7.01,106.63,121,3.7,Java - Indonesia
3,2008/11/01,16:24:14.755,-3.3,127.85,10,3.2,Seram - Indonesia
4,2008/11/01,16:20:37.327,-6.41,129.54,70,4.3,Banda Sea


In [26]:
# Inspect the dtype pandas inferred for each remaining column
df.dtypes

tgl        object
ot         object
lat       float64
lon       float64
depth       int64
mag       float64
remark     object
dtype: object

### Data Cleaning

In [27]:
# Make `remark` uniform: keep only the text before the first "-"
# (e.g. "Sumba Region - Indonesia" -> "Sumba Region") and trim whitespace.
# The vectorized `.str` accessor is used instead of a Python lambda because:
#   * a missing remark (NaN is a float) stays NaN instead of raising
#     TypeError on `"-" in x` -- the missing-value check only runs in a
#     later cell, so this cell must tolerate NaN;
#   * values without a "-" are stripped too, so every remark ends up in the
#     same normalized form.
df["remark"] = df["remark"].str.split("-", n=1).str[0].str.strip()

In [28]:
# Show every row that is an exact repeat of an earlier row
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark
8740,2011/06/27,16:47:15.707,-9.28,122.46,139,5.7,Savu Sea
8802,2011/07/04,05:17:37.048,4.37,97.52,10,3.4,Northern Sumatra
8805,2011/07/05,19:09:11.444,1.45,96.95,13,4.8,Off West Coast of Northern Sumatra
8911,2011/07/19,03:18:38.252,-1.27,137.83,10,5.1,Near North Coast of Irian Jaya
9027,2011/08/06,14:43:30.702,-3.11,130.92,10,4.8,Seram
...,...,...,...,...,...,...,...
92800,2023/01/24,02:13:19.875,2.84,127.04,52,5.9,Northern Molucca Sea
92802,2023/01/24,01:10:25.606,2.81,127.03,10,5.0,Northern Molucca Sea
92806,2023/01/25,19:10:59.342,-10.76,111.74,23,4.6,South of Java
92865,2023/01/26,06:33:44.219,2.93,127.03,10,4.8,Northern Molucca Sea


In [29]:
# Drop repeated rows and renumber the index from 0, then display any
# duplicates that remain (the result is expected to be empty)
df = df.drop_duplicates(ignore_index=True)
df.loc[df.duplicated()]

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark


In [30]:
# Show rows where any of the numeric columns holds a value that is not an
# int or float instance
check_column = ["lat", "lon", "depth", "mag"]


def _is_number(value):
    """Return True when `value` is an instance of int or float."""
    return isinstance(value, (int, float))


numeric_cells = df[check_column].apply(lambda column: column.apply(_is_number))
rows_all_numeric = numeric_cells.all(axis=1)
df[~rows_all_numeric]

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark


In [35]:
# Show every row that has a null in at least one column
rows_with_missing = df.isna().any(axis="columns")
df.loc[rows_with_missing]

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark


In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, used to spot implausible values / outliers by eye
df.describe()

Unnamed: 0,lat,lon,depth,mag
count,90152.0,90152.0,90152.0,90152.0
mean,-3.412285,119.175422,48.386126,3.550424
std,4.349243,10.78503,75.978631,0.803019
min,-11.0,94.02,2.0,1.0
25%,-7.91,113.49,10.0,3.0
50%,-2.91,121.15,15.0,3.5
75%,0.13,126.88,52.0,4.1
max,6.0,142.0,750.0,7.9


### Filter Rows with Magnitude Above 6.0

In [42]:
# Keep only the rows whose `mag` is strictly greater than 6.0 (a magnitude of
# exactly 6.0 is excluded) and renumber the index from 0
above_threshold = df["mag"].gt(6.0)
new_df = df.loc[above_threshold].reset_index(drop=True)
new_df.head()

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark
0,2008/11/07,16:04:27.451,-6.76,129.37,39,6.2,Banda Sea
1,2008/11/22,16:11:54.420,-4.17,101.43,10,6.2,Southern Sumatra
2,2008/11/22,16:01:00.895,-4.46,101.13,5,6.4,Southern Sumatra
3,2008/11/25,14:12:08.517,-3.77,113.26,10,6.4,Borneo
4,2008/12/06,10:55:26.662,-7.56,124.74,404,6.3,Banda Sea


### Save Cleaned and Filtered DataFrame

In [38]:
# Write the filtered frame to CSV (comma-separated, without the index column)
output_path = "./other_data/gempa_modified.csv"
new_df.to_csv(output_path, index=False)