# Data cleaning

# Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("../data/raw/datos-sismicos.xlsx")

## Correcting the type of values

In [3]:
for i in df.columns:
    print("- ", i, ": ", df[i].dtype)

-  fecha UTC :  str
-  hora UTC :  str
-  latitud (º) :  float64
-  longitud (º) :  float64
-  profundidad (km) :  int64
-  magnitud (M) :  float64


In [4]:
# "fecha UTC" should be date type, and "hora UTC" should be hour

fecha = df["fecha UTC"].str.strip()

hora = df["hora UTC"].str.strip().str.split(".", n=1).str[0]

df["fecha"] = pd.to_datetime(
                    fecha + " " + hora,
                    format= "%Y-%m-%d %H:%M:%S",
                    utc= True
)

df = df[["fecha", "latitud (º)", "longitud (º)", "profundidad (km)", "magnitud (M)"]]

## Adjusting the column names

In [5]:
df = df.rename(columns={
        'fecha': 'date', 
        'latitud (º)':'latitude',
        'longitud (º)': 'longitude',
        'profundidad (km)': 'depth',
        'magnitud (M)': 'magnitude'
        }
    )

## Turning the hour to Peruvian Timezone

In [6]:
df["date"] = df["date"].dt.tz_convert("America/Lima")

## First Export

In [7]:
df.to_csv("../data/processed/earthquake_clean.csv", index=False)

# Final Processing for Analysis

In [8]:
df = pd.read_csv("../data/processed/earthquake_clean.csv")

df["date"] = pd.to_datetime(df["date"], utc=True)
df["date"] = df["date"].dt.tz_convert("America/Lima")

## Date separation

In [9]:
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour

## Location

* Departments

In [11]:
import geopandas as gpd

deps = gpd.read_file("../data/raw/DEPARTAMENTOS_LIMITES")

In [12]:
gdf = gpd.GeoDataFrame(
    df,
    geometry = gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326"
)

In [13]:
deps = deps.to_crs(gdf.crs)

gdf = gpd.sjoin(gdf, deps, how="left", predicate="intersects")

In [14]:
df["department"] = gdf["DEPARTAMEN"]

df = df[["date", "depth", "magnitude", "year", "month", "day", "hour", "department", "longitude", "latitude"]]

In [15]:
# Changing the nan for surroundings of the country, because those earthquakes impact on Peru too

df = df.fillna({"department": "surroundings"})

* Regions

In [16]:
gdf = gpd.GeoDataFrame(
    df,
    geometry = gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326"
)

reg = gpd.read_file("../data/raw/region-geografica")

In [17]:
print(reg.crs == gdf.crs)

False


In [18]:
reg = reg.to_crs(gdf.crs)

In [19]:
gdf = gpd.sjoin(gdf, reg, how="left", predicate="intersects")

* Ocean

In [20]:
oceans = gpd.read_file("../data/raw/World-Ocean-ShapeFile")

print(oceans.crs == gdf.crs)

True


In [21]:
oceans.head()

Unnamed: 0,featurecla,scalerank,min_zoom,geometry
0,Ocean,0,0,"MULTIPOLYGON (((59.91603 -67.40049, 59.67856 -..."


In [22]:
gdf = gdf.drop(columns=["index_right"], errors="ignore") #avoids conflict with the other join

gdf = gpd.sjoin(gdf, oceans, how="left", predicate="intersects")

In [23]:
integrating_ocean = gdf[gdf["nombre"].isna()]

In [24]:

gdf.loc[gdf["nombre"].isna(), "nombre"] = integrating_ocean["featurecla"]

In [25]:
df["region"] = gdf["nombre"]

In [26]:
df["region"] = df["region"].fillna("Bordering Country")

## Asigning categories

### Grouping by magnitude

In [27]:
bins = [-np.inf, 3, 4, 5, 6, 7, 8, np.inf]
labels = ["Micro","Minor","Light","Moderate","Strong","Major","Great"]

df["type_magnitude"] = pd.cut(
    df["magnitude"],
    bins = bins,
    labels = labels
)

print(df["type_magnitude"].value_counts())

type_magnitude
Light       18816
Moderate     3856
Minor        2214
Strong        208
Major          26
Micro           1
Great           1
Name: count, dtype: int64


### Grouping by time

In [28]:
# Grouping by quarter of the year

bins = [0, 3, 6, 9, 12]
labels = ["Q1", "Q2", "Q3", "Q4"]

df["quarter"] = pd.cut(
    df["month"],
    bins = bins,
    labels = labels,
)

print(df["quarter"].value_counts())

quarter
Q4    6712
Q2    6281
Q1    6094
Q3    6035
Name: count, dtype: int64


In [39]:
# Grouping by time of the day
bins = [0, 6, 12, 18, 24]
labels = ["Night", "Morning", "Afternoon", "Evening"]

df["time_of_day"] = pd.cut(
    df["hour"],
    bins = bins,
    labels = labels,
    right=False
)

print(df["time_of_day"].value_counts())

time_of_day
Night        7128
Evening      6604
Morning      5762
Afternoon    5628
Name: count, dtype: int64


In [40]:
# Grouping by day of the week
bins = [-1, 0, 1, 2, 3, 4, 5, 6]
labels = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

df["day_of_week"] = pd.cut(
    df["date"].dt.dayofweek,
    bins = bins,
    labels = labels,
)

print(df["day_of_week"].value_counts())

day_of_week
Sunday       3767
Saturday     3659
Wednesday    3641
Thursday     3588
Monday       3555
Friday       3501
Tuesday      3411
Name: count, dtype: int64


In [41]:
print(df["year"].max())
print(df["year"].min())

2026
1960


In [42]:
# Grouping by decade
bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020, np.inf]
labels = ["1960s", "1970s", "1980s", "1990s", "2000s", "2010s", "2020s"]

df["decade"] = pd.cut(
    df["year"],
    bins = bins,
    labels = labels,
    right=False
)

print(df["decade"].value_counts())


decade
2010s    7963
1990s    4596
2000s    4528
2020s    4054
1980s    1853
1970s    1126
1960s    1002
Name: count, dtype: int64


### Grouping by depth

In [43]:
print(df["depth"].max())
print(df["depth"].min())

743
5


In [44]:
bins = [0, 70, 300, np.inf]
labels = ["Shallow", "Intermediate", "Deep"]

df["type_depth"] = pd.cut(
    df["depth"], 
    bins = bins, 
    labels = labels
)

print(df["type_depth"].value_counts())

type_depth
Shallow         16174
Intermediate     8800
Deep              148
Name: count, dtype: int64


## Fixing readability

In [45]:
df["department"] = df["department"].str.title()

In [46]:
df["region"] = df["region"].replace({'Costa': 'Coastal Region', 'Sierra': 'Andean Highlands', 'Selva': 'Amazon Rainforest', 'Ocean': 'Pacific Ocean'})

In [47]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 25122 entries, 0 to 25121
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype                       
---  ------          --------------  -----                       
 0   date            25122 non-null  datetime64[us, America/Lima]
 1   depth           25122 non-null  int64                       
 2   magnitude       25122 non-null  float64                     
 3   year            25122 non-null  int32                       
 4   month           25122 non-null  int32                       
 5   day             25122 non-null  int32                       
 6   hour            25122 non-null  int32                       
 7   department      25122 non-null  str                         
 8   longitude       25122 non-null  float64                     
 9   latitude        25122 non-null  float64                     
 10  region          25122 non-null  str                         
 11  type_magnitude  25122 non-null  categor

In [48]:
df.to_csv("../data/processed/earthquake_analysis_ready.csv", index=False)