# Data cleaning

# Preprocessing

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel("../data/raw/datos-sismicos.xlsx")

## Correcting the type of values

In [3]:
for i in df.columns:
    print("- ", i, ": ", df[i].dtype)

-  fecha UTC :  str
-  hora UTC :  str
-  latitud (º) :  float64
-  longitud (º) :  float64
-  profundidad (km) :  int64
-  magnitud (M) :  float64


In [68]:
# "fecha UTC" should be date type, and "hora UTC" should be hour

fecha = df["fecha UTC"].str.strip()

hora = df["hora UTC"].str.strip().str.split(".", n=1).str[0]

df["fecha"] = pd.to_datetime(
                    fecha + " " + hora,
                    format= "%Y-%m-%d %H:%M:%S",
                    utc= True
)


In [69]:
df["fecha"]

0       1960-01-13 15:40:34+00:00
1       1960-01-15 09:30:24+00:00
2       1960-01-17 02:57:58+00:00
3       1960-01-23 03:37:32+00:00
4       1960-01-30 05:07:24+00:00
                   ...           
25117   2026-02-05 05:30:03+00:00
25118   2026-02-05 11:07:38+00:00
25119   2026-02-05 18:20:45+00:00
25120   2026-02-05 20:56:34+00:00
25121   2026-02-05 21:32:23+00:00
Name: fecha, Length: 25122, dtype: datetime64[us, UTC]

In [70]:
df = df[["fecha", "latitud (º)", "longitud (º)", "profundidad (km)", "magnitud (M)"]]

## Adjusting the column names

In [72]:
df = df.rename(columns={
        'fecha': 'date', 
        'latitud (º)':'latitude',
        'longitud (º)': 'longitude',
        'profundidad (km)': 'depth',
        'magnitud (M)': 'magnitude'
        }
    )

## Turning the hour to Peruvian Timezone

In [78]:
df["date"] = df["date"].dt.tz_convert("America/Lima")

## First Export

In [None]:
df.to_csv("../data/processed/earthquake_clean.csv", index=False)

# Final Processing for Analysis

In [106]:
df = pd.read_csv("../data/processed/earthquake_clean.csv")

df["date"] = pd.to_datetime(df["date"], utc=True)

## Date separation

In [107]:
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["hour"] = df["date"].dt.hour

## Location

In [108]:
import geopandas as gpd

deps = gpd.read_file("../data/raw/DEPARTAMENTOS")

In [109]:
deps.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [110]:
deps.columns

Index(['OBJECTID', 'CODDEP', 'DEPARTAMEN', 'CAPITAL', 'FUENTE', 'geometry'], dtype='str')

In [111]:
gdf = gpd.GeoDataFrame(
    df,
    geometry = gpd.points_from_xy(df["longitude"], df["latitude"]),
    crs="EPSG:4326"
)

In [112]:
deps = deps.to_crs(gdf.crs)

In [113]:
gdf = gpd.sjoin(gdf, deps, how="left", predicate="intersects")

In [117]:
gdf.columns

Index(['Unnamed: 0', 'date', 'latitude', 'longitude', 'depth', 'magnitude',
       'year', 'month', 'day', 'hour', 'geometry', 'index_right', 'OBJECTID',
       'CODDEP', 'DEPARTAMEN', 'CAPITAL', 'FUENTE'],
      dtype='str')

In [127]:
gdf

Unnamed: 0.1,Unnamed: 0,date,latitude,longitude,depth,magnitude,year,month,day,hour,geometry,index_right,OBJECTID,CODDEP,DEPARTAMEN,CAPITAL,FUENTE
0,0,1960-01-13 15:40:34+00:00,-16.145,-72.144,60,7.5,1960,1,13,15,POINT (-72.144 -16.145),3.0,4.0,04,AREQUIPA,AREQUIPA,INEI
1,1,1960-01-15 09:30:24+00:00,-15.000,-75.000,70,7.0,1960,1,15,9,POINT (-75 -15),10.0,11.0,11,ICA,ICA,INEI
2,2,1960-01-17 02:57:58+00:00,-14.500,-74.500,150,6.4,1960,1,17,2,POINT (-74.5 -14.5),4.0,5.0,05,AYACUCHO,AYACUCHO,INEI
3,3,1960-01-23 03:37:32+00:00,-12.500,-68.500,300,5.8,1960,1,23,3,POINT (-68.5 -12.5),,,,,,
4,4,1960-01-30 05:07:24+00:00,-5.500,-77.500,100,5.7,1960,1,30,5,POINT (-77.5 -5.5),15.0,16.0,16,LORETO,IQUITOS,INEI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25117,25117,2026-02-05 05:30:03+00:00,-13.670,-76.110,51,3.8,2026,2,5,5,POINT (-76.11 -13.67),10.0,11.0,11,ICA,ICA,INEI
25118,25118,2026-02-05 11:07:38+00:00,-11.190,-74.660,130,4.0,2026,2,5,11,POINT (-74.66 -11.19),11.0,12.0,12,JUNIN,HUANCAYO,INEI
25119,25119,2026-02-05 18:20:45+00:00,-10.730,-73.900,117,4.0,2026,2,5,18,POINT (-73.9 -10.73),24.0,25.0,25,UCAYALI,PUCALLPA,INEI
25120,25120,2026-02-05 20:56:34+00:00,-15.200,-74.060,106,4.0,2026,2,5,20,POINT (-74.06 -15.2),4.0,5.0,05,AYACUCHO,AYACUCHO,INEI


In [118]:
df["department"] = gdf["DEPARTAMEN"]

In [123]:
df = df[["depth", "magnitude", "year", "month", "day", "hour", "department"]]

In [125]:
# Eliminating the NaN values because they are outside the Peru limits

df = df.dropna(subset=["department"])

In [126]:
df

Unnamed: 0,depth,magnitude,year,month,day,hour,department
0,60,7.5,1960,1,13,15,AREQUIPA
1,70,7.0,1960,1,15,9,ICA
2,150,6.4,1960,1,17,2,AYACUCHO
4,100,5.7,1960,1,30,5,LORETO
5,136,5.3,1960,2,8,19,UCAYALI
...,...,...,...,...,...,...,...
25117,51,3.8,2026,2,5,5,ICA
25118,130,4.0,2026,2,5,11,JUNIN
25119,117,4.0,2026,2,5,18,UCAYALI
25120,106,4.0,2026,2,5,20,AYACUCHO


In [129]:
df.to_csv("../data/processed/earthquake_analysis_ready.csv", index=False)