# Data cleaning

# Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the raw seismic catalogue from the Excel workbook.
df = pd.read_excel(io="../data/raw/datos-sismicos.xlsx")

## Correcting the column data types

In [3]:
# List every column next to the dtype pandas inferred for it.
for column_name, column_dtype in df.dtypes.items():
    print("- ", column_name, ": ", column_dtype)

-  fecha UTC :  str
-  hora UTC :  str
-  latitud (º) :  float64
-  longitud (º) :  float64
-  profundidad (km) :  int64
-  magnitud (M) :  float64


In [4]:
# "fecha UTC" (date) and "hora UTC" (time of day) are read as text columns;
# combine them into a single timezone-aware UTC timestamp.

fecha = df["fecha UTC"].str.strip()

# Keep only the text before the first "." so the value matches %H:%M:%S.
hora = df["hora UTC"].str.strip().str.split(".", n=1).str[0]

df["fecha"] = pd.to_datetime(
    fecha.str.cat(hora, sep=" "),
    format="%Y-%m-%d %H:%M:%S",
    utc=True,
)

# Keep the new timestamp and the measurement columns; the two source text
# columns are no longer needed.
df = df[
    [
        "fecha",
        "latitud (º)",
        "longitud (º)",
        "profundidad (km)",
        "magnitud (M)",
    ]
]

## Adjusting the column names

In [5]:
# Replace the Spanish headers (with units) by short English column names.
df = df.rename(
    {
        "fecha": "date",
        "latitud (º)": "latitude",
        "longitud (º)": "longitude",
        "profundidad (km)": "depth",
        "magnitud (M)": "magnitude",
    },
    axis="columns",
)

## Converting the timestamps to the Peruvian time zone

In [6]:
# Express the UTC timestamps in local Peruvian time (America/Lima).
df = df.assign(date=df["date"].dt.tz_convert("America/Lima"))

## First Export

In [7]:
# Save the cleaned table; the index is not written to the file.
df.to_csv(path_or_buf="../data/processed/earthquake_clean.csv", index=False)

# Final Processing for Analysis

In [11]:
# Reload the cleaned table written by the first export.
df = pd.read_csv("../data/processed/earthquake_clean.csv")

# Parse the date column as UTC, then express it in Peruvian local time
# (America/Lima).
df["date"] = pd.to_datetime(df["date"], utc=True).dt.tz_convert("America/Lima")

## Splitting the date into components

In [12]:
# Break the local (America/Lima) timestamp into calendar components.
df = df.assign(
    year=df["date"].dt.year,
    month=df["date"].dt.month,
    day=df["date"].dt.day,
    hour=df["date"].dt.hour,
)

## Location

In [13]:
import geopandas as gpd

# Load the boundary layer from the raw data folder
# (presumably Peru's department limits, judging by the path - confirm).
deps = gpd.read_file(filename="../data/raw/DEPARTAMENTOS_LIMITES")

In [14]:
# Turn each earthquake into a point geometry (x = longitude, y = latitude)
# in the EPSG:4326 coordinate reference system.
gdf = gpd.GeoDataFrame(
    data=df,
    geometry=gpd.points_from_xy(x=df["longitude"], y=df["latitude"]),
    crs="EPSG:4326",
)

In [15]:
# Reproject the boundary layer to the CRS of the earthquake points so both
# layers share the same coordinates.
deps = deps.to_crs(crs=gdf.crs)

# Left join: every earthquake is kept; the boundary attributes are filled in
# only where the point intersects a polygon.
gdf = gdf.sjoin(deps, how="left", predicate="intersects")

In [16]:
# A point that intersects more than one polygon (for example one lying on a
# shared border) comes back from the left spatial join as several rows with
# the same index label. Assigning a Series with repeated labels to df raises
# "cannot reindex on an axis with duplicate labels", so keep only the first
# match per earthquake before aligning the values back on the index.
# "DEPARTAMEN" is presumably the department name from the boundary layer.
df["department"] = gdf.loc[~gdf.index.duplicated(keep="first"), "DEPARTAMEN"]

# Keep only the columns used in the analysis, in their final order.
df = df[
    [
        "date",
        "depth",
        "magnitude",
        "year",
        "month",
        "day",
        "hour",
        "department",
        "longitude",
        "latitude",
    ]
]

In [17]:
# Rows without a department did not match any polygon in the spatial join,
# i.e. they fall outside Peru's limits, so they are discarded.

df = df[df["department"].notna()]

In [18]:
# Save the analysis-ready table; the index is not written to the file.
df.to_csv(path_or_buf="../data/processed/earthquake_analysis_ready.csv", index=False)