In [31]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from lxml import etree
import recordlinkage
import duckdb
import hashlib
from pathlib import Path
# NOTE: the previous bare `kagglehub.login` expression only referenced the function
# (it displayed the function repr) and never authenticated anything. The dataset
# downloads in this notebook ran without it, so no login step is performed here.
# If credentials are ever required, call `kagglehub.login()` -- with parentheses.

<function kagglehub.auth.login(validate_credentials: bool = True) -> None>

## **Loading Datasets**

We will be using two datasets found on Kaggle for this course project:

* Global Earthquake-Tsunami Risk Assessment Dataset by Ahmed Mohamed Zaki
* Climate Change: Earth Surface Temperature Data by Berkeley Earth and Kristen Sissener

In [33]:
# Download the datasets using the methods specified on Kaggle
## Climate Dataset

# dataset_download returns the local directory that holds the dataset files.
# Keep it as a plain string: later cells reuse `path_climate` directly.
path_climate = kagglehub.dataset_download("berkeleyearth/climate-change-earth-surface-temperature-data")

print("Path to dataset files:", path_climate)

# Join directory and file name with pathlib instead of manual "/" concatenation.
df_climate = pd.read_csv(Path(path_climate) / "GlobalLandTemperaturesByMajorCity.csv")

df_climate.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/berkeleyearth/climate-change-earth-surface-temperature-data?dataset_version_number=2...


100%|██████████| 84.7M/84.7M [00:03<00:00, 26.6MB/s]

Extracting files...





Path to dataset files: /Users/georgew/.cache/kagglehub/datasets/berkeleyearth/climate-change-earth-surface-temperature-data/versions/2


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1849-01-01,26.704,1.435,Abidjan,Côte D'Ivoire,5.63N,3.23W
1,1849-02-01,27.434,1.362,Abidjan,Côte D'Ivoire,5.63N,3.23W
2,1849-03-01,28.101,1.612,Abidjan,Côte D'Ivoire,5.63N,3.23W
3,1849-04-01,26.14,1.387,Abidjan,Côte D'Ivoire,5.63N,3.23W
4,1849-05-01,25.427,1.2,Abidjan,Côte D'Ivoire,5.63N,3.23W


In [None]:
## Earthquake Dataset

# dataset_download returns the local directory that holds the dataset files.
# Keep it as a plain string: later cells reuse `path_earthquake` directly.
path_earthquake = kagglehub.dataset_download("ahmeduzaki/global-earthquake-tsunami-risk-assessment-dataset")

print("Path to dataset files:", path_earthquake)

# Join directory and file name with pathlib instead of manual "/" concatenation.
df_earthquake = pd.read_csv(Path(path_earthquake) / "earthquake_data_tsunami.csv")

df_earthquake.head()

Path to dataset files: /Users/georgew/.cache/kagglehub/datasets/ahmeduzaki/global-earthquake-tsunami-risk-assessment-dataset/versions/1


Unnamed: 0,magnitude,cdi,mmi,sig,nst,dmin,gap,depth,latitude,longitude,Year,Month,tsunami
0,7.0,8,7,768,117,0.509,17.0,14.0,-9.7963,159.596,2022,11,1
1,6.9,4,4,735,99,2.229,34.0,25.0,-4.9559,100.738,2022,11,0
2,7.0,3,3,755,147,3.125,18.0,579.0,-20.0508,-178.346,2022,11,1
3,7.3,5,5,833,149,1.865,21.0,37.0,-19.2918,-172.129,2022,11,1
4,6.6,0,2,670,131,4.998,27.0,624.464,-25.5948,178.278,2022,11,1


Previewing the two datasets shows that they share one common element: latitude and longitude. However, the formats differ slightly: the Climate dataset uses compass directions (e.g. `5.63N`, `3.23W`), while the Earthquake dataset uses signed numbers (e.g. `-9.7963`, `159.596`). To combine the datasets on latitude and longitude, we need to standardize the format using *OpenRefine*.

## **Checking Hash Values**

In [35]:
# Helper that calculates the SHA-256 hash of a file

def sha256sum(file_path, chunk_size=65536):
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in fixed-size binary chunks so that large files never
    have to be held in memory all at once.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the file to hash.
    chunk_size : int, optional
        Number of bytes read per iteration. Must be positive. The value only
        affects I/O granularity, never the resulting digest.

    Returns
    -------
    str
        Lowercase hexadecimal SHA-256 digest of the file's bytes.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    OSError
        If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently yield the digest
    # of an empty input instead of the file's real digest.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:  # empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

# Directories of the two raw datasets (as returned by kagglehub)
climate_path = path_climate
earthquake_path = path_earthquake

# Hash the same raw CSV files that were read into df_climate / df_earthquake,
# building each file path with pathlib from the directory aliases defined above.
climate_hash = sha256sum(Path(climate_path) / "GlobalLandTemperaturesByMajorCity.csv")
earthquake_hash = sha256sum(Path(earthquake_path) / "earthquake_data_tsunami.csv")

print("SHA-256 Hashes:")
print(f"Climate dataset:    {climate_hash}")
print(f"Earthquake dataset: {earthquake_hash}")

SHA-256 Hashes:
Climate dataset:    11cb36d674d44d9286fb6924d2d0e3f7c19313ab8acdb76d3a55a8be5ddc0b0f
Earthquake dataset: 9cc6cbc0445f1b14d036432a978a97c3c89930c0a0023655e8f262e5eb3ae23d


## **Cleaned Data Inspection**

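* First, we removed rows in the Climate dataset recorded before the year 2000, since those records cannot match any record in the Earthquake dataset; they could not be used when analyzing patterns and would make the merge messy. **TODO:** the preview of the cleaned Climate data below still begins in 1993 — confirm the cutoff year that was actually applied.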

* Second, we adjusted the latitude and longitude format in the Climate dataset, which originally uses direction suffixes such as (10N, 10W). To make the two datasets consistent, we converted these values to signed numbers such as (10, -10), using positive values for North and East and negative values for South and West.

* Lastly, in each dataset we combined the latitude and longitude columns into a single `LatitudeAndLongitude` column for easier comparison and analysis.

In [36]:
# Read the cleaned climate CSV (relative path, so it must sit in the notebook's
# working directory) and preview the first rows.
cleaned_climate_file = "GlobalLandTemperaturesByMajorCity-cleaned.csv"
df_climate_cleaned = pd.read_csv(cleaned_climate_file)
df_climate_cleaned.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,LatitudeAndLongitude
0,1993-11-01T00:00:00Z,27.21,0.245,Abidjan,"5.63,-3.23"
1,1993-12-01T00:00:00Z,26.996,0.277,Abidjan,"5.63,-3.23"
2,1994-01-01T00:00:00Z,27.014,0.295,Abidjan,"5.63,-3.23"
3,1994-02-01T00:00:00Z,28.687,0.466,Abidjan,"5.63,-3.23"
4,1994-03-01T00:00:00Z,28.571,0.214,Abidjan,"5.63,-3.23"


In [38]:
# Read the cleaned earthquake CSV (relative path, so it must sit in the notebook's
# working directory) and preview the first rows.
cleaned_earthquake_file = "earthquake-data-tsunami-cleaned.csv"
df_earthquake_cleaned = pd.read_csv(cleaned_earthquake_file)
df_earthquake_cleaned.head()

Unnamed: 0,magnitude,cdi,mmi,sig,nst,dmin,gap,depth,dt,tsunami,LatitudeAndLongitude
0,7.0,8,7,768,117,0.509,17.0,14.0,2022-11,1,"-9.7963,159.596"
1,6.9,4,4,735,99,2.229,34.0,25.0,2022-11,0,"-4.9559,100.738"
2,7.0,3,3,755,147,3.125,18.0,579.0,2022-11,1,"-20.0508,-178.346"
3,7.3,5,5,833,149,1.865,21.0,37.0,2022-11,1,"-19.2918,-172.129"
4,6.6,0,2,670,131,4.998,27.0,624.464,2022-11,1,"-25.5948,178.278"


## **Dataset Merge**