# Webscraping Earthquake Data

In [4]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Define starttime and endtime
starttime = datetime.now() - timedelta(days=30)  # 30 days ago
endtime = datetime.now()  # Current time

params = {
    "format": "geojson",
    "starttime": starttime.strftime("%Y-%m-%d"),
    "endtime": endtime.strftime("%Y-%m-%d"),    
    "minmagnitude": 4,  
    "maxlatitude": 72.0, 
    "minlatitude": 35.0,  
    "maxlongitude": 40.0,  
    "minlongitude": -25.0
}

# Request data from USGS API
url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
response = requests.get(url, params=params)
data = response.json()

earthquakes = []
for quake in data["features"]:
    properties = quake["properties"]
    geometry = quake["geometry"]["coordinates"]

    earthquakes.append({
        "Time": pd.to_datetime(properties["time"], unit='ms'),  # Convert timestamp
        "Magnitude": properties["mag"],
        "Location": properties["place"],
        "Longitude": geometry[0],
        "Latitude": geometry[1],
        "Depth (km)": geometry[2]
    })

df = pd.DataFrame(earthquakes)

print(df.head())


                     Time  Magnitude                        Location  \
0 2025-05-01 14:43:44.389        4.1        21 km N of Serik, Turkey   
1 2025-05-01 10:23:53.699        4.2  9 km S of Ouranoupolis, Greece   
2 2025-04-30 23:48:10.940        4.1      17 km SE of Bodrum, Turkey   
3 2025-04-30 05:36:23.985        4.5        33 km WNW of Fry, Greece   
4 2025-04-27 18:09:38.109        4.7   11 km WNW of Doğanyol, Turkey   

   Longitude  Latitude  Depth (km)  
0    31.0858   37.1107     109.477  
1    23.9826   40.2494      10.000  
2    27.5581   36.9203      12.435  
3    26.5653   35.4778      10.000  
4    38.9278   38.3634      12.188  


In [5]:
import sys
import os

# Add project root to path
project_path = os.path.abspath("..")
sys.path.append(project_path)

# Ensure data folder exists
os.makedirs("data", exist_ok=True)

# Import your project functions
from src import (
    fetch_earthquake_data,
    clean_earthquake_data,
    perform_analysis,
    map_epicenters,
)

# Run the pipeline
fetch_earthquake_data("2024-03-01", "2024-03-31")
df = clean_earthquake_data()
print(f"✅ Cleaned {len(df)} earthquakes.")
df.head()





✅ Cleaned 488 earthquakes.


Unnamed: 0,time,place,mag,depth,longitude,latitude,type,id,url,region
0,2024-03-30 22:55:52.655,"77 km W of Panguna, Papua New Guinea",5.1,77.27,154.7807,-6.3333,earthquake,us7000m900,https://earthquake.usgs.gov/earthquakes/eventp...,"Panguna, Papua New Guinea"
1,2024-03-30 22:07:30.357,"165 km NNW of Houma, Tonga",4.8,10.0,-175.9989,-19.8295,earthquake,us7000ma6u,https://earthquake.usgs.gov/earthquakes/eventp...,"Houma, Tonga"
2,2024-03-30 21:32:36.907,"186 km NNW of Houma, Tonga",4.5,10.0,-176.178,-19.706,earthquake,us7000ma6q,https://earthquake.usgs.gov/earthquakes/eventp...,"Houma, Tonga"
3,2024-03-30 20:55:08.465,"46 km W of Port-Vila, Vanuatu",4.7,10.0,167.8799,-17.8037,earthquake,us7000ma6p,https://earthquake.usgs.gov/earthquakes/eventp...,"Port-Vila, Vanuatu"
4,2024-03-30 20:04:08.351,Kermadec Islands region,4.6,9.606,-177.8442,-27.8943,earthquake,us7000ma6x,https://earthquake.usgs.gov/earthquakes/eventp...,Kermadec Islands region


In [6]:
import sys
import os

# Dynamically find the project root and add it to the path
project_path = os.path.abspath("..")  # Adjust if your notebook is deeper
sys.path.append(project_path)

# ✅ Ensure data and output folders exist
os.makedirs("data", exist_ok=True)
os.makedirs("seismic-insight/outputs", exist_ok=True)

# ✅ Import functions from src
from src import (
    fetch_earthquake_data,
    clean_earthquake_data,
    perform_analysis,
    map_epicenters,
)

# ✅ Fetch and clean data
fetch_earthquake_data("2024-03-01", "2024-03-31")
df = clean_earthquake_data()
print(f"✅ Cleaned {len(df)} earthquakes.")

# ✅ Preview data
print(df.head())

# ✅ Rename columns for compatibility
df.rename(columns={
    "Magnitude": "mag",
    "Depth (km)": "depth",
    "Location": "place",
    "Latitude": "latitude",
    "Longitude": "longitude"
}, inplace=True)




✅ Cleaned 488 earthquakes.
                     time                                 place  mag   depth  \
0 2024-03-30 22:55:52.655  77 km W of Panguna, Papua New Guinea  5.1  77.270   
1 2024-03-30 22:07:30.357            165 km NNW of Houma, Tonga  4.8  10.000   
2 2024-03-30 21:32:36.907            186 km NNW of Houma, Tonga  4.5  10.000   
3 2024-03-30 20:55:08.465         46 km W of Port-Vila, Vanuatu  4.7  10.000   
4 2024-03-30 20:04:08.351               Kermadec Islands region  4.6   9.606   

   longitude  latitude        type          id  \
0   154.7807   -6.3333  earthquake  us7000m900   
1  -175.9989  -19.8295  earthquake  us7000ma6u   
2  -176.1780  -19.7060  earthquake  us7000ma6q   
3   167.8799  -17.8037  earthquake  us7000ma6p   
4  -177.8442  -27.8943  earthquake  us7000ma6x   

                                                 url  \
0  https://earthquake.usgs.gov/earthquakes/eventp...   
1  https://earthquake.usgs.gov/earthquakes/eventp...   
2  https://earthquake.u

In [7]:
import os
import sys

# Setup project path
project_path = os.path.abspath("..")
sys.path.append(project_path)

# Now import your functions
from src import fetch_earthquake_data, clean_earthquake_data

# Fetch and clean the data
fetch_earthquake_data("2024-03-01", "2024-03-31")
df = clean_earthquake_data()

# Rename columns to match analysis
df.rename(columns={
    "Magnitude": "mag",
    "Depth": "depth",
    "Location": "place",
    "Latitude": "latitude",
    "Longitude": "longitude"
}, inplace=True)

# Confirm columns
print("✅ Renamed columns:", df.columns.tolist())





✅ Renamed columns: ['time', 'place', 'mag', 'depth', 'longitude', 'latitude', 'type', 'id', 'url', 'region']


In [8]:
import matplotlib.pyplot as plt

def plot_magnitude_histogram(df):
    plt.figure(figsize=(8, 4))
    plt.hist(df["mag"], bins=10, edgecolor='black')
    plt.title("Distribution of Earthquake Magnitudes")
    plt.xlabel("Magnitude")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.tight_layout()
    plt.show()  # 👈 Important!



In [9]:
import seaborn as sns

def plot_top_regions(df):
    top_regions = df["region"].value_counts().head(5)
    plt.figure(figsize=(8, 4))
    sns.barplot(x=top_regions.values, y=top_regions.index)
    plt.title("Top 5 Earthquake Regions")
    plt.xlabel("Count")
    plt.ylabel("Region")
    plt.show()


In [10]:
def plot_magnitude_by_region(df):
    top = df[df["region"].isin(df["region"].value_counts().index[:5])]
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=top, x="region", y="mag")
    plt.title("Magnitude Distribution by Region")
    plt.ylabel("Magnitude")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Statistical Analysis

In [11]:
# ✅ Check original column names
print("BEFORE RENAME:", df.columns.tolist())

# ✅ Rename columns first
df.rename(columns={
    "Magnitude": "mag",
    "Depth (km)": "depth",
    "Location": "place",
    "Latitude": "latitude",
    "Longitude": "longitude"
}, inplace=True)

print("AFTER RENAME:", df.columns.tolist())

# ✅ Now do analysis
from scipy.stats import pearsonr, ttest_ind

# Correlation analysis between magnitude and depth
correlation, p_value = pearsonr(df['mag'], df['depth'])
print(f'📈 Correlation: {correlation:.3f}, P-value: {p_value:.3f}')

# Statistical test between California and Alaska
region1 = df[df['place'].str.contains('California', case=False, na=False)]['mag']
region2 = df[df['place'].str.contains('Alaska', case=False, na=False)]['mag']

if not region1.empty and not region2.empty:
    t_stat, p_val = ttest_ind(region1, region2, equal_var=False)
    print(f'📊 T-test: T-statistic = {t_stat:.3f}, P-value = {p_val:.3f}')
else:
    print("⚠️ Not enough data for one or both regions.")



BEFORE RENAME: ['time', 'place', 'mag', 'depth', 'longitude', 'latitude', 'type', 'id', 'url', 'region']
AFTER RENAME: ['time', 'place', 'mag', 'depth', 'longitude', 'latitude', 'type', 'id', 'url', 'region']
📈 Correlation: -0.009, P-value: 0.850
⚠️ Not enough data for one or both regions.


# Database Integration

In [12]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('../data/database.db')
cursor = conn.cursor()

# Create a table for earthquake data
cursor.execute('''CREATE TABLE IF NOT EXISTS earthquakes (
    Time TEXT, Magnitude REAL, Location TEXT, Longitude REAL, Latitude REAL, Depth REAL)''')

# Rename DataFrame columns to match the SQLite table schema
df.rename(columns={
    "time": "Time",
    "mag": "Magnitude",
    "place": "Location",
    "longitude": "Longitude",
    "latitude": "Latitude",
    "depth": "Depth"
}, inplace=True)

# Insert data into the table
df.to_sql('earthquakes', conn, if_exists='replace', index=False)

# Query the database
cursor.execute('SELECT * FROM earthquakes WHERE Magnitude > 5')
results = cursor.fetchall()
print(results[:5])

conn.close()

[('2024-03-30 22:55:52.655000', '77 km W of Panguna, Papua New Guinea', 5.1, 77.27, 154.7807, -6.3333, 'earthquake', 'us7000m900', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000m900', 'Panguna, Papua New Guinea'), ('2024-03-30 17:35:49.670000', '164 km W of Pangai, Tonga', 5.2, 10.0, -175.9239, -19.7019, 'earthquake', 'us7000m8z0', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000m8z0', 'Pangai, Tonga'), ('2024-03-30 16:44:25.049000', '171 km NNW of Houma, Tonga', 5.3, 10.249, -176.0022, -19.7708, 'earthquake', 'us7000m8yl', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000m8yl', 'Houma, Tonga'), ('2024-03-30 16:28:57.456000', 'Rat Islands, Aleutian Islands, Alaska', 5.8, 109.0, 178.3963, 52.1143, 'earthquake', 'us7000m8yd', 'https://earthquake.usgs.gov/earthquakes/eventpage/us7000m8yd', 'Rat Islands, Aleutian Islands, Alaska'), ('2024-03-30 09:48:17.120000', 'central East Pacific Rise', 5.1, 10.0, -103.9175, -4.0264, 'earthquake', 'us7000m8ww', 'https://eart

# LLM Integration

In [13]:
# Import the summarize_data function
try:
    from src.llm_helper import summarize_data
    print('LLM helper imported successfully.')
except ImportError as e:
    print(f'Error importing summarize_data: {e}')

Error importing summarize_data: No module named 'src.llm_helper'


In [None]:
# Attempt to summarize data
try:
    summary = summarize_data(df)
    print(summary)
except Exception as e:
    print(f'Error during summarization: {e}')

Error during summarization: name 'summarize_data' is not defined


In [18]:
# ✅ Rename columns only once
df.rename(columns={
    "Magnitude": "mag",
    "Depth": "depth",
    "Location": "place",
    "Latitude": "latitude",
    "Longitude": "longitude"
}, inplace=True)

print("✅ Columns after rename:", df.columns.tolist())

# ✅ Ensure output directory exists
import os
os.makedirs("seismic-insight/outputs", exist_ok=True)

# ✅ Generate the map
from src.visualization import map_epicenters
map_epicenters(df, output_path="seismic-insight/outputs/world_map.html")

print("🌍 Map saved successfully!")


✅ Columns after rename: ['Time', 'place', 'mag', 'depth', 'longitude', 'latitude', 'type', 'id', 'url', 'region']
🌍 Map saved successfully!
