In [6]:
import os

import pandas as pd
import pymongo
import requests
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Insert EONET data from NASA API to MongoDB

In [3]:
# Read the connection string from the MONGODB_URI environment variable so real
# credentials never have to be pasted into (and saved with) the notebook.
# The "your_uri" placeholder is kept as the fallback when the variable is unset.
uri = os.environ.get("MONGODB_URI", "your_uri")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Define the EONET API endpoint
eonet_url = 'https://eonet.gsfc.nasa.gov/api/v3/events'

database = client["Nasa_EONET"]
collection = database["natural_events"]

# Make a request to the EONET API. The explicit timeout keeps a stalled
# connection from blocking the kernel indefinitely.
response = requests.get(eonet_url, timeout=30)

if response.status_code == 200:
    data = response.json()
    # Tolerate a payload without an 'events' key instead of raising KeyError
    events = data.get('events', [])
    if events:
        # Insert the retrieved data into MongoDB
        # NOTE(review): re-running this cell inserts the same events again;
        # consider upserting on the event 'id' if the cell must be idempotent.
        collection.insert_many(events)
        print('Data inserted into MongoDB successfully.')
    else:
        # insert_many() rejects an empty document list, so skip the write
        print('EONET API returned no events; nothing was inserted.')
else:
    print('Failed to retrieve data from EONET API.')

Data inserted into MongoDB successfully.


# Connect to the MongoDB database

In [7]:
# Read the connection string from the MONGODB_URI environment variable so real
# credentials are not stored in the notebook; "your_uri" remains the fallback
# placeholder when the variable is unset.
uri = os.environ.get("MONGODB_URI", "your_uri")

client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to the admin database to confirm the deployment is reachable
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Kept non-fatal on purpose: the error is shown and the notebook continues,
    # so later cells will fail if the connection really is unavailable.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [9]:
# Handles to the database and collection that hold the EONET events
database = client.get_database("Nasa_EONET")
collection = database.get_collection("natural_events")

In [10]:
# Display the Database handle to confirm which deployment/database is in use.
# NOTE(review): the repr includes the cluster host names — clear this output
# before sharing the notebook if that is a concern.
database

Database(MongoClient(host=['ac-otesk9b-shard-00-01.c4dp6ex.mongodb.net:27017', 'ac-otesk9b-shard-00-00.c4dp6ex.mongodb.net:27017', 'ac-otesk9b-shard-00-02.c4dp6ex.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-11h46f-shard-0', tls=True, server_api=<pymongo.server_api.ServerApi object at 0x000001E27FBDB5E0>), 'Nasa_EONET')

# Fetch data from MongoDB using PySpark

In [3]:
# To start PySpark from the command line with the MongoDB Spark connector loaded:
# pyspark --packages org.mongodb.spark:mongo-spark-connector_2.12:3.0.1

In [1]:
# Initialise findspark before the pyspark imports in the following cell —
# presumably so the local Spark installation's `pyspark` package becomes
# importable in this kernel; TODO confirm for your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import split

In [11]:
# Build (or reuse) the Spark session with the MongoDB connector's read defaults.
# Spark config values must be plain strings, so pass the database/collection
# *names*; passing the pymongo Database/Collection objects themselves would
# hand the connector their repr text instead of a usable name.
spark = (
    SparkSession.builder
    .appName("MongoDBSparkExample")
    .config("spark.mongodb.input.uri", uri)
    .config("spark.mongodb.input.database", database.name)
    .config("spark.mongodb.input.collection", collection.name)
    .getOrCreate()
)

In [12]:
# Display the SparkSession object to confirm the session was created
spark

In [40]:
# Read the natural_events collection into a Spark DataFrame through the
# MongoDB connector; the source location is given explicitly as read options.
mongo_read_options = {
    "spark.mongodb.input.uri": uri,
    "spark.mongodb.input.database": "Nasa_EONET",
    "spark.mongodb.input.collection": "natural_events",
}
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .options(**mongo_read_options)
    .load()
)

In [None]:
# Close the pymongo client created earlier in the notebook.
# NOTE(review): this cell has no execution count, so it appears not to have
# been run in the recorded session.
client.close()

In [41]:
# Print the schema of the DataFrame loaded from MongoDB (nested arrays/structs)
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |-- closed: void (nullable = true)
 |-- description: string (nullable = true)
 |-- geometry: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- magnitudeValue: double (nullable = true)
 |    |    |-- magnitudeUnit: string (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- coordinates: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |-- id: string (nullable = true)
 |-- link: string (nullable = true)
 |-- sources: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- title: stri

# Data manipulation using PySpark

In [42]:
# Flatten `categories`: produce one row per array element, then lift the
# struct's `id` and `title` fields into their own top-level columns.
df = (
    df
    .withColumn('categories', f.explode('categories'))
    .withColumn('Category ID', f.col('categories')['id'])
    .withColumn('Category Title', f.col('categories')['title'])
)

In [43]:
# Flatten `geometry`: one row per geometry entry, then copy each struct field
# into a readable top-level column (output column name -> struct field name).
geometry_fields = {
    'Magnitude Value': 'magnitudeValue',
    'Magnitude Unit': 'magnitudeUnit',
    'Geometry Date': 'date',
    'Geometry Type': 'type',
    'Geometry Coordinates': 'coordinates',
}
df = df.withColumn('geometry', f.explode('geometry'))
for column_name, field_name in geometry_fields.items():
    df = df.withColumn(column_name, f.col('geometry').getItem(field_name))

In [44]:
# Flatten `sources`: one row per source entry, with its `id` and `url`
# exposed as top-level columns.
df = (
    df
    .withColumn('sources', f.explode('sources'))
    .withColumn('Source ID', f.col('sources')['id'])
    .withColumn('Source URL', f.col('sources')['url'])
)

In [46]:
# Remove columns that are no longer needed: the nested structs already
# flattened above, plus `description` and `closed`.
unneeded_columns = ['categories', 'geometry', 'sources', 'description', 'closed']
df = df.drop(*unneeded_columns)

In [47]:
# Split the 'Geometry Date' string on its 'T' separator: the part before it
# becomes `Date`, the part after it becomes `Time`.
date_time_parts = split(df['Geometry Date'], 'T')
df = (
    df
    .withColumn('Date', date_time_parts.getItem(0))
    .withColumn('Time', date_time_parts.getItem(1))
)

In [48]:
# The coordinates array is stored as [longitude, latitude]; expose each
# element as its own numeric column.
coordinates = df['Geometry Coordinates']
df = (
    df
    .withColumn('Longitude', coordinates.getItem(0))
    .withColumn('Latitude', coordinates.getItem(1))
)

In [50]:
# Remove the source columns now that Date/Time and Longitude/Latitude exist
split_source_columns = ['Geometry Date', 'Geometry Coordinates']
df = df.drop(*split_source_columns)

In [52]:
# Print the schema again to confirm the result of the explode/split/drop steps
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- id: string (nullable = true)
 |-- link: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Category ID: string (nullable = true)
 |-- Category Title: string (nullable = true)
 |-- Magnitude Value: double (nullable = true)
 |-- Magnitude Unit: string (nullable = true)
 |-- Geometry Type: string (nullable = true)
 |-- Source ID: string (nullable = true)
 |-- Source URL: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)



In [53]:
# Convert the flattened PySpark DataFrame into a pandas DataFrame (`eonet_df`)
# for the pandas-based steps that follow.
eonet_df = df.toPandas()

In [54]:
# Preview the first rows of the pandas DataFrame
eonet_df.head()

Unnamed: 0,_id,id,link,title,Category ID,Category Title,Magnitude Value,Magnitude Unit,Geometry Type,Source ID,Source URL,Date,Time,Longitude,Latitude
0,"(6506a0255457d4ab94009af1,)",EONET_6425,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,"Kilauea Volcano, Hawaii",volcanoes,Volcanoes,,,Point,SIVolcano,https://volcano.si.edu/volcano.cfm?vn=332010,2023-09-10,00:00:00Z,-155.287,19.421
1,"(6506a0255457d4ab94009af2,)",EONET_6423,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,Iceberg D33A,seaLakeIce,Sea and Lake Ice,340.0,NM^2,Point,NATICE,https://usicecenter.gov/pub/Iceberg_Tabular.csv,2023-09-08,00:00:00Z,17.14,-69.38
2,"(6506a0255457d4ab94009af2,)",EONET_6423,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,Iceberg D33A,seaLakeIce,Sea and Lake Ice,340.0,NM^2,Point,NATICE,https://usicecenter.gov/pub/Iceberg_Tabular.csv,2023-09-15,00:00:00Z,14.37,-69.23
3,"(6506a0255457d4ab94009af3,)",EONET_6424,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,Iceberg D33B,seaLakeIce,Sea and Lake Ice,252.0,NM^2,Point,NATICE,https://usicecenter.gov/pub/Iceberg_Tabular.csv,2023-09-08,00:00:00Z,18.3,-69.64
4,"(6506a0255457d4ab94009af3,)",EONET_6424,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,Iceberg D33B,seaLakeIce,Sea and Lake Ice,252.0,NM^2,Point,NATICE,https://usicecenter.gov/pub/Iceberg_Tabular.csv,2023-09-15,00:00:00Z,16.6,-69.45


In [56]:
# Stop the Spark session; the remaining cells work only on the pandas
# DataFrame `eonet_df`.
spark.stop()

In [57]:
# Summarise column dtypes and non-null counts of the pandas DataFrame
eonet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1886 entries, 0 to 1885
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   _id              1886 non-null   object 
 1   id               1886 non-null   object 
 2   link             1886 non-null   object 
 3   title            1886 non-null   object 
 4   Category ID      1886 non-null   object 
 5   Category Title   1886 non-null   object 
 6   Magnitude Value  1419 non-null   float64
 7   Magnitude Unit   1419 non-null   object 
 8   Geometry Type    1886 non-null   object 
 9   Source ID        1886 non-null   object 
 10  Source URL       1886 non-null   object 
 11  Date             1886 non-null   object 
 12  Time             1886 non-null   object 
 13  Longitude        1886 non-null   float64
 14  Latitude         1886 non-null   float64
dtypes: float64(3), object(12)
memory usage: 221.1+ KB


In [58]:
# Write the flattened events to a CSV file, leaving out the pandas index
csv_path = 'eonet_data.csv'
eonet_df.to_csv(csv_path, index=False)