### Import Required Libraries and Set Up Environment Variables

In [30]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [31]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/LTVg5VoACPM7ySf79Ht7BVELqikDF2F567fAzu6z"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
CME_url = ('https://api.nasa.gov/DONKI/CME?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd&api_key=DEMO_KEY')
#CME_url = base_url + '?Format' + 'CME'
#CME_url
#print(json.dumps(CME_url, indent=4))

In [32]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(CME_url)
cme_response


<Response [200]>

In [33]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()
cme_json

[{'activityID': '2024-09-21T11:12:00-CME-001',
  'catalog': 'M2M_CATALOG',
  'startTime': '2024-09-21T11:12Z',
  'instruments': [{'displayName': 'SOHO: LASCO/C2'},
   {'displayName': 'SOHO: LASCO/C3'},
   {'displayName': 'STEREO A: SECCHI/COR2'}],
  'sourceLocation': '',
  'activeRegionNum': None,
  'note': 'Thin diffuse CME seen to the NW in all coronagraphs. The source is best observed in SDO AIA 171 as field line movement on or just beyond the NW limb starting around 2024-09-21T10:39Z.',
  'submissionTime': '2024-09-21T20:17Z',
  'versionId': 1,
  'link': 'https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/33526/-1',
  'cmeAnalyses': [{'isMostAccurate': True,
    'time21_5': '2024-09-21T17:07Z',
    'latitude': 29.0,
    'longitude': 99.0,
    'halfAngle': 10.0,
    'speed': 651.0,
    'type': 'C',
    'featureCode': 'LE',
    'imageType': 'running difference',
    'measurementTechnique': 'SWPC_CAT',
    'note': 'Triangulated measurement between SOHO LASCO C2/C3 and STEREO COR2A, wi

In [34]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json, indent=4))

[
    {
        "activityID": "2024-09-21T11:12:00-CME-001",
        "catalog": "M2M_CATALOG",
        "startTime": "2024-09-21T11:12Z",
        "instruments": [
            {
                "displayName": "SOHO: LASCO/C2"
            },
            {
                "displayName": "SOHO: LASCO/C3"
            },
            {
                "displayName": "STEREO A: SECCHI/COR2"
            }
        ],
        "sourceLocation": "",
        "activeRegionNum": null,
        "note": "Thin diffuse CME seen to the NW in all coronagraphs. The source is best observed in SDO AIA 171 as field line movement on or just beyond the NW limb starting around 2024-09-21T10:39Z.",
        "submissionTime": "2024-09-21T20:17Z",
        "versionId": 1,
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/33526/-1",
        "cmeAnalyses": [
            {
                "isMostAccurate": true,
                "time21_5": "2024-09-21T17:07Z",
                "latitude": 29.0,
            

In [35]:
# Convert cme_json to a Pandas DataFrame 
cme_df = pd.json_normalize(cme_json)
cme_df

# Keep only the columns: activityID, startTime, linkedEvents
cme_sorted = cme_df[['activityID', 'startTime', 'linkedEvents']]
cme_sorted

Unnamed: 0,activityID,startTime,linkedEvents
0,2024-09-21T11:12:00-CME-001,2024-09-21T11:12Z,
1,2024-09-21T16:12:00-CME-001,2024-09-21T16:12Z,
2,2024-09-21T20:00:00-CME-001,2024-09-21T20:00Z,
3,2024-09-22T07:12:00-CME-001,2024-09-22T07:12Z,
4,2024-09-22T14:48:00-CME-001,2024-09-22T14:48Z,
...,...,...,...
112,2024-10-19T07:12:00-CME-001,2024-10-19T07:12Z,[{'activityID': '2024-10-19T06:48:00-FLR-001'}]
113,2024-10-19T09:24:00-CME-001,2024-10-19T09:24Z,
114,2024-10-19T09:48:00-CME-001,2024-10-19T09:48Z,
115,2024-10-19T19:00:00-CME-001,2024-10-19T19:00Z,


In [36]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_cleaned = cme_sorted[cme_sorted['linkedEvents'].notna()]
#cme_cleaned = cme_sorted.dropna
cme_cleaned.head(30)

Unnamed: 0,activityID,startTime,linkedEvents
6,2024-09-22T21:36:00-CME-001,2024-09-22T21:36Z,[{'activityID': '2024-09-22T21:12:00-FLR-001'}]
9,2024-09-23T20:48:00-CME-001,2024-09-23T20:48Z,[{'activityID': '2024-09-28T07:00:00-IPS-001'}]
15,2024-09-26T20:00:00-CME-001,2024-09-26T20:00Z,[{'activityID': '2024-09-26T19:41:00-FLR-001'}]
20,2024-09-29T06:00:00-CME-001,2024-09-29T06:00Z,[{'activityID': '2024-09-29T05:26:00-FLR-001'}]
30,2024-10-01T01:09:00-CME-001,2024-10-01T01:09Z,[{'activityID': '2024-09-30T23:37:00-FLR-001'}]
34,2024-10-01T23:09:00-CME-001,2024-10-01T23:09Z,[{'activityID': '2024-10-01T21:58:00-FLR-001'}]
35,2024-10-02T06:36:00-CME-001,2024-10-02T06:36Z,[{'activityID': '2024-10-02T05:30:00-FLR-001'}]
36,2024-10-02T14:24:00-CME-001,2024-10-02T14:24Z,[{'activityID': '2024-10-02T13:22:00-FLR-001'}]
40,2024-10-03T12:48:00-CME-001,2024-10-03T12:48Z,[{'activityID': '2024-10-03T12:08:00-FLR-001'}...
43,2024-10-03T20:36:00-CME-001,2024-10-03T20:36Z,[{'activityID': '2024-10-03T20:09:00-FLR-001'}...


In [37]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
for i in range(len(cme_cleaned)):
    #print(cme_cleaned.iloc[i])
    activity_id = cme_cleaned.iloc[i]['activityID']
    start_time = cme_cleaned.iloc[i]['startTime']
    linked_events = cme_cleaned.iloc[i]['linkedEvents']
# and then iterates over the values in 'linkedEvents' 
linkedEvents_value = cme_cleaned.iloc[3]['activityID']
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
if isinstance(linked_events, list):
        for event in linked_events:
            expanded_rows.append({
                'activityID': activity_id,
                'startTime': start_time,
                'linkedEvent': event
            })
    # Iterate over each dictionary in the list
    
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        #activity_id.append(cme_response: "")
        #start_time.append(cme_response: "")

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)
expanded_df.head(10)

Unnamed: 0,activityID,startTime,linkedEvent
0,2024-10-19T07:12:00-CME-001,2024-10-19T07:12Z,{'activityID': '2024-10-19T06:48:00-FLR-001'}


In [38]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors

        # Log the error or print it for debugging




In [39]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:


In [40]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:


In [41]:
# print out the datatype of each column in this DataFrame:


In [42]:
# Convert the 'GST_ActivityID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_CME and activityID to cmeID

# Drop linkedEvents

# Verify that all steps were executed correctly


In [43]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  


### GST Data

In [44]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST


In [45]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response


In [46]:
# Convert the response variable to json and store it as a variable named gst_json

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data


In [47]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: activityID, startTime, linkedEvents


In [48]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME


In [49]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.


In [50]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [51]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


In [52]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [53]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [54]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [55]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [56]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [57]:
# Export data to CSV without the index
