In [2]:
import sys
sys.path.append('../')

import pandas as pd
import sqlite3
from tqdm import tqdm
from src.cmesrc.config import CMESRCV2_DB, CMESRC_DB, MAJUMDAR_CATALOGUE

In [42]:
new_conn = sqlite3.connect(CMESRCV2_DB)
new_conn.execute("PRAGMA foreign_keys = ON")
new_cur = new_conn.cursor()

In [44]:
new_conn.executescript("""
-- HARPS: General information about each HARP region
CREATE TABLE HARPS (
  harpnum INTEGER PRIMARY KEY,                    -- Unique identifier for each HARP region
  start TEXT NOT NULL REFERENCES IMAGES(timestamp), -- Start timestamp of the HARP region
  end TEXT NOT NULL REFERENCES IMAGES(timestamp),  -- End timestamp of the HARP region
  pix_width INTEGER, -- Refers to the fixed width in pixel values used for cutouts
  pix_height INTEGER -- Refers to the fixed height in pixel values used for cutouts
);

-- CMES: Information about Coronal Mass Ejections
CREATE TABLE CMES (
  cme_id INTEGER NOT NULL PRIMARY KEY,            -- Unique identifier for each CME
  cme_date TEXT NOT NULL,                         -- Date and time of the CME
  cme_pa REAL,                                    -- Position angle of the CME
  cme_width REAL NOT NULL,                        -- Width of the CME
  cme_halo INTEGER,                               -- Indicator for Halo CMEs (1 for Halo, else NULL)
  cme_seen_in INTEGER NOT NULL,                   -- Where the CME was observed (e.g., C2, C3)
  cme_three_points INTEGER NOT NULL,              -- Number of observed points for the CME
  cme_quality INTEGER NOT NULL,                   -- Quality rating for the CME observation
  image_timestamp TEXT REFERENCES IMAGES (timestamp), -- Timestamp of the associated image
  CHECK (
    ((cme_pa IS NULL) = (cme_halo = 1))           -- Ensure Halo CMEs don't have a position angle
  )
);

-- FLARES: Information about solar flares
CREATE TABLE FLARES (
  flare_id INTEGER NOT NULL PRIMARY KEY,          -- Unique identifier for each flare
  harpnum INTEGER REFERENCES HARPS (harpnum),     -- Associated HARP region
  flare_date TEXT NOT NULL,                       -- Date and time of the flare
  flare_lon REAL,                                 -- Longitude of the flare
  flare_lat REAL,                                 -- Latitude of the flare
  flare_class_score REAL NOT NULL,                -- Score indicating the class of the flare
  flare_class TEXT NOT NULL,                      -- Class of the flare (e.g., M1, X2)
  flare_ar INTEGER,                               -- Active Region number of the flare
  flare_ar_source TEXT,                           -- Source of the Active Region number
  flare_verification TEXT,                        -- Verification status of the flare information
  image_timestamp TEXT REFERENCES IMAGES(timestamp), -- Timestamp of the associated image
  CHECK (
    (flare_ar IS NULL AND flare_lon IS NOT NULL AND flare_lat IS NOT NULL) OR
    (flare_ar IS NOT NULL)                        -- Ensure either flare_ar or both flare_lon and flare_lat are provided
  )
);

-- DIMMINGS: Information about solar dimmings
CREATE TABLE DIMMINGS (
  dimming_id INTEGER NOT NULL PRIMARY KEY,        -- Unique identifier for each dimming event
  harpnum INTEGER REFERENCES HARPS (harpnum),     -- Associated HARP region
  harps_dimming_dist REAL NOT NULL,               -- Distance of the dimming from the HARP region
  dimming_start_date TEXT NOT NULL,               -- Start date and time of the dimming
  dimming_peak_date TEXT NOT NULL,                -- Peak date and time of the dimming
  dimming_lon REAL NOT NULL,                      -- Longitude of the dimming event
  dimming_lat REAL NOT NULL,                      -- Latitude of the dimming event
  image_timestamp TEXT REFERENCES IMAGES(timestamp) -- Timestamp of the associated image
);

-- CMES_HARPS_SPATIALLY_CONSIST: Associations between CMEs and HARPNUMs based on spatial consistency
CREATE TABLE CMES_HARPS_SPATIALLY_CONSIST (
  harpnum INTEGER REFERENCES HARPS (harpnum),     -- Associated HARP region
  cme_id INTEGER REFERENCES CMES (cme_id),        -- Associated CME
  PRIMARY KEY (harpnum, cme_id)                   -- Unique pairing of HARP and CME
);

-- CMES_HARPS_EVENTS: Events linked to CMEs and HARPNUMs
CREATE TABLE CMES_HARPS_EVENTS (
  harpnum INTEGER,                                -- Associated HARP region
  cme_id INTEGER,                                 -- Associated CME
  flare_id INTEGER REFERENCES FLARES (flare_id),  -- Associated flare event
  flare_hours_diff INTEGER NOT NULL,              -- Time difference between CME and flare
  dimming_id INTEGER REFERENCES DIMMINGS (dimming_id), -- Associated dimming event
  dimming_hours_diff INTEGER NOT NULL,            -- Time difference between CME and dimming
  PRIMARY KEY (harpnum, cme_id),                  -- Unique pairing of HARP and CME
  FOREIGN KEY (harpnum, cme_id) REFERENCES CMES_HARPS_SPATIALLY_CONSIST (harpnum, cme_id) -- Foreign key reference to ensure valid associations
);

-- IMAGES: Metadata about images in the SDOML dataset
CREATE TABLE IMAGES (
  timestamp TEXT NOT NULL UNIQUE,                 -- Unique timestamp for each image
  year INTEGER NOT NULL,                          -- Year the image was captured
  month INTEGER NOT NULL,                         -- Month the image was captured
  day INTEGER NOT NULL,                           -- Day of the month the image was captured
  hour INTEGER NOT NULL,                          -- Hour of the day the image was captured
  minute INTEGER NOT NULL,                        -- Minute of the hour the image was captured
  second INTEGER NOT NULL,                        -- Second of the minute the image was captured
  idx INTEGER NOT NULL                            -- Index to access the image in the zarr format
);

-- harps_bbox: Contains information related to the HARPS bounding boxes and their relationships with various solar phenomena such as flares, dimmings, and CMEs.
CREATE TABLE HARPS_BBOX (
    -- Reference to the harpnum column in the harps table
    harpnum INTEGER REFERENCES HARPS (harpnum), 
    -- Reference to the timestamp column in the images table
    timestamp TEXT REFERENCES IMAGES (timestamp), 
    -- Minimum longitude of BBOX
    LONDTMIN REAL, 
    -- Maximum longitude of BBOX
    LONDTMAX REAL, 
    -- Minimum latitude of BBOX
    LATDTMIN REAL, 
    -- Maximum latitude of BBOX
    LATDTMAX REAL, 
    -- Is Rotated Bounding Box? Specifies if the BBOX is calculated using differential solar rotation because it was missing from the SHARP dataset
    IRBB INTEGER, 
    -- Reference to the flare_id column in the flares table. Represents the ID of the previous flare event.
    prev_flare_id INTEGER REFERENCES FLARES(flare_id), 
    -- Reference to the flare_id column in the flares table. Represents the ID of the subsequent flare event.
    next_flare_id INTEGER REFERENCES FLARES(flare_id), 
    -- Hours until the previous flare. Default value is -1.
    prev_flare_hours REAL NOT NULL DEFAULT -1, 
    -- Hours until the next flare. Default value is -1.
    next_flare_hours REAL NOT NULL DEFAULT -1, 
    -- Reference to the dimming_id column in the dimmings table. Represents the ID of the previous dimming event.
    prev_dimming_id INTEGER REFERENCES DIMMINGS(dimming_id), 
    -- Reference to the dimming_id column in the dimmings table. Represents the ID of the subsequent dimming event.
    next_dimming_id INTEGER REFERENCES DIMMINGS(dimming_id), 
    -- Hours until the previous dimming event. Default value is -1.
    prev_dimming_hours REAL NOT NULL DEFAULT -1, 
    -- Hours until the next dimming event. Default value is -1.
    next_dimming_hours REAL NOT NULL DEFAULT -1, 
    -- Reference to the cme_id column in the cmes table, denoting the previous CME for which the region was spatially consistent.
    prev_present_at_cme_id INTEGER REFERENCES CMES(cme_id), 
    -- Reference to the cme_id column in the cmes table, denoting the next CME for which the region will be spatially consistent.
    next_present_at_cme_id INTEGER REFERENCES CMES(cme_id), 
    -- Hours until the previous CME presence. Default value is -1.
    prev_present_at_cme_hours REAL NOT NULL DEFAULT -1, 
    -- Hours until the next CME presence. Default value is -1.
    next_present_at_cme_hours REAL NOT NULL DEFAULT -1, 
    -- Hours until the closest subsequent flare related temporally to a CME where the region was spatially consistent. Default value is -1.
    closest_flare_next_present_at_cme_hours INTEGER NOT NULL DEFAULT -1, 
    -- Reference to the flare_id column in the flares table, representing the closest flare related temporally to a CME where the region was spatially consistent.
    closest_flare_next_present_at_cme_id INTEGER REFERENCES FLARES(flare_id), 
    -- Hours until the closest subsequent dimming related temporally to a CME where the region was spatially consistent.
    closest_dimming_next_present_at_cme_hours INTEGER NOT NULL DEFAULT -1, 
    -- Reference to the dimming_id column in the dimmings table, representing the closest dimming related temporally to a CME where the region was spatially consistent.
    closest_dimming_next_present_at_cme_id INTEGER REFERENCES DIMMINGS(dimming_id),
    -- Unique key for the table based on harpnum and timestamp
    PRIMARY KEY (harpnum, timestamp)
);

-- harps_pixel_bbox: Contains information related to the pixel boundaries and center of the HARPS bounding boxes in SDOML images.
CREATE TABLE HARPS_PIXEL_BBOX (
    -- Reference to the harpnum column in the harps table
    harpnum INTEGER, 
    -- Reference to the timestamp column in the images table
    timestamp TEXT, 
    -- Minimum x-coordinate of the bounding box; value must be between 0 and 511
    x_min INTEGER NOT NULL CHECK (x_min >= 0 AND x_min <= 511), 
    -- Maximum x-coordinate of the bounding box; value must be between 0 and 511
    x_max INTEGER NOT NULL CHECK (x_max >= 0 AND x_max <= 511), 
    -- Minimum y-coordinate of the bounding box; value must be between 0 and 511
    y_min INTEGER NOT NULL CHECK (y_min >= 0 AND y_min <= 511), 
    -- Maximum y-coordinate of the bounding box; value must be between 0 and 511
    y_max INTEGER NOT NULL CHECK (y_max >= 0 AND y_max <= 511), 
    -- x-coordinate of the center of the bounding box; value must be between 0 and 511
    x_cen INTEGER NOT NULL CHECK (x_cen >= 0 AND x_cen <= 511), 
    -- y-coordinate of the center of the bounding box; value must be between 0 and 511
    y_cen INTEGER NOT NULL CHECK (y_cen >= 0 AND y_cen <= 511), 
    -- Unique key for the table based on harpnum and timestamp
    PRIMARY KEY (harpnum, timestamp),
    -- References the harps_bbox table to maintain a relationship with HARPS bounding box information
    FOREIGN KEY (harpnum, timestamp) REFERENCES HARPS_BBOX (harpnum, timestamp)
);

-- FINAL_CME_HARP_ASSOCIATIONS: Finalized associations between CMEs and HARPNUMs
CREATE TABLE FINAL_CME_HARP_ASSOCIATIONS (
  cme_id INTEGER UNIQUE NOT NULL,                 -- Unique identifier for each CME
  harpnum INTEGER NOT NULL,                       -- Associated HARP region
  association_method TEXT NOT NULL,               -- Method used to determine the association
  verification_score REAL,                        -- Confidence score of the association
  independent_verified INTEGER NOT NULL DEFAULT 0, -- Verification level for association (0 if only by me, 1 if verified by external)
  PRIMARY KEY (cme_id, harpnum)                   -- Unique pairing of CME and HARP
);
                       """)

<sqlite3.Cursor at 0x7fa343872b40>

In [45]:
new_conn.commit()
new_conn.close()

In [46]:
import sqlite3

def migrate_database(old_db_path, new_db_path):
    # Connect to the old and new databases
    old_conn = sqlite3.connect(old_db_path)
    new_conn = sqlite3.connect(new_db_path)

    old_cur = old_conn.cursor()
    new_cur = new_conn.cursor()

    # Migrate HARPS
    old_cur.execute("SELECT harpnum, start, end FROM harps")
    harps = old_cur.fetchall()
    new_cur.executemany("INSERT INTO HARPS (harpnum, start, end) VALUES (?, ?, ?)", harps)

    # Migrate CMES
    old_cur.execute("SELECT cme_id, cme_date, cme_pa, cme_width, cme_halo, cme_seen_in, cme_three_points, cme_quality, image_timestamp FROM cmes")
    cmes = old_cur.fetchall()
    new_cur.executemany("INSERT INTO CMES VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", cmes)

    # Migrate FLARES
    old_cur.execute("SELECT flare_id, HARPNUM, flare_date, flare_lon, flare_lat, flare_class_score, flare_class, flare_ar, flare_ar_source, flare_verification, image_timestamp FROM flares")
    flares = old_cur.fetchall()
    new_cur.executemany("INSERT INTO FLARES VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", flares)

    # Migrate DIMMINGS
    old_cur.execute("SELECT dimming_id, harpnum, harps_dimming_dist, dimming_start_date, dimming_peak_date, dimming_lon, dimming_lat, image_timestamp FROM dimmings")
    dimmings = old_cur.fetchall()
    new_cur.executemany("INSERT INTO DIMMINGS VALUES (?, ?, ?, ?, ?, ?, ?, ?)", dimmings)

    # Migrate CMES_HARPS_SPATIALLY_CONSIST (This is inferred from the old schema cmes_harpnums table)
    old_cur.execute("SELECT harpnum, cme_id FROM spat_consist_harpnums")
    cmes_harps = old_cur.fetchall()
    new_cur.executemany("INSERT INTO CMES_HARPS_SPATIALLY_CONSIST (harpnum, cme_id) VALUES (?, ?)", cmes_harps)

    # Migrate IMAGES
    old_cur.execute("SELECT timestamp, year, month, day, hour, minute, second, idx FROM images")
    images = old_cur.fetchall()
    new_cur.executemany("INSERT INTO IMAGES VALUES (?, ?, ?, ?, ?, ?, ?, ?)", images)

    # Migrate HARPS_BBOX
    old_cur.execute("SELECT harpnum, timestamp, LONDTMIN, LONDTMAX, LATDTMIN, LATDTMAX, IRBB, prev_flare_id, next_flare_id, prev_flare_hours, next_flare_hours, prev_dimming_id, next_dimming_id, prev_dimming_hours, next_dimming_hours, prev_present_at_cme_id, next_present_at_cme_id, prev_present_at_cme_hours, next_present_at_cme_hours, closest_flare_next_present_at_cme_hours, closest_flare_next_present_at_cme_id, closest_dimming_next_present_at_cme_hours, closest_dimming_next_present_at_cme_id FROM harps_bbox")
    harps_bbox_data = old_cur.fetchall()
    new_cur.executemany("INSERT INTO HARPS_BBOX VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", harps_bbox_data)

    # Migrate HARPS_PIXEL_BBOX
    old_cur.execute("SELECT harpnum, timestamp, x_min, x_max, y_min, y_max, x_cen, y_cen FROM harps_pixel_bbox")
    harps_pixel_bbox_data = old_cur.fetchall()
    new_cur.executemany("INSERT INTO HARPS_PIXEL_BBOX VALUES (?, ?, ?, ?, ?, ?, ?, ?)", harps_pixel_bbox_data)

    # Finalize migration by committing changes and closing connections
    new_conn.commit()
    old_conn.close()
    new_conn.close()

if __name__ == "__main__":
    migrate_database(CMESRC_DB, CMESRCV2_DB)

In [3]:
# Now finding the events

from datetime import datetime
from tqdm import tqdm
import bisect

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

association_threshold = 2

results = {}

harps = cur.execute("SELECT DISTINCT harpnum from HARPS").fetchall()

conn.execute("DELETE FROM CMES_HARPS_EVENTS")

for harp in tqdm(harps):
    harp = harp[0]

    flare_data = cur.execute("SELECT image_timestamp, flare_id, flare_class_score FROM FLARES WHERE harpnum = ? AND flare_verification != 'Non-verified'", (harp,)).fetchall()
    flare_timestamps, flare_ids, flare_class_scores = zip(*flare_data) if flare_data else ([], [], [])

    dimming_data = cur.execute("SELECT image_timestamp, dimming_id FROM DIMMINGS WHERE harpnum = ?", (harp,)).fetchall()
    dimming_timestamps, dimming_ids = zip(*dimming_data) if dimming_data else ([], [])

    present_at_cme_data = cur.execute("""
        SELECT c.image_timestamp, c.cme_id from CMES_HARPS_SPATIALLY_CONSIST as sch
        INNER JOIN CMES as c
        ON sch.cme_id = c.cme_id
        WHERE sch.harpnum = ? 
    """, (harp,)).fetchall()
    present_at_cme_timestamps, present_at_cme_ids = zip(*present_at_cme_data) if present_at_cme_data else ([], [])

    # Convert to datetime objects
    flare_timestamps = [datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in flare_timestamps]
    dimming_timestamps = [datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in dimming_timestamps]
    present_at_cme_timestamps = [datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in present_at_cme_timestamps]

    for cme_timestamp, cme_id in zip(present_at_cme_timestamps, present_at_cme_ids):

        # For Flares
        closest_flare_id = None
        closest_flare_hours_diff = -1
        if flare_timestamps:
            flare_index = bisect.bisect_right(flare_timestamps, cme_timestamp)
            matching_flares = []

            while flare_index > 0:
                flare_index -= 1
                flare_timestamp = flare_timestamps[flare_index]
                hour_diff = (cme_timestamp - flare_timestamp).total_seconds() / 3600
                if 0 < hour_diff < association_threshold:
                    matching_flares.append((hour_diff, flare_ids[flare_index], flare_class_scores[flare_index]))
                else:
                    break

            if matching_flares:
                closest_flare = sorted(matching_flares, key=lambda x: (-x[2], x[0]))[0]
                closest_flare_hours_diff, closest_flare_id, _ = closest_flare

        # For Dimmings
        closest_dimming_id = None
        closest_dimming_hours_diff = -1
        if dimming_timestamps:
            dimming_index = bisect.bisect_right(dimming_timestamps, cme_timestamp)
            matching_dimmings = []

            while dimming_index > 0:
                dimming_index -= 1
                dimming_timestamp = dimming_timestamps[dimming_index]
                hour_diff = (cme_timestamp - dimming_timestamp).total_seconds() / 3600
                if 0 < hour_diff < association_threshold:
                    matching_dimmings.append((hour_diff, dimming_ids[dimming_index]))
                else:
                    break

            if matching_dimmings:
                closest_dimming = min(matching_dimmings, key=lambda x: x[0])
                closest_dimming_hours_diff, closest_dimming_id = closest_dimming

        results[(harp, cme_id)] = {
            'closest_flare_id': closest_flare_id,
            'closest_flare_hours_diff': closest_flare_hours_diff,
            'closest_dimming_id': closest_dimming_id,
            'closest_dimming_hours_diff': closest_dimming_hours_diff
        }

# Now write the results to the databaset in the existing table
# -- CMES_HARPS_EVENTS: Events linked to CMEs and HARPNUMs
# CREATE TABLE CMES_HARPS_EVENTS (
#   harpnum INTEGER,                                -- Associated HARP region
#   cme_id INTEGER,                                 -- Associated CME
#   flare_id INTEGER REFERENCES FLARES (flare_id),  -- Associated flare event
#   flare_hours_diff INTEGER NOT NULL,              -- Time difference between CME and flare
#   dimming_id INTEGER REFERENCES DIMMINGS (dimming_id), -- Associated dimming event
#   dimming_hours_diff INTEGER NOT NULL,            -- Time difference between CME and dimming
#   PRIMARY KEY (harpnum, cme_id),                  -- Unique pairing of HARP and CME
#   FOREIGN KEY (harpnum, cme_id) REFERENCES CMES_HARPS_SPATIALLY_CONSIST (harpnum, cme_id) -- Foreign key reference to ensure valid associations
# );

conn.execute("DELETE FROM CMES_HARPS_EVENTS")

for (harp, cme_id), event in tqdm(results.items()):
    conn.execute("""
        INSERT INTO CMES_HARPS_EVENTS (harpnum, cme_id, flare_id, flare_hours_diff, dimming_id, dimming_hours_diff)
        VALUES (?, ?, ?, ?, ?, ?)
    """, (harp, cme_id, event['closest_flare_id'], event['closest_flare_hours_diff'], event['closest_dimming_id'], event['closest_dimming_hours_diff']))

conn.commit()
conn.close()

  0%|          | 0/4098 [00:00<?, ?it/s]

100%|██████████| 4098/4098 [00:01<00:00, 2060.88it/s]
100%|██████████| 14968/14968 [00:00<00:00, 528609.79it/s]


In [4]:
# Now in order to find every match we can choose all rows for each CME and sort
# first by which have a dimming and then by flare class

import pandas as pd

# Let's see how many potential matches there are

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

cur.execute("SELECT COUNT(DISTINCT cme_id) from CMES_HARPS_SPATIALLY_CONSIST")

print(f"There are {cur.fetchone()[0]} potential matches")

unique_cmes = cur.execute("SELECT DISTINCT cme_id from CMES_HARPS_SPATIALLY_CONSIST").fetchall()

# Now let's get the matches

def get_verfification_level(has_dimming, has_flare, flare_class, flare_threshold=25):
    if has_dimming:
        if has_flare:
            if flare_class > flare_threshold:
                return 1
            else:
                return 3
        else:
            return 5
    else:
        if has_flare:
            if flare_class > flare_threshold:
                return 2
            else:
                return 4
        else:
            return -1

matches = dict()

for unique_cme in tqdm(unique_cmes):
    query = f"""
    SELECT CHSC.cme_id, CHSC.harpnum, CHE.flare_id, CHE.dimming_id, F.flare_class_score from CMES_HARPS_SPATIALLY_CONSIST CHSC
    LEFT JOIN CMES_HARPS_EVENTS CHE ON CHSC.cme_id = CHE.cme_id AND CHSC.harpnum = CHE.harpnum
    LEFT JOIN FLARES F ON CHE.flare_id = F.flare_id
    WHERE CHSC.cme_id = {unique_cme[0]}
    """

    df = pd.read_sql_query(query, conn)

    # Need to replace dimming_id here with either 0 or 1

    df['has_dimming'] = df['dimming_id'].apply(lambda x: 0 if pd.isnull(x) else 1)

    sorted_df = df.sort_values(by=['has_dimming', 'flare_class_score'], ascending=False)

    top_choice = sorted_df.iloc[0]

    # This takes care of the edge case where there's a region with dimming but no flare
    # and a region with a flare but no dimming. We want to choose the one with the flare
    # over the one with the dimming
    if top_choice['has_dimming'] and pd.isnull(top_choice['flare_id']):
        # Select first where not has_dimming if exists. Take care of the case when there isn't an alternative
        no_dimming = sorted_df[sorted_df['has_dimming'] == 0]

        if len(no_dimming) > 0:
            alternative_choice = no_dimming.iloc[0]

            if not pd.isnull(alternative_choice["flare_id"]):
                top_choice = alternative_choice

    has_dimming = top_choice['has_dimming']
    has_flare = 0 if pd.isnull(top_choice['flare_id']) else 1
    flare_class = None if pd.isnull(top_choice['flare_class_score']) else top_choice['flare_class_score']
    harpnum = top_choice['harpnum']

    verification_level = get_verfification_level(has_dimming, has_flare, flare_class)

    if verification_level != -1:
        # There's a match!
        matches[unique_cme[0]] = {
            'harpnum': harpnum,
            'verification_level': verification_level
        }

conn.close()

There are 3752 potential matches


  0%|          | 0/3752 [00:00<?, ?it/s]

100%|██████████| 3752/3752 [00:04<00:00, 791.82it/s]


In [5]:
# Now add this to the database

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

conn.execute("DELETE FROM FINAL_CME_HARP_ASSOCIATIONS")

# Iterate through key, value pairs of matches
for cme_id, values in tqdm(matches.items()):
    harpnum = int(values["harpnum"])
    verification_score = int(values["verification_level"])

    association_method = "automatic"
    independent_verfied = 0

    # Add to database
    cur.execute("INSERT INTO FINAL_CME_HARP_ASSOCIATIONS (cme_id, harpnum, verification_score, association_method, independent_verified) VALUES (?, ?, ?, ?, ?)", (cme_id, harpnum, verification_score, association_method, independent_verfied))

100%|██████████| 1083/1083 [00:00<00:00, 354849.72it/s]


In [6]:
conn.commit()
conn.close()

In [18]:
# Once this is done we need to take care of external catalogues.

majumdar = pd.read_csv(MAJUMDAR_CATALOGUE, delimiter=r'\s+', header=0)

# Need to add cme_id as YYYYMMDDHHMMSS using the Date and Time columns

year = majumdar['Date'].astype(str).str[-4:]
month = majumdar['Date'].astype(str).str[3:5]
day = majumdar['Date'].astype(str).str[0:2]

majumdar['cme_id'] = (year + month + day + majumdar['Time'].astype(str)).replace({'-': '', ':': ''}, regex=True).astype(int)

# Now add Date + " " + Time
majumdar['cme_date'] = year + "-" + month + "-" + day + " " + majumdar['Time']

majumdar.head()

Unnamed: 0,Date,Time,CPA,Width,SR,Lat,Lon,Speed,cme_id,cme_date
0,02-01-1998,04:37:04,272,42,AR,20,42,481,19980102043704,1998-01-02 04:37:04
1,02-01-1998,14:30:21,258,45,AR,-24,78,518,19980102143021,1998-01-02 14:30:21
2,03-01-1998,09:42:59,290,85,PE,61,75,1020,19980103094259,1998-01-03 09:42:59
3,04-01-1998,02:57:17,352,59,PE,54,22,47,19980104025717,1998-01-04 02:57:17
4,04-01-1998,20:57:23,72,43,AR,35,-64,198,19980104205723,1998-01-04 20:57:23


In [22]:
# Print duplicate ids

dupli = majumdar.duplicated(subset=['cme_id'], keep=False)
duplicates = majumdar[dupli].sort_values(by='cme_id')
majumdar[majumdar["cme_id"] == 20131027223605]

Unnamed: 0,Date,Time,CPA,Width,SR,Lat,Lon,Speed,cme_id,cme_date
2630,27-10-2013,22:36:05,306,40,AR,6,63,340,20131027223605,2013-10-27 22:36:05


In [8]:
# See duplicates in cme_id

majumdar[majumdar.duplicated(subset=['cme_id'], keep=False)]

# Sadly I'm not going to change how I deal with cme_ids so I'll just have to drop these duplicates.
# Also these could be a huge headache if I choose the one I didn't choose for my catalogue as the regions will
# not agree. Let me see how many are after 2010.

print(f"There are {len(majumdar[majumdar.duplicated(subset=['cme_id'], keep=False) & (majumdar['Date'] > '2010-01-01')])} duplicates after 2010.")

# Well, with 14 duplicates I'm just going to drop all of them and be done with it. It's not ideal but changing
# how I deal with cme_ids would be a huge pain. So I drop ALL duplicates, keeping none

duplicates = majumdar.duplicated(subset=['cme_id'], keep=False)

majumdar_no_dups = majumdar[~duplicates]

# Check len of majumdar_no_dups

assert len(majumdar_no_dups) == len(majumdar) - sum(duplicates)

There are 14 duplicates after 2010.


In [12]:
# Need to put this into a table of the database

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

# Create table

conn.execute("DROP TABLE IF EXISTS MAJUMDAR_SRC")

conn.execute('''
    CREATE TABLE MAJUMDAR_SRC (
             cme_id INTEGER PRIMARY KEY,
             cme_date TEXT NOT NULL,
             cme_pa REAL,
             cme_width REAL,
             cme_speed REAL,
             sr_type TEXT,
             sr_lat REAL NOT NULL,
             sr_lon REAL NOT NULL
    )
''')

conn.commit()

# Insert data

conn.executemany('''
    INSERT INTO MAJUMDAR_SRC (cme_id, cme_date, cme_pa, cme_width, cme_speed, sr_type, sr_lat, sr_lon)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
''', majumdar_no_dups[["cme_id", "cme_date", "CPA", "Width", "Speed", "SR", "Lat", "Lon"]].to_numpy())

conn.commit()
conn.close()

In [13]:
# Now let's see if the majumdar catalogue used compatible times for the CMEs so I can match them

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

# We want to do an INNER JOIN between the two and select the PA of both to compare
cur.execute("""
SELECT
    FCHA.cme_id
FROM
    FINAL_CME_HARP_ASSOCIATIONS FCHA
INNER JOIN
    MAJUMDAR_SRC majumdar
ON
    FCHA.cme_id = majumdar.cme_id
""")

# Get the results
results = cur.fetchall()
print(results)

# They do seem to match

[(20100904151209,), (20100911182405,), (20100922042405,), (20101013171206,), (20101017093605,), (20101019071205,), (20101106161206,), (20101111140005,), (20101113104806,), (20101116012527,), (20101116073605,), (20101116080005,), (20101130191205,), (20110118040006,), (20110122133606,), (20110124020005,), (20110127132548,), (20110128050007,), (20110224074805,), (20110306233605,), (20110307144836,), (20110308041205,), (20110308190006,), (20110308201205,), (20110314042407,), (20110317123606,), (20110319164805,), (20110323023606,), (20110324124807,), (20110401041742,), (20110402113605,), (20110409154806,), (20110410002407,), (20110413220007,), (20110422182406,), (20110427023605,), (20110427074806,), (20110501142407,), (20110502171206,), (20110504174807,), (20110511024806,), (20110519043606,), (20110529103606,), (20110529212408,), (20110530113606,), (20110601183606,), (20110606064549,), (20110611132551,), (20110612013624,), (20110619164806,), (20110629004806,), (20111226171205,), (2012011600

In [14]:
# IGNORE ALL OF THIS AS IT'S NOT NEEDED

# A different approach, for each of the MAJUMDAR_SRC cmes, I find the closest cme_id in CMES
# and then update the MAJUMDAR_SRC cme_id with the CMES cme_id
# 
# madjumar_ids = cur.execute("SELECT cme_id FROM MAJUMDAR_SRC").fetchall()
# 
# for madjumar_id in tqdm(madjumar_ids):
#     cur.execute("""
#                 SELECT c.cme_id FROM CMES c
#                 ORDER BY ABS(c.cme_id - ?) ASC
#                 LIMIT 1
#                 """ , (madjumar_id[0],))
# 
#     cme_id = cur.fetchone()[0]
# 
#     cur.execute("""
#                 UPDATE MAJUMDAR_SRC
#                 SET matched_cme_id = ?
#                 WHERE cme_id = ?
#                 """ , (cme_id, madjumar_id[0],))
# 
# conn.commit()

In [15]:
# To check quickly how regions compare more or less

df = pd.read_sql_query("""
SELECT FCHA.cme_id, FCHA.harpnum, hbb.timestamp, hbb.LONDTMIN, hbb.LONDTMAX, hbb.LATDTMIN, hbb.LATDTMAX, M.sr_lon, M.sr_lat, FCHA.verification_score FROM FINAL_CME_HARP_ASSOCIATIONS FCHA
INNER JOIN MAJUMDAR_SRC M
ON M.cme_id = FCHA.cme_id
INNER JOIN CMES c
ON c.cme_id = FCHA.cme_id
INNER JOIN HARPS_BBOX hbb
ON FCHA.harpnum = hbb.harpnum and c.image_timestamp = hbb.timestamp
""", conn)

df

Unnamed: 0,cme_id,harpnum,timestamp,LONDTMIN,LONDTMAX,LATDTMIN,LATDTMAX,sr_lon,sr_lat,verification_score
0,20100904151209,145,2010-09-04 15:12:00,60.871201,74.176804,9.350700,15.233400,66.0,9.0,4.0
1,20100911182405,175,2010-09-11 18:24:00,-94.812897,-61.171299,-26.784100,-13.953300,-149.0,-23.0,4.0
2,20100922042405,187,2010-09-22 04:24:00,-99.733299,-69.272400,16.078400,28.337299,-23.0,16.0,4.0
3,20101013171206,218,2010-10-13 17:12:00,-92.679901,-75.796402,12.978100,22.528999,-94.0,36.0,4.0
4,20101017093605,211,2010-10-17 09:36:00,16.876200,51.811901,-28.323099,-13.619900,35.0,-24.0,4.0
...,...,...,...,...,...,...,...,...,...,...
247,20170815140005,7107,2017-08-15 14:00:00,-85.581703,-53.285702,6.217000,15.725200,-65.0,7.0,4.0
248,20170830164805,7110,2017-08-30 16:48:00,44.667198,65.438797,1.065500,13.083200,60.0,4.0,4.0
249,20170831041205,7117,2017-08-31 04:12:00,-83.571701,-50.633301,6.966900,21.252800,-66.0,13.0,4.0
250,20170925221205,7148,2017-09-25 22:12:00,-100.946098,-69.178001,7.490700,23.080900,-62.0,11.0,4.0


In [17]:
# This adds matching regions to the Madjumar catalogue

conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()

cur.execute("ALTER TABLE MAJUMDAR_SRC ADD COLUMN matching_harps INTEGER REFERENCES HARPS (harpnum) DEFAULT NULL;")

matches = pd.read_sql("""
WITH FilteredData AS (
    SELECT 
        M.cme_id, 
        M.sr_lon, 
        M.sr_lat, 
        hbb.harpnum, 
        CASE
            WHEN M.sr_lon BETWEEN (hbb.LONDTMIN - 5) AND (hbb.LONDTMAX + 5) THEN 1
            ELSE 0
        END AS lon_ok,
        CASE 
            WHEN M.sr_lat BETWEEN (hbb.LATDTMIN - 5) AND (hbb.LATDTMAX + 5) THEN 1
            ELSE 0
        END AS lat_ok
    FROM 
        MAJUMDAR_SRC M
    INNER JOIN 
        CMES C ON C.cme_id = M.cme_id
    INNER JOIN 
        HARPS_BBOX hbb ON hbb.timestamp = C.image_timestamp
    WHERE 
        M.sr_type = "AR" AND C.cme_date > (SELECT MIN(timestamp) FROM HARPS_BBOX)
)

SELECT 
    cme_id,
    sr_lon,
    sr_lat,
    COALESCE(MAX(CASE WHEN lon_ok = 1 AND lat_ok = 1 THEN harpnum END), NULL) AS match
FROM 
    FilteredData
GROUP BY 
    cme_id;
""", conn)

for index, row in matches.iterrows():
    cur.execute("UPDATE MAJUMDAR_SRC SET matching_harps = ? WHERE cme_id = ?", (row['match'], int(row['cme_id'])))

conn.commit()
conn.close()