In [1]:
import sys
sys.path.append('../')

from src.cmesrc.config import CMESRC_DB
import sqlite3
from tqdm import tqdm
from datetime import datetime
import bisect

In [2]:
conn = sqlite3.connect(CMESRC_DB)
conn.execute("PRAGMA foreign_keys = ON")
cur = conn.cursor()

In [3]:
# First I need to assign an image timestamp to each event

image_timestamps = cur.execute("SELECT timestamp FROM images").fetchall()

# Convert to datetime objects

image_timestamps = [t[0] for t in image_timestamps]

In [6]:
timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in image_timestamps])
image_timestamps = timestamps

# Function to find the closest timestamp using binary search
def closest_timestamp(target, sorted_timestamps):
    index = bisect.bisect_left(sorted_timestamps, target)
    if index == 0:
        return sorted_timestamps[0]
    if index == len(sorted_timestamps):
        return sorted_timestamps[-1]
    before = sorted_timestamps[index - 1]
    after = sorted_timestamps[index]
    if after - target < target - before:
       return after
    else:
       return before

# First flares
columns = cur.execute("PRAGMA table_info(flares)").fetchall()

if "image_timestamp" not in [c[1] for c in columns]:
    cur.execute("ALTER TABLE flares ADD COLUMN image_timestamp TEXT REFERENCES images(timestamp)")
else:
    print("Column already exists")

for row in tqdm(cur.execute("SELECT flare_id, flare_date FROM flares").fetchall()):
    flare_timestamp = datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S")
    image_timestamp = closest_timestamp(flare_timestamp, image_timestamps)
    cur.execute("UPDATE flares SET image_timestamp = ? WHERE flare_id = ?", (image_timestamp.strftime("%Y-%m-%d %H:%M:%S"), row[0]))

# Now dimmings

columns = cur.execute("PRAGMA table_info(dimmings)").fetchall()

if "image_timestamp" not in [c[1] for c in columns]:
    cur.execute("ALTER TABLE dimmings ADD COLUMN image_timestamp TEXT REFERENCES images(timestamp)")
else:
    print("Column already exists")

for row in tqdm(cur.execute("SELECT dimming_id, dimming_start_date FROM dimmings").fetchall()):
    dimming_timestamp = datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S")
    image_timestamp = closest_timestamp(dimming_timestamp, image_timestamps)
    cur.execute("UPDATE dimmings SET image_timestamp = ? WHERE dimming_id = ?", (image_timestamp.strftime("%Y-%m-%d %H:%M:%S"), row[0]))

# Now CMEs

columns = cur.execute("PRAGMA table_info(cmes)").fetchall()

if "image_timestamp" not in [c[1] for c in columns]:
    cur.execute("ALTER TABLE cmes ADD COLUMN image_timestamp TEXT REFERENCES images(timestamp)")
else:
    print("Column already exists")

for row in tqdm(cur.execute("SELECT cme_id, cme_date FROM cmes").fetchall()):
    cme_timestamp = datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S")
    image_timestamp = closest_timestamp(cme_timestamp, image_timestamps)
    cur.execute("UPDATE cmes SET image_timestamp = ? WHERE cme_id = ?", (image_timestamp.strftime("%Y-%m-%d %H:%M:%S"), row[0]))

Column already exists


100%|██████████| 12580/12580 [00:00<00:00, 97853.46it/s]


Column already exists


100%|██████████| 2591/2591 [00:00<00:00, 91735.46it/s]

Column already exists



100%|██████████| 31944/31944 [00:00<00:00, 101868.42it/s]


In [8]:
conn.commit()

In [5]:
harps = cur.execute("SELECT DISTINCT harpnum from harps_bbox").fetchall()
len(harps)

4093

In [6]:
columns = cur.execute("PRAGMA table_info(harps_bbox)").fetchall()

DROP_COLUMNS = False

if DROP_COLUMNS:
    cur.executescript("""
    ALTER TABLE harps_bbox DROP COLUMN prev_flare_id;
    ALTER TABLE harps_bbox DROP COLUMN next_flare_id;
    ALTER TABLE harps_bbox DROP COLUMN prev_flare_hours;
    ALTER TABLE harps_bbox DROP COLUMN next_flare_hours;
    ALTER TABLE harps_bbox DROP COLUMN prev_dimming_id;
    ALTER TABLE harps_bbox DROP COLUMN next_dimming_id;
    ALTER TABLE harps_bbox DROP COLUMN prev_dimming_hours;
    ALTER TABLE harps_bbox DROP COLUMN next_dimming_hours;
    ALTER TABLE harps_bbox DROP COLUMN prev_cme_id;
    ALTER TABLE harps_bbox DROP COLUMN next_cme_id;
    ALTER TABLE harps_bbox DROP COLUMN prev_cme_hours;
    ALTER TABLE harps_bbox DROP COLUMN next_cme_hours;
    ALTER TABLE harps_bbox DROP COLUMN prev_present_at_cme_id;
    ALTER TABLE harps_bbox DROP COLUMN next_present_at_cme_id;
    ALTER TABLE harps_bbox DROP COLUMN prev_present_at_cme_hours;
    ALTER TABLE harps_bbox DROP COLUMN next_present_at_cme_hours;
    """)

if "prev_flare_id" not in [c[1] for c in columns]:
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_flare_id INTEGER REFERENCES flares(flare_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_flare_id INTERGER REFERENCES flares(flare_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_flare_hours REAL NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_flare_hours REAL NOT NULL DEFAULT -1")

    # Same for dimmings
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_dimming_id INTEGER REFERENCES dimmings(dimming_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_dimming_id INTEGER REFERENCES dimmings(dimming_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_dimming_hours REAL NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_dimming_hours REAL NOT NULL DEFAULT -1")

    # Same for CMEs (this is a positive CME association, though with the decision tree I made with Lucie this isn't really needed anymore)

    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_cme_id TEXT INTEGER REFERENCES cmes(cme_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_cme_id TEXT INTEGER  REFERENCES cmes(cme_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_cme_hours REAL NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_cme_hours REAL NOT NULL DEFAULT -1")

    # Same for being present at cme

    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_present_at_cme_id INTEGER REFERENCES cmes(cme_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_present_at_cme_id INTEGER REFERENCES cmes(cme_id)")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN prev_present_at_cme_hours REAL NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN next_present_at_cme_hours REAL NOT NULL DEFAULT -1")

conn.commit()

In [7]:
cur.execute("DROP TABLE IF EXISTS accepted_cme_associations_temp")
cur.execute("DROP TABLE IF EXISTS accepted_cme_associations")

# You just basically need the CME_ID as it can be associated with a single HARPNUM
cur.execute("""
CREATE TABLE accepted_cme_associations_temp AS
SELECT 
    cmes_harpnums.cme_id 
FROM 
    cmes_harpnums
INNER JOIN 
    flares 
ON 
    cmes_harpnums.flare_id = flares.flare_id
WHERE 
    flares.flare_class_score >= 25
AND
    cmes_harpnums.dimming_id IS NOT NULL
AND
    cmes_harpnums.flare_id IS NOT NULL
AND
    cmes_harpnums.harps_spat_consist = 1
""")

cur.execute("""
CREATE TABLE accepted_cme_associations (
    cme_id INTEGER NOT NULL PRIMARY KEY REFERENCES cmes_harpnums(cme_id)
)
""")

cur.execute("""
INSERT INTO accepted_cme_associations (cme_id)
SELECT cme_id FROM accepted_cme_associations_temp
""")

cur.execute("""
DROP TABLE accepted_cme_associations_temp
""")

conn.commit()


In [8]:
cur.execute("SELECT COUNT(*) FROM accepted_cme_associations").fetchall()

[(155,)]

In [9]:
cur.execute("CREATE INDEX IF NOT EXISTS harps_bbox_harpnum_idx ON harps_bbox(harpnum)")
cur.execute("CREATE INDEX IF NOT EXISTS flares_harpnum_idx ON flares(harpnum)")
cur.execute("CREATE INDEX IF NOT EXISTS flares_verification_idx ON flares(flare_verification)")
cur.execute("CREATE INDEX IF NOT EXISTS dimmings_harpnum_idx ON dimmings(harpnum)")
cur.execute("CREATE INDEX IF NOT EXISTS spat_consist_harps_harpnum_idx ON spat_consist_harpnums(harpnum)")
cur.execute("CREATE INDEX IF NOT EXISTS harps_bbox_timestamp_idx ON harps_bbox(timestamp)")
cur.execute("CREATE INDEX IF NOT EXISTS harps_bbox_harpnum_idx ON harps_bbox(harpnum)")

conn.commit()

In [10]:
for harp in tqdm(harps):
    harp = harp[0]
    timestamps = cur.execute("SELECT timestamp FROM harps_bbox WHERE harpnum = ?", (harp,)).fetchall()

    flare_data  = cur.execute("SELECT image_timestamp, flare_id FROM flares WHERE harpnum = ? AND flare_verification != 'Non-verfied'", (harp,)).fetchall()

    if len(flare_data) == 0:
        flare_timestamps = []
        flare_ids = []
    else:
        flare_timestamps, flare_ids = zip(*flare_data)

    dimming_data =cur.execute("SELECT image_timestamp, dimming_id FROM dimmings WHERE harpnum = ?", (harp,)).fetchall()

    if len(dimming_data) == 0:
        dimming_timestamps = []
        dimming_ids = []
    else:
        dimming_timestamps, dimming_ids = zip(*dimming_data)


    cme_data= cur.execute("""
                    SELECT cmes.image_timestamp, cmes.cme_id FROM accepted_cme_associations as aca
                    INNER JOIN cmes
                    ON aca.cme_id = cmes.cme_id
                    INNER JOIN cmes_harpnums as ch
                    ON ch.cme_id = cmes.cme_id AND ch.harpnum = ?
                    """, (harp,)).fetchall()

    if len(cme_data) == 0:
        cme_timestamps = []
        cme_ids = []
    else:
        cme_timestamps, cme_ids = zip(*cme_data)

    present_at_cme_data = cur.execute("""
        SELECT c.image_timestamp, c.cme_id from spat_consist_harpnums as sch
        INNER JOIN cmes as c
        ON sch.cme_id = c.cme_id
        WHERE sch.harpnum = ? 
    """, (harp,)).fetchall()

    if len(present_at_cme_data) == 0:
        present_at_cme_timestamps = []
        present_at_cme_ids = []
    else:
        present_at_cme_timestamps, present_at_cme_ids = zip(*present_at_cme_data)



    # Need to convert to datetime objects
    timestamps = sorted([datetime.strptime(t[0], "%Y-%m-%d %H:%M:%S") for t in timestamps])
    flare_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in flare_timestamps])
    dimming_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in dimming_timestamps])
    cme_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in cme_timestamps])
    present_at_cme_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in present_at_cme_timestamps])

    timestamps_list = [flare_timestamps, dimming_timestamps, cme_timestamps, present_at_cme_timestamps]
    ids_list = [flare_ids, dimming_ids, cme_ids, present_at_cme_ids]
    events_lists = ["flare", "dimming", "cme", "present_at_cme"]

    for timestamp in timestamps:
        for event, event_timestamps, event_ids in zip(events_lists, timestamps_list, ids_list):
            if not event_timestamps:
                # cur.execute(f"UPDATE harps_bbox SET prev_{event}_id = NULL")
                # cur.execute(f"UPDATE harps_bbox SET next_{event}_id = NULL")
                # cur.execute(f"UPDATE harps_bbox SET prev_{event}_hours = -1")
                # cur.execute(f"UPDATE harps_bbox SET next_{event}_hours = -1")
                continue
            
            try:
                index = bisect.bisect_right(event_timestamps, timestamp)
            except TypeError:
                print(harp)
                print(event)
                print(event_timestamps)
                print(event_ids)
                print(timestamp)

            if index == 0:
                closest_past_timestamp_index = None
                closest_future_timestamp_index = 0
            elif index == len(event_timestamps):
                closest_past_timestamp_index = len(event_timestamps) - 1
                closest_future_timestamp_index = None
            else:
                closest_past_timestamp_index = index - 1
                closest_future_timestamp_index = index

            if closest_past_timestamp_index is None:
                past_event_timestamp = -1
                past_event_id = None
                past_event_time_diff = -1
            else:
                past_event_timestamp = event_timestamps[closest_past_timestamp_index]
                past_event_id = event_ids[closest_past_timestamp_index]
                past_event_time_diff = (timestamp - past_event_timestamp).total_seconds() / 3600
                future_event_time_diff = -1

            if closest_future_timestamp_index is None:
                future_event_timestamp = -1
                future_event_id = None
                future_event_time_diff = -1
            else:
                future_event_timestamp = event_timestamps[closest_future_timestamp_index]
                future_event_id = event_ids[closest_future_timestamp_index]
                future_event_time_diff = (future_event_timestamp - timestamp).total_seconds() / 3600
            
            if past_event_time_diff != -1 and past_event_time_diff < 0:
                raise ValueError("Past event time difference is negative")

            cur.execute(f"UPDATE harps_bbox SET prev_{event}_id = ? WHERE harpnum = ? AND timestamp = ?", (past_event_id, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
            cur.execute(f"UPDATE harps_bbox SET next_{event}_id = ? WHERE harpnum = ? AND timestamp = ?", (future_event_id, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
            cur.execute(f"UPDATE harps_bbox SET prev_{event}_hours = ? WHERE harpnum = ? AND timestamp = ?", (past_event_time_diff, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
            cur.execute(f"UPDATE harps_bbox SET next_{event}_hours = ? WHERE harpnum = ? AND timestamp = ?", (future_event_time_diff, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))

conn.commit()

100%|██████████| 4093/4093 [01:19<00:00, 51.19it/s] 


In [12]:
columns = cur.execute("PRAGMA table_info(harps_bbox)").fetchall()

if "closest_flare_next_present_at_cme_hours" not in [c[1] for c in columns]:
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN closest_flare_next_present_at_cme_hours INTEGER NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN closest_flare_next_present_at_cme_id INTEGER REFERENCES flares(flare_id)")

    # Same for dimmings
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN closest_dimming_next_present_at_cme_hours INTEGER NOT NULL DEFAULT -1")
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN closest_dimming_next_present_at_cme_id INTEGER REFERENCES dimmings(dimming_id)")

conn.commit()

In [13]:
association_threshold = 3

for harp in tqdm(harps):
    harp = harp[0]
    timestamps = cur.execute("SELECT timestamp FROM harps_bbox WHERE harpnum = ?", (harp,)).fetchall()

    flare_data  = cur.execute("SELECT image_timestamp, flare_id, flare_class_score FROM flares WHERE harpnum = ? AND flare_verification != 'Non-verfied'", (harp,)).fetchall()

    if len(flare_data) == 0:
        flare_timestamps = []
        flare_ids = []
        flare_class_scores = []
    else:
        flare_timestamps, flare_ids, flare_class_scores = zip(*flare_data)

    dimming_data =cur.execute("SELECT image_timestamp, dimming_id FROM dimmings WHERE harpnum = ?", (harp,)).fetchall()

    if len(dimming_data) == 0:
        dimming_timestamps = []
        dimming_ids = []
    else:
        dimming_timestamps, dimming_ids = zip(*dimming_data)

    present_at_cme_data = cur.execute("""
        SELECT c.image_timestamp, c.cme_id from spat_consist_harpnums as sch
        INNER JOIN cmes as c
        ON sch.cme_id = c.cme_id
        WHERE sch.harpnum = ? 
    """, (harp,)).fetchall()

    if len(present_at_cme_data) == 0:
        present_at_cme_timestamps = []
        present_at_cme_ids = []
    else:
        present_at_cme_timestamps, present_at_cme_ids = zip(*present_at_cme_data)



    # Need to convert to datetime objects
    timestamps = sorted([datetime.strptime(t[0], "%Y-%m-%d %H:%M:%S") for t in timestamps])
    flare_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in flare_timestamps])
    dimming_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in dimming_timestamps])
    present_at_cme_timestamps = sorted([datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in present_at_cme_timestamps])

    timestamps_list = [flare_timestamps, dimming_timestamps, cme_timestamps, present_at_cme_timestamps]
    ids_list = [flare_ids, dimming_ids, cme_ids, present_at_cme_ids]
    events_lists = ["flare", "dimming"]

    for timestamp in timestamps:

        index = bisect.bisect_right(present_at_cme_timestamps, timestamp)

        if index == 0:
            closest_future_timestamp_index = 0
        elif index == len(present_at_cme_timestamps):
            closest_future_timestamp_index = None
        else:
            closest_future_timestamp_index = index
        
        if closest_future_timestamp_index is None or len(present_at_cme_timestamps) == 0:
            closest_flare_id = None
            closest_flare_hours_diff = -1
            closest_dimming_id = None
            closest_dimming_hours_diff = -1
        
        else:
            closest_present_at_cme_timestamp = present_at_cme_timestamps[closest_future_timestamp_index]

            # First flares

            if len(flare_timestamps) == 0:
                closest_flare_id = None
                closest_flare_hours_diff = -1
            else:

                matching_flares_hour_diff = []
                matching_flares_id = []
                matching_flares_classes = []

                matching_flare_diff = 0.01
                flare_index = bisect.bisect_right(flare_timestamps, closest_present_at_cme_timestamp)

                flare_index -= 1

                while (0 < matching_flare_diff < association_threshold) and flare_index > 0:
                    flare_timestamp = flare_timestamps[flare_index]

                    matching_flare_diff = (closest_present_at_cme_timestamp - flare_timestamp).total_seconds() / 3600

                    if -1e3 < matching_flare_diff < association_threshold:
                        matching_flares_hour_diff.append(matching_flare_diff)
                        matching_flares_id.append(flare_ids[flare_index])
                        matching_flares_classes.append(flare_class_scores[flare_index])

                        flare_index -= 1
                
                if len(matching_flares_hour_diff) == 0:
                    closest_flare_id = None
                    closest_flare_hours_diff = -1
                
                else:
                    # Get index of highest class flare
                    highest_class_index = matching_flares_classes.index(max(matching_flares_classes))
                    closest_flare_id = matching_flares_id[highest_class_index]
                    closest_flare_hours_diff = matching_flares_hour_diff[highest_class_index]

            # Now dimmings
            
            if len(dimming_timestamps) == 0:
                closest_dimming_id = None
                closest_dimming_hours_diff = -1

            else:

                matching_dimmings_hour_diff = []
                matching_dimmings_id = []

                matching_dimming_diff = 0.01
                dimming_index = bisect.bisect_right(dimming_timestamps, closest_present_at_cme_timestamp)

                dimming_index -= 1

                while (0 < matching_dimming_diff < association_threshold) and (dimming_index > 0):
                    dimming_timestamp = dimming_timestamps[dimming_index]

                    matching_dimming_diff = (closest_present_at_cme_timestamp - dimming_timestamp).total_seconds() / 3600

                    if -1e-3 < matching_dimming_diff < association_threshold:
                        matching_dimmings_hour_diff.append(matching_dimming_diff)
                        matching_dimmings_id.append(dimming_ids[dimming_index])

                        dimming_index -= 1
                    
                if len(matching_dimmings_hour_diff) == 0:
                    closest_dimming_id = None
                    closest_dimming_hours_diff = -1
                else:
                    # Get index of smallest hour diff
                    closest_dimming_hours_diff = min(matching_dimmings_hour_diff)
                    closest_dimming_id = matching_dimmings_id[matching_dimmings_hour_diff.index(closest_dimming_hours_diff)]

        cur.execute("UPDATE harps_bbox SET closest_flare_next_present_at_cme_id = ? WHERE harpnum = ? AND timestamp = ?", (closest_flare_id, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
        cur.execute("UPDATE harps_bbox SET closest_flare_next_present_at_cme_hours = ? WHERE harpnum = ? AND timestamp = ?", (closest_flare_hours_diff, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
        cur.execute("UPDATE harps_bbox SET closest_dimming_next_present_at_cme_id = ? WHERE harpnum = ? AND timestamp = ?", (closest_dimming_id, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
        cur.execute("UPDATE harps_bbox SET closest_dimming_next_present_at_cme_hours = ? WHERE harpnum = ? AND timestamp = ?", (closest_dimming_hours_diff, harp, timestamp.strftime("%Y-%m-%d %H:%M:%S")))

conn.commit()

100%|██████████| 4093/4093 [00:45<00:00, 90.74it/s] 


In [None]:
association_threshold = 3

results = {}

for harp in tqdm(harps):
    harp = harp[0]

    # Extract Flare, Dimming, and CME data
    # ... [no change here]

    for cme_timestamp, cme_id in zip(present_at_cme_timestamps, present_at_cme_ids):

        # For Flares
        closest_flare_id = None
        closest_flare_hours_diff = -1
        if flare_timestamps:
            flare_index = bisect.bisect_right(flare_timestamps, cme_timestamp)
            matching_flares = []

            while flare_index > 0:
                flare_index -= 1
                flare_timestamp = flare_timestamps[flare_index]
                hour_diff = (cme_timestamp - flare_timestamp).total_seconds() / 3600
                if 0 < hour_diff < association_threshold:
                    matching_flares.append((hour_diff, flare_ids[flare_index], flare_class_scores[flare_index]))
                else:
                    break

            if matching_flares:
                closest_flare = sorted(matching_flares, key=lambda x: (-x[2], x[0]))[0]
                closest_flare_hours_diff, closest_flare_id, _ = closest_flare

        # For Dimmings
        closest_dimming_id = None
        closest_dimming_hours_diff = -1
        if dimming_timestamps:
            dimming_index = bisect.bisect_right(dimming_timestamps, cme_timestamp)
            matching_dimmings = []

            while dimming_index > 0:
                dimming_index -= 1
                dimming_timestamp = dimming_timestamps[dimming_index]
                hour_diff = (cme_timestamp - dimming_timestamp).total_seconds() / 3600
                if 0 < hour_diff < association_threshold:
                    matching_dimmings.append((hour_diff, dimming_ids[dimming_index]))
                else:
                    break

            if matching_dimmings:
                closest_dimming = min(matching_dimmings, key=lambda x: x[0])
                closest_dimming_hours_diff, closest_dimming_id = closest_dimming

        results[(harp, cme_id)] = {
            'closest_flare_id': closest_flare_id,
            'closest_flare_hours_diff': closest_flare_hours_diff,
            'closest_dimming_id': closest_dimming_id,
            'closest_dimming_hours_diff': closest_dimming_hours_diff
        }

In [14]:
# THIS IS THE IMPORTANT BIT THAT HAS ASSIGNED THE LABELS
# THIS IS THE ONLY THING THAT SHOULD BE CHANGED IN THE FUTURE
def assign_label(row, forecast_horizon=24.01, cme_relation_threshold=association_threshold, flare_positive_threshold=25, flare_negative_threshold=20):

    yes_cme = row["next_present_at_cme_hours"]
    no_cme = -1
    unknown = -2

    next_cme_hours = row["next_present_at_cme_hours"]
    closest_dimming_cme_hours = row["closest_dimming_next_present_at_cme_hours"]
    closest_flare_cme_hours = row["closest_flare_next_present_at_cme_hours"]
    closest_flare_cme_class = row["closest_flare_next_present_at_cme_class"]

    if next_cme_hours == -1:
        present_at_cme = False
    else:
        present_at_cme = row["next_present_at_cme_hours"] <= forecast_horizon

    if closest_dimming_cme_hours == -1:
        dimming_associated_with_cme = False
    else:
        dimming_associated_with_cme = 0 < row["closest_dimming_next_present_at_cme_hours"] < cme_relation_threshold

    if closest_flare_cme_hours == -1:
        flare_associated_with_cme = False
        flare_class_above_positive_threshold = False
        flare_class_below_negative_threshold = False
    else:
        flare_associated_with_cme = 0 < row["closest_flare_next_present_at_cme_hours"] < cme_relation_threshold
        flare_class_above_positive_threshold = row["closest_flare_next_present_at_cme_class"] >= flare_positive_threshold
        flare_class_below_negative_threshold = row["closest_flare_next_present_at_cme_class"] <= flare_negative_threshold

    if present_at_cme:
        # If so, was there a dimming at the time of the CME?
        if dimming_associated_with_cme:
            # If so, was there a flare at the time of the CME?
            if flare_associated_with_cme:
                # If there was a flare, is it class big enough?
                if flare_class_above_positive_threshold:
                    return yes_cme

                # If it isn't, we're unsure of the label
                else:
                    return unknown
            # If there wasn't a flare, we're unsure of the label
            else:
                return unknown

        # If there wasn't a dimming
        else:
            # Was there a flare?
            if flare_associated_with_cme:
                # If there was a flare, is it class small enough to ignore it?
                if flare_class_below_negative_threshold:
                    return no_cme
                else:
                    return unknown
            else:
                return no_cme

    # If the region wasn't present at a CME
    else:
        return no_cme


In [15]:
# Check if column label exists

cur.execute("PRAGMA table_info(harps_bbox)")
columns = cur.fetchall()

label_exists = False

for column in columns:
    if column[1] == "label":
        label_exists = True
        break

if not label_exists:
    cur.execute("ALTER TABLE harps_bbox ADD COLUMN label INTEGER")

total_rows = 0
cur.execute("""SELECT COUNT(*) FROM harps_bbox AS hb
            LEFT JOIN flares AS f
            ON hb.next_flare_id = f.flare_id
            """)

total_rows = cur.fetchone()[0]

cur.execute("""SELECT hb.harpnum, hb.timestamp, hb.closest_flare_next_present_at_cme_hours, f.flare_class_score, hb.closest_dimming_next_present_at_cme_hours, hb.next_present_at_cme_hours
            FROM harps_bbox AS hb
            LEFT JOIN flares AS f
            ON hb.closest_flare_next_present_at_cme_id = f.flare_id
            """)

rows = cur.fetchall()

for row in tqdm(rows):
    row_dict = dict()

    row_dict["harpnum"] = row[0]
    row_dict["timestamp"] = row[1]
    row_dict["closest_flare_next_present_at_cme_hours"] = row[2]
    row_dict["closest_flare_next_present_at_cme_class"] = row[3]
    row_dict["closest_dimming_next_present_at_cme_hours"] = row[4]
    row_dict["next_present_at_cme_hours"] = row[5]

    label = assign_label(row_dict)

    if label == "STOP":
        break

    cur.execute("UPDATE harps_bbox SET label = ? WHERE harpnum = ? AND timestamp = ?", (label, row_dict["harpnum"], row_dict["timestamp"]))

conn.commit()

100%|██████████| 2628572/2628572 [00:07<00:00, 335428.69it/s]


In [16]:
# Find what unique values of the harpnum column with label > 0 are both in the harps_bbox table and the cutouts_for_download table

# Remember, the harpnum has to be both in the harps_bbox table and the cutouts_for_download table
cur.execute("""SELECT COUNT(DISTINCT hb.harpnum)
            FROM harps_bbox AS hb
            WHERE hb.label > 0
            """)

print(f"Number of unique harps with label > 0 in harps_bbox table: {cur.fetchone()[0]}")


cur.execute("""SELECT COUNT(DISTINCT hb.harpnum)
            FROM harps_bbox AS hb
            INNER JOIN cutouts_for_download AS cfd
            ON hb.harpnum = cfd.harpnum
            WHERE hb.label > 0
            """)

print(f"Number of unique harps with label > 0 in harps_bbox table and cutouts_for_download table: {cur.fetchone()[0]}")

Number of unique harps with label > 0 in harps_bbox table: 57
Number of unique harps with label > 0 in harps_bbox table and cutouts_for_download table: 56


In [17]:
cur.execute("DROP TABLE IF EXISTS images_dataset")

cur.execute("""
    CREATE TABLE IF NOT EXISTS images_dataset (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        harpnum INTEGER NOT NULL REFERENCES harps(harpnum),
        timestamp INTEGER NOT NULL REFERENCES images(timestamp),
        zarr_index INTEGER NOT NULL,
        zarr_path TEXT NOT NULL,
        UNIQUE (harpnum, timestamp)
        )
""")

conn.commit()

In [18]:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_images_dataset_harpnum ON cutouts_for_download(harpnum)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_images_dataset_timestamp ON cutouts_for_download(timestamp)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_images_dataset_harpnum_timestamp ON cutouts_for_download(harpnum, timestamp)""")

conn.commit()

def get_rows_grouped_by_year(harpnum):
    con = sqlite3.connect(CMESRC_DB)
    con.execute("PRAGMA foreign_keys = ON")
    cur = con.cursor()

    cur.execute(
        f"""
        SELECT DISTINCT strftime('%Y', timestamp) FROM cutouts_for_download WHERE harpnum= ?
    """, (harpnum,))

    years = [row[0] for row in cur.fetchall()]

    rows = dict()

    for year in years:
        cur.execute("DROP TABLE IF EXISTS cutouts_for_download_temp")

        cur.execute(
            f"""
            CREATE TEMPORARY TABLE cutouts_for_download_temp AS
            SELECT hpb.timestamp, hpb.x_cen, hpb.y_cen, hpbs.width, hpbs.height, i.idx
            FROM cutouts_for_download cfd 
            INNER JOIN harps_pixel_bbox hpb ON cfd.harpnum = hpb.harpnum AND cfd.timestamp = hpb.timestamp
            INNER JOIN images i ON cfd.timestamp = i.timestamp
            INNER JOIN harps_pixel_bbox_sizes hpbs ON hpb.harpnum = hpbs.harpnum
            WHERE cfd.harpnum= ? AND strftime('%Y', cfd.timestamp) = ?
        """, (harpnum, year))

        cur.execute(
            """
            WITH cutouts_hours AS (
            SELECT *,
                strftime('%Y-%m-%d %H:00:00', timestamp) AS hour,
                ABS(julianday(timestamp) - julianday(strftime('%Y-%m-%d %H:00:00', timestamp))) * 24 * 60 * 60 AS diff
            FROM cutouts_for_download_temp
            )

            SELECT timestamp, x_cen, y_cen, width, height, idx FROM (
            SELECT *,
                RANK() OVER (PARTITION BY hour ORDER BY diff ASC) AS rank
                FROM cutouts_hours
            )
            WHERE rank = 1
            """
        )

        rows[year] = cur.fetchall()

    con.close()

    total_rows = sum([len(rows[year]) for year in years])

    if total_rows == 0:
        raise ValueError(f"No rows found for HARP {harpnum}")

    width, height = rows[years[0]][0][3], rows[years[0]][0][4]

    return rows, total_rows, (width, height)

In [19]:
import os
import zarr

IMAGES_FOLDER = "/home/julio/cmesrc/data/processed/cutouts/cutouts/"

problematic_harps = []
n_years = []

# List all the files in the folder
files = os.listdir(IMAGES_FOLDER)

final_harpnums = []
final_timestamps = []
final_zarr_indices = []
final_zarr_paths = []

for file in tqdm(files):

    path = os.path.join(IMAGES_FOLDER, file)

    # This is a zarr DirectoryStore so we open it
    store = zarr.DirectoryStore(path)

    # We open the zarr
    arr = zarr.open(store, mode="a") # Careful with this mode, it will overwrite the zarr

    # And we need to get the timestamps. I did something stupid in the script and now they're
    # all wrong to I need to rewrite them

    timestamps = arr.attrs["timestamps"]

    # Now, hopefully this should have the same size as the zarr
    if len(timestamps) != arr.shape[0]:
        problematic_harps.append(file)
        continue

    # Update the timestamps attribute

    arr.attrs["timestamps"] = timestamps

    store.close()

    # Now we add the data to the database
    for idx, timestamp in enumerate(timestamps):
        final_harpnums.append(file)
        final_timestamps.append(timestamp)
        final_zarr_indices.append(idx)
        final_zarr_paths.append(path)
        
# Add all the data to the database
cur.executemany("""
    INSERT INTO images_dataset (harpnum, timestamp, zarr_index, zarr_path)
    VALUES (?, ?, ?, ?)
""", zip(final_harpnums, final_timestamps, final_zarr_indices, final_zarr_paths))

conn.commit()

100%|██████████| 2017/2017 [00:01<00:00, 1776.85it/s]


In [22]:
IMAGES_FOLDER = "/home/julio/cmesrc/data/processed/cutouts/cutouts/"

# List all the files in the folder
files = os.listdir(IMAGES_FOLDER)

for file in files:

    path = os.path.join(IMAGES_FOLDER, file)

    # This is a zarr DirectoryStore so we open it
    store = zarr.DirectoryStore(path)

    # We open the zarr
    arr = zarr.open(store, mode="a") # Careful with this mode, it will overwrite the zarr

    # Read timestamps and make sure there are as many as the zarr and no duplicates
    timestamps = arr.attrs["timestamps"]

    if len(timestamps) != arr.shape[0]:
        print(file)
        raise ValueError(f"Timestamps length ({len(timestamps)}) does not match zarr shape ({arr.shape[0]})")
    if len(timestamps) != len(set(timestamps)):
        raise ValueError(f"Timestamps contains duplicates")