In [1]:
import sys
sys.path.append('../')

from src.cmesrc.config import SANJIV_CATALOGUE, HARPNUM_TO_NOAA, CMESRCV2_DB
import pandas as pd
import sqlite3

In [2]:
sanjiv = pd.read_csv(SANJIV_CATALOGUE)

In [3]:
possible_comments = set(sanjiv["comments"].values)
possible_comments

{' Clear',
 'AR link not clear',
 'Clear',
 'Double check',
 'Double check needed',
 'Double check, Very weak velocity',
 'Very Clear Event',
 'Very clear event',
 'chk AR no',
 'clear',
 'clear but large scale event',
 'clear, take both ARs',
 'double check',
 'flare is not realy visible',
 'location inconsistent',
 nan,
 'no data on solar monitor',
 'not so clear',
 'very Clear Event',
 'very clear event'}

In [4]:
accepted_comments = [
    " Clear",
    "Clear",
    "Very Clear Event",
    "Very clear event",
    "clear",
]

In [5]:
# Get only those with comment being one of the accepted comments
# Also drop nan values of AR

clear_sanjiv = sanjiv[sanjiv["comments"].isin(accepted_comments)]
clear_sanjiv

Unnamed: 0,cme_date,cme_width,cme_speed,flare_time,Class,AR,comments
21,24/03/2011 17:48,36,193,24/03/2011 17:01,C9.1,11176,clear
22,27/04/2011 02:36,257,924,27/04/2011 02:26,C2.0,11201,Very Clear Event
23,12/05/2011 13:25,95,274,12/05/2011 12:11,C2.0,unnamed,Very Clear Event
24,16/05/2011 00:12,50,265,15/05/2011 23:25,C4.8,11208,clear
26,01/06/2011 18:36,189,361,01/06/2011 16:51,C4.1,11226,Very Clear Event
...,...,...,...,...,...,...,...
269,23/03/2014 03:36,360,820,23/03/2014 02:27,C3.1,12014,clear
270,28/03/2014 23:48,138,514,28/03/2014 23:44,M2.6,12017,clear
271,29/03/2014 02:48,117,414,29/03/2014 01:54,C2.4,12024,Very clear event
272,29/03/2014 18:12,360,528,29/03/2014 17:35,X1.0,12017,Very clear event


In [6]:
clear_sanjiv.sort_values(by="cme_width")

Unnamed: 0,cme_date,cme_width,cme_speed,flare_time,Class,AR,comments
195,26/05/2013 11:12,31,340,26/05/2013 09:43,C3.8,11756,clear
64,26/11/2011 00:00,31,642,26/11/2011 16:14,C1.1,11354,clear
51,26/09/2011 15:12,31,240,26/09/2011 14:37,M2.6,11302,clear
158,17/11/2012 18:48,33,361,17/11/2012 18:02,C2.8,11613,clear
260,03/02/2014 20:24,33,273,03/02/2014 19:28,C6.9,11967,clear
...,...,...,...,...,...,...,...
272,29/03/2014 18:12,360,528,29/03/2014 17:35,X1.0,12017,Very clear event
48,24/09/2011 19:36,360,972,2011/09/24 19:09,M3.0,11302,Very Clear Event
95,05/04/2012 21:25,360,828,05/04/2012 20:49,C1.5,11450,Very Clear Event
81,07/03/2012 00:24,360,2684,07/03/2012 00:02,X5.4,11429,Very Clear Event


In [7]:
invalid_ars = []
def process_ar(ar):
    global invalid_ars
    try:
        processed_ar = int(ar)
    except ValueError:
        invalid_ars.append(ar)
        processed_ar = None

    return processed_ar

clear_sanjiv["processed_ar"] = clear_sanjiv["AR"].apply(process_ar)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_sanjiv["processed_ar"] = clear_sanjiv["AR"].apply(process_ar)


In [8]:
# Now before adding to db we need to process the NOAA to HARPNUM mapping

noaatoharp = pd.read_csv(HARPNUM_TO_NOAA, sep=" ", header=0)
noaatoharp

Unnamed: 0,HARPNUM,NOAA_ARS
0,1,11067
1,2,11064
2,6,11065
3,8,11069
4,10,11066
...,...,...
1847,9822,13376
1848,9825,13375
1849,9832,13374
1850,9835,13377


In [9]:
def parse_noaa_lists(noaa_list):
    return [int(noaa) for noaa in noaa_list.split(",")]

noaatoharp["noaa_list"] = noaatoharp["NOAA_ARS"].apply(parse_noaa_lists)
noaatoharp

Unnamed: 0,HARPNUM,NOAA_ARS,noaa_list
0,1,11067,[11067]
1,2,11064,[11064]
2,6,11065,[11065]
3,8,11069,[11069]
4,10,11066,[11066]
...,...,...,...
1847,9822,13376,[13376]
1848,9825,13375,[13375]
1849,9832,13374,[13374]
1850,9835,13377,[13377]


In [98]:
conn = sqlite3.connect(CMESRCV2_DB)
cur = conn.cursor()
conn.execute("PRAGMA foreign_keys = ON")

<sqlite3.Cursor at 0x7fa1e2f7c640>

In [99]:
cur.execute("""DROP TABLE IF EXISTS NOAA_HARPNUM_MAPPING""")

cur.execute("""
    CREATE TABLE NOAA_HARPNUM_MAPPING (
        noaa INTEGER,
        harpnum INTEGER REFERENCES HARPS (harpnum),
        PRIMARY KEY (noaa, harpnum)
        )
""")

<sqlite3.Cursor at 0x7fa1e2f7c5c0>

In [100]:
for _, row in noaatoharp.iterrows():
    for noaa in row["noaa_list"]:
        if int(row["HARPNUM"]) > 7331.5:
            continue
        try:
            cur.execute("""
                INSERT INTO NOAA_HARPNUM_MAPPING (noaa, harpnum)
                VALUES (?, ?)
            """, (int(noaa), int(row["HARPNUM"])))
        except sqlite3.IntegrityError as e:
            print(f"Integrity error for {noaa}, {row['HARPNUM']}")

In [103]:
cur.execute("""
CREATE TABLE NOAAS (
    noaa INTEGER PRIMARY KEY
)
""")

cur.execute("""
INSERT INTO NOAAS (noaa)
SELECT DISTINCT noaa
FROM NOAA_HARPNUM_MAPPING
""")

<sqlite3.Cursor at 0x7fa1e2f7c5c0>

In [104]:
conn.commit()

In [105]:
clear_sanjiv.columns

Index(['cme_date', 'cme_width', 'cme_speed', 'flare_time', 'Class', 'AR',
       'comments', 'processed_ar'],
      dtype='object')

In [106]:
cur.execute("""DROP TABLE IF EXISTS SANJIV_SRC""")
conn.commit()

In [111]:
cur.execute("""DROP TABLE IF EXISTS SANJIV_SRC""")
cur.execute("""
    CREATE TABLE SANJIV_SRC (
        cme_id INTEGER PRIMARY KEY REFERENCES CMES (cme_id),
        cme_date TEXT,
        cme_width REAL,
        cme_speed REAL,
        flare_time TEXT,
        flare_class TEXT,
        noaa INTEGER REFERENCES NOAAS (noaa)
        )
""")
conn.commit()

In [112]:
from astropy.time import Time
import datetime
import numpy as np

diffs = []
conn.execute("PRAGMA foreign_keys = ON")

for idx, row in clear_sanjiv.dropna(subset=["processed_ar"]).iterrows():
    # First step, matching the CME to a cme_id

    cme_date = row["cme_date"]
    cme_width = row["cme_width"]

    # Try to parse the date that has format dd/mm/yyyy hh:mm
    # using datetime

    try:
        cme_date = datetime.datetime.strptime(cme_date, "%d/%m/%Y %H:%M")
        cme_date = Time(cme_date)
    except ValueError:
        print(f"Could not parse date {cme_date} for row {idx}")
    
    # Now we need to query for the closest cmes to this
    target_timestamp = cme_date.strftime("%Y-%m-%d %H:%M:%S")

    query = """
        SELECT cme_id, cme_date, cme_width,
        ABS(strftime('%s', cme_date) - strftime('%s', ?)) AS diff
        FROM CMES
        WHERE diff <= 60
    """

    df = pd.read_sql_query(query, conn, params=(target_timestamp,))

    if len(df) == 0:
        print(f"No CME found for date {cme_date}")
        continue

    df["width_diff"] = np.abs(df["cme_width"] - cme_width)

    if min(df["width_diff"]) > 1:
        print(f"No CME found for date {cme_date}")
        continue

    df = df.sort_values(by="width_diff", ascending=True)

    diffs.append([df.iloc[0]["diff"], df.iloc[0]["width_diff"]])

    matching_cme_id = df.iloc[0]["cme_id"]

    # Now similarly for the flare

    flare_class = row["Class"]

    flare_date = row["flare_time"]
    # Strip of spaces to the sides
    flare_date = flare_date.strip()
    # And if last character is a : remove it
    if flare_date[-1] == ":":
        flare_date = flare_date[:-1]

    try:
        flare_date = datetime.datetime.strptime(flare_date, "%d/%m/%Y %H:%M")
        flare_date = Time(flare_date)
    except ValueError:
        # Could be in yyyy/mm/dd hh:mm format
        try:
            flare_date = datetime.datetime.strptime(flare_date, "%Y/%m/%d %H:%M")
            flare_date = Time(flare_date)
        except ValueError:
            print(f"Could not parse date {flare_date} for row {idx}")
    
    # Add to database, formatting dates as yyyy-mm-dd hh:mm:ss

    try:
        cur.execute("""
            INSERT INTO SANJIV_SRC (cme_id, cme_date, cme_width, cme_speed, flare_time, flare_class, noaa)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (int(matching_cme_id), cme_date.strftime("%Y-%m-%d %H:%M:%S"), float(cme_width), int(row["cme_speed"]), str(flare_date.strftime("%Y-%m-%d %H:%M:%S")), str(flare_class), int(row["processed_ar"])))
    except sqlite3.OperationalError as e:
        raise e

conn.commit()

No CME found for date 2011-11-26 00:00:00


In [57]:
clear_sanjiv

Unnamed: 0,cme_date,cme_width,cme_speed,flare_time,Class,AR,comments,processed_ar
21,24/03/2011 17:48,36,193,24/03/2011 17:01,C9.1,11176,clear,11176.0
22,27/04/2011 02:36,257,924,27/04/2011 02:26,C2.0,11201,Very Clear Event,11201.0
23,12/05/2011 13:25,95,274,12/05/2011 12:11,C2.0,unnamed,Very Clear Event,
24,16/05/2011 00:12,50,265,15/05/2011 23:25,C4.8,11208,clear,11208.0
26,01/06/2011 18:36,189,361,01/06/2011 16:51,C4.1,11226,Very Clear Event,11226.0
...,...,...,...,...,...,...,...,...
269,23/03/2014 03:36,360,820,23/03/2014 02:27,C3.1,12014,clear,12014.0
270,28/03/2014 23:48,138,514,28/03/2014 23:44,M2.6,12017,clear,12017.0
271,29/03/2014 02:48,117,414,29/03/2014 01:54,C2.4,12024,Very clear event,12024.0
272,29/03/2014 18:12,360,528,29/03/2014 17:35,X1.0,12017,Very clear event,12017.0
