In [2]:
%matplotlib inline

In [3]:
import os
from datetime import date
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<h1> A Machine Learning Model for Solar Flare Predictions </h1>

<h2>I. Overview</h2>

<h3> What is a solar flare? </h3>

A solar flare is a powerful eruption of electromagnetic radiation from the Sun. It occurs in active regions (AR), which are characterized by magnetic abnormalities - extremely strong and complex magnetic fields. A solar flare can release $10^{21}$ to $10^{25}$ J of energy. This can disrupt radio communication technologies and even cause outages. For example, the biggest solar flare ever recorded happened in 1859 and caused auroras that could be seen from the Caribbean, as well as blackouts and the failure of most telegraph systems across North America and Europe. That is why scientists are trying to find a way to predict when an event like this will happen.

In [4]:
#date format 2022.11.01_00:00:00-2022.11.02_00:00:00
import drms

def extractHMI_SHARP_720s(year):
    """Query the ``hmi.sharp_720s`` series for one calendar year and save it as CSV.

    The year is requested in consecutive 10-day windows that start at
    1 January 00:00:00; the last window is cut off at 31 December 23:59:59.
    Each window is printed before it is queried so progress stays visible.

    Parameters
    ----------
    year : int
        Calendar year to download.

    Returns
    -------
    None
        The combined table is written to
        ``data/HMI_SHARP_720s/HMI_SHARP_720s_<year>.csv``.
    """
    timeFormat = '%Y.%m.%d_%H:%M:%S'
    outputDir = 'data/HMI_SHARP_720s'

    # Create the target folder before downloading, so a missing directory
    # cannot throw away a finished download at the very last step.
    os.makedirs(outputDir, exist_ok=True)

    # A single client serves every window; there is no need to rebuild it per query.
    client = drms.Client()

    # Collect the per-window results and concatenate once at the end instead of
    # re-copying an ever-growing frame on every iteration.
    windowFrames = []

    currDate = datetime(year, 1, 11)
    pastDate = currDate + timedelta(days = -10)

    while currDate.year < year+1:

        url = pastDate.strftime(timeFormat) + '-' + currDate.strftime(timeFormat)
        print(url)

        windowFrames.append(client.query(
            'hmi.sharp_720s[][' + url + ']',
            key='T_REC,HARPNUM,TOTUSJH,TOTPOT,TOTUSJZ,ABSNJZH,SAVNCPP,USFLUX,AREA_ACR,MEANPOT,R_VALUE,SHRGT45,MEANSHR,MEANGAM,NOAA_AR,QUALITY'
        ))

        currDate = currDate + timedelta(days = 10)
        pastDate = pastDate + timedelta(days = 10)

        # When the window end spills into the next year, clamp it to the last
        # second of this year; the following advance then ends the loop.
        if pastDate.year != currDate.year:
            currDate = datetime(pastDate.year, 12, 31, 23, 59, 59)

    dataFrame = pd.concat(windowFrames, ignore_index = True)

    # Neighbouring windows share their boundary timestamp, so a record stamped
    # exactly on a boundary can come back from both queries; keep one copy.
    # NOTE(review): assumes the server treats both ends of a range as inclusive - confirm.
    dataFrame = dataFrame.drop_duplicates().reset_index(drop = True)

    dataFrame.to_csv(outputDir + '/HMI_SHARP_720s_' + str(year) + '.csv')
    


In [None]:
# Download the SHARP keywords one calendar year at a time, 2010 through 2022 inclusive.
for i in range(2010, 2022 + 1):
    print(f'Currently extracting data from the year {i}')
    extractHMI_SHARP_720s(i)

Currently extracting data from the year 2010
2010.01.01_00:00:00-2010.01.11_00:00:00


In [5]:
#date format 2022.11.01_00:00:00-2022.11.02_00:00:00
def extractCgemLorentz(year):
    """Query the ``cgem.lorentz`` series for one calendar year and save it as CSV.

    The year is requested in consecutive 10-day windows that start at
    1 January 00:00:00; the last window is cut off at 31 December 23:59:59.

    Parameters
    ----------
    year : int
        Calendar year to download.

    Returns
    -------
    None
        The combined table is written to
        ``data/cgemLorentz/cgemLorentz_<year>.csv``.
    """
    timeFormat = '%Y.%m.%d_%H:%M:%S'
    outputDir = 'data/cgemLorentz'

    # Create the target folder before downloading, so a missing directory
    # cannot throw away a finished download at the very last step.
    os.makedirs(outputDir, exist_ok=True)

    # A single client serves every window; there is no need to rebuild it per query.
    client = drms.Client()

    # Collect the per-window results and concatenate once at the end instead of
    # re-copying an ever-growing frame on every iteration.
    windowFrames = []

    currDate = datetime(year, 1, 11)
    pastDate = currDate + timedelta(days = -10)

    while currDate.year < year+1:

        url = pastDate.strftime(timeFormat) + '-' + currDate.strftime(timeFormat)

        windowFrames.append(client.query(
            'cgem.lorentz[][' + url + ']',
            key='T_REC,HARPNUM, TOTBSQ, TOTFZ, EPSZ, NOAA_AR, QUALITY'
        ))

        currDate = currDate + timedelta(days = 10)
        pastDate = pastDate + timedelta(days = 10)

        # When the window end spills into the next year, clamp it to the last
        # second of this year; the following advance then ends the loop.
        if pastDate.year != currDate.year:
            currDate = datetime(pastDate.year, 12, 31, 23, 59, 59)

    dataFrame = pd.concat(windowFrames, ignore_index = True)

    # Neighbouring windows share their boundary timestamp, so a record stamped
    # exactly on a boundary can come back from both queries; keep one copy.
    # NOTE(review): assumes the server treats both ends of a range as inclusive - confirm.
    dataFrame = dataFrame.drop_duplicates().reset_index(drop = True)

    dataFrame.to_csv(outputDir + '/cgemLorentz_' + str(year) + '.csv')
    


In [10]:
# Download the cgem.lorentz keywords one calendar year at a time, 2010 through 2022 inclusive.
for i in range(2010, 2022 + 1):
    print(f'Currently extracting data from the year {i}')
    extractCgemLorentz(i)

Currently extracting data from the year 2010
Currently extracting data from the year 2011
Currently extracting data from the year 2012
Currently extracting data from the year 2013
Currently extracting data from the year 2014
Currently extracting data from the year 2015
Currently extracting data from the year 2016
Currently extracting data from the year 2017
Currently extracting data from the year 2018
Currently extracting data from the year 2019
Currently extracting data from the year 2020
Currently extracting data from the year 2021
Currently extracting data from the year 2022


In [6]:
def _findActiveRegionTable(tables):
    """Return the first table whose top-left cell is the active-region caption, or None."""
    caption = "Today's/Yesterday's NOAA Active Regions"
    for table in tables:
        # Tables parsed with named headers, or with no rows, have no cell at
        # label (0, 0); skip them instead of failing with a KeyError.
        if 0 in table.columns and 0 in table.index and table[0][0] == caption:
            return table
    return None


def extractDataFromSolarMonitor(year):
    """Scrape the daily NOAA active-region table from solarmonitor.org for one year.

    One page is fetched per calendar day. The table captioned
    "Today's/Yesterday's NOAA Active Regions" is extracted, its header row is
    promoted to column names, and a leading ``Date`` column is added. Days whose
    page contains no such table are reported on stdout and skipped.

    Parameters
    ----------
    year : int
        Calendar year to scrape. For 2010 the scrape starts on 5 August
        instead of 1 January.

    Returns
    -------
    None
        The combined table is written to
        ``data/Solar Monitor/Solar_Monitor_<year>.csv``.
    """
    outputDir = 'data/Solar Monitor'

    # Create the target folder before scraping, so a missing directory cannot
    # throw away a finished scrape at the very last step.
    os.makedirs(outputDir, exist_ok=True)

    currDate = date(year, 1, 1)
    if year == 2010:
        # NOTE(review): presumably the archive has no usable pages before this date - confirm.
        currDate = date(year, 8, 5)

    print('Currently extracting data from year ' + str(currDate.year))

    # Collect the daily tables and concatenate once at the end instead of
    # re-copying an ever-growing frame on every iteration.
    dailyFrames = []

    while currDate.year < year+1:
        dateStr = currDate.strftime('%Y%m%d')
        url = 'https://www.solarmonitor.org/index.php?date=' + dateStr

        table = _findActiveRegionTable(pd.read_html(url))
        if table is None:
            # Without this guard the list of tables itself would be treated as
            # a frame and fail with an unrelated AttributeError.
            print('No active region table found for ' + dateStr + ', skipping')
        else:
            # The caption sits in row 0 and the column names in the row below.
            # Drop the caption; reset_index() keeps the old row labels in a
            # column called 'index'.
            df = table.drop([0]).reset_index()
            # Promote the header row to column names. The same mapping renames
            # 'index' to that row's own old label, 1 (assuming the default
            # 0..n row labels produced by read_html).
            df = df.rename(columns = df.iloc[0])
            # Drop the header row, then discard both bookkeeping columns.
            df = df.drop([0]).reset_index()
            df = df.drop(columns = ['index', 1])
            df.insert(0, 'Date', currDate.strftime('%Y.%m.%d'))
            dailyFrames.append(df)

        currDate = currDate + timedelta(days = 1)

    # pd.concat refuses an empty list, so fall back to an empty frame when every day was skipped.
    if dailyFrames:
        dataFrame = pd.concat(dailyFrames, ignore_index = True)
    else:
        dataFrame = pd.DataFrame()

    dataFrame.to_csv(outputDir + '/Solar_Monitor_' + str(year) + '.csv')

In [7]:
# Scrape solarmonitor.org one calendar year at a time, 2013 through 2022 inclusive.
first, last = 2013, 2022
for i in range(first, last + 1):
    extractDataFromSolarMonitor(i)

Currently extracting data from year 2013
Currently extracting data from year 2014
Currently extracting data from year 2015
Currently extracting data from year 2016
Currently extracting data from year 2017
Currently extracting data from year 2018
Currently extracting data from year 2019
Currently extracting data from year 2020
Currently extracting data from year 2021
Currently extracting data from year 2022
