<a href="https://colab.research.google.com/github/JD314/Proyecto-solar/blob/main/Helios.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The board is available [here](https://bit.ly/3rsCVsU)

<h1>Helios</h1>

In [None]:
!wget https://raw.githubusercontent.com/JD314/Proyecto-solar/main/training.json

--2022-10-03 14:40:27--  https://raw.githubusercontent.com/JD314/Proyecto-solar/main/training.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27015 (26K) [text/plain]
Saving to: ‘training.json’


2022-10-03 14:40:28 (15.3 MB/s) - ‘training.json’ saved [27015/27015]



# Train the model

We used scikit-learn's random forest classifier to categorize the data into the geomagnetic storm levels proposed by NOAA according to the Kp intensity index.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from joblib import dump
import pandas as pd

# Column names used as model inputs, and the seed shared by the split and the model.
FEATURES = ['flux', 'v', 'minv', 'maxv']
SEED = 8

# Load the labelled CME events downloaded by the previous cell. A relative path is
# used so the file is found wherever wget saved it, not only under Colab's /content.
cme = pd.read_json('training.json')

x = cme[FEATURES]
y = cme["Kp"]

# The features are passed to the forest unscaled; `x_scaled` is kept as an alias
# of `x` so any later cell that refers to that name keeps working.
x_scaled = x

# Hold out 10% of the events, stratified so every Kp class appears in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, stratify=y, test_size=0.10, random_state=SEED
)

# Seed the forest as well: otherwise the bootstrap samples (and therefore the
# reported score) change on every Restart & Run All.
classifier = RandomForestClassifier(n_estimators=100, random_state=SEED)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# accuracy_score is the fraction of exactly-matching labels, i.e. accuracy, not precision.
print("Accuracy:", accuracy_score(y_test, y_pred))

Precision: 0.5263157894736842


# Real time


In [None]:
# Functions to obtain near-real-time data from GOES-16 and LASCO

def _read_cactus_catalog(url: str, header_row: int = 26):
    """Download a CACTus ``cmecat.txt`` file and parse its CME table.

    The table's header is expected on line index ``header_row`` and the table
    ends at the first line that is exactly ``' \\n'``.

    Arguments: url: str; header_row: int (0-based index of the column-name line)
    OUTPUT: DataFrame indexed by 'CME', with the 't0' column parsed to datetimes
            and renamed to 'time_tag'.
    Raises: ValueError if the file is too short or the table terminator is missing.
    """
    import io
    import urllib.request

    import pandas as pd

    # The context manager closes the HTTP response once the text has been read.
    with urllib.request.urlopen(url) as response:
        lines = [raw.decode("utf-8") for raw in response]

    table = lines[header_row:]
    if not table:
        raise ValueError(
            f"CACTus catalog at {url} has {len(lines)} lines; "
            f"expected the CME table header at line {header_row}"
        )
    try:
        table_end = table.index(' \n')
    except ValueError:
        raise ValueError(
            f"CACTus catalog at {url} has no ' \\n' line terminating the CME table"
        ) from None
    table = table[:table_end]
    if not table:
        raise ValueError(f"CACTus catalog at {url} has no CME table header")

    # Split every line on '|' and strip the markers ('#', '?') and the newline.
    rows = [
        [cell.replace('\n', '').replace('?', '').replace('#', '') for cell in line.split('|')]
        for line in table
    ]

    # The first row holds the column names; spaces are removed from them.
    raw_frame = pd.DataFrame(rows)
    raw_frame.columns = [name.replace(' ', '') for name in raw_frame.iloc[0]]

    # Round-trip through JSON so pandas infers numeric dtypes for the text cells.
    # The JSON text is wrapped in StringIO because passing a literal JSON string
    # to read_json is deprecated in recent pandas releases.
    body_json = raw_frame.iloc[1:].to_json()
    lasco = pd.read_json(io.StringIO(body_json)).set_index('CME')
    lasco['t0'] = pd.to_datetime(lasco['t0'])

    return lasco.rename(columns={'t0': 'time_tag'})


def get_lasco(year:str, month:str):
    """Get a DataFrame of the CMEs listed in the LASCO CACTus catalog for a given month.

    The original author notes that this catalog is updated every five days.

    Arguments: year: str; month: str (in numeric format)
    Example of expected arguments: year='2021', month='04'
    OUTPUT: DataFrame indexed by 'CME' with a datetime 'time_tag' column
    """
    return _read_cactus_catalog(
        f'https://www.sidc.be/cactus/catalog/LASCO/2_5_0/qkl/{year}/{month}/cmecat.txt'
    )


def get_lasco_rt():
    """Get a DataFrame of the CMEs listed in the LASCO CACTus near-real-time catalog.

    OUTPUT: DataFrame indexed by 'CME' with a datetime 'time_tag' column
    """
    return _read_cactus_catalog('https://www.sidc.be/cactus/out/cmecat.txt')

# Function to fetch the GOES X-ray flux data
def get_goes():
    """Get the 7-day real-time X-ray data from the GOES 'primary' feed.

    OUTPUT: DataFrame whose 'time_tag' column is converted to pandas datetimes
    """
    source = 'https://services.swpc.noaa.gov/json/goes/primary/xrays-7-day.json'
    frame = pd.read_json(source)

    def _strip_iso_markers(stamp):
        # Replace the 'T' separator with a space and drop the trailing 'Z'.
        return stamp.replace('T', ' ').replace('Z', '')

    # -- Timestamp handling: clean the text first, then parse it --
    cleaned = frame['time_tag'].apply(_strip_iso_markers)
    frame.time_tag = pd.to_datetime(cleaned)

    return frame



In [None]:

from datetime import datetime
import pandas as pd


# Reference "now", used to decide whether last month's catalog is also needed.
today = datetime.now()

# Get the real-time data: the CME catalog and the 7-day GOES X-ray series.
cme = get_lasco_rt()
goes = get_goes()

if today.day < 3:  # at the beginning of the month, also take data from the end of last month
    last_date = today - pd.Timedelta(4, 'd')  # today - 4 days, which falls in the previous month
    year, month = last_date.strftime("%Y"), last_date.strftime("%m")

    last_mont = get_lasco(year, month)  # last month's catalog
    last_mont = last_mont[last_mont.time_tag > last_date]

    cme = pd.concat([cme, last_mont])  # append last month's records

# -- Append flux --
cme.time_tag = pd.to_datetime(cme.time_tag)

cme = cme[cme.da > 170]  # keep only the big CMEs detected by LASCO (da > 170)

# For each CME take the largest X-ray flux seen within +/- dt0/10 hours of its
# time_tag (the /10 factor is the original author's tuning). Rows are walked by
# position rather than by label because the concat above can repeat CME ids.
peak_flux = []
for event_time, dt0 in zip(cme.time_tag, cme.dt0):
    half_window = pd.Timedelta(dt0 / 10, 'h')
    in_window = (goes.time_tag > event_time - half_window) & (goes.time_tag < event_time + half_window)
    # .max() of an empty selection is NaN, which marks "no GOES samples in the window".
    peak_flux.append(goes.loc[in_window, 'flux'].max())

CME = cme.assign(flux=pd.Series(peak_flux, dtype='float64').to_numpy())

# -- Append prediction --
# Each event is predicted from its OWN feature row (the previous loop always fed the
# classifier the last row of the table). Events without a flux value keep kp = NaN.
feature_cols = ['flux', 'v', 'minv', 'maxv']
CME['kp'] = float('nan')
has_flux = CME['flux'].notna().to_numpy()
if has_flux.any():
    CME.loc[has_flux, 'kp'] = classifier.predict(CME.loc[has_flux, feature_cols])



# References

<h3>Data Usage</h3>

[Automated detection of CMEs in LASCO data](https://ui.adsabs.harvard.edu/abs/2004A%26A...425.1097R/abstract) Berghmans, D.; Foing, B. H.; Fleck, B.<br>

[DSCOVR real time solar wind](https://www.swpc.noaa.gov/products/real-time-solar-wind)

[Goes Xray real time](https://www.swpc.noaa.gov/products/goes-x-ray-flux)
   