# Data Processing
Notebook for handling the processing of our datasets.

In [2]:
# dependencies
import numpy as np
import pandas

In [3]:
# Load the raw inputs: two yearly files of verified tweets and the flare table.
# Only the columns listed below are read from each file.
tweet_columns = ["st_y", "st_x", "location_country", "created_at"]
flare_columns = ["Sol", "JJJ Start", "JJJ Peak", "JJJ End", "JJJ Class", "HHH X-pos", "HHH y-pos"]

verifications_2015 = pandas.read_csv(
    "2015_pos_verified_tweets_cleaned.csv",
    usecols=tweet_columns,
    encoding='latin-1',
)
verifications_2016 = pandas.read_csv(
    "2016_pos_verified_tweets_cleaned.csv",
    usecols=tweet_columns,
    encoding='latin-1',
)
flare_data = pandas.read_csv(
    "flares_and_instruments_v2.csv",
    usecols=flare_columns,
    encoding='latin-1',
)

# Stack both years into a single frame with a fresh 0..n-1 index.
verifications = pandas.concat(
    [verifications_2015, verifications_2016],
    ignore_index=True,
)

In [4]:
# fixing date-time formats
# flare data has (YYYY-MM-DD)'T'(HH:MM:SS)
# verification data has (YYYY-MM-DD) (HH:MM:SS)'+00'

# Remove the trailing '+00' from the verification timestamps.
# The .str accessor passes missing values through as NaN instead of raising, so
# they reach the errors='coerce' handling below; anchoring on the suffix leaves
# values that do not end in '+00' untouched rather than chopping 3 characters.
verifications['created_at'] = verifications['created_at'].str.replace(r'\+00$', '', regex=True)

# replace 'T' with ' ' in the flare timestamps (literal replacement, no regex)
for timestamp_col in ['JJJ Start', 'JJJ Peak', 'JJJ End']:
    flare_data[timestamp_col] = flare_data[timestamp_col].str.replace('T', ' ', regex=False)

# reduce Sol to only be the date in format YYYYMMDD (its first 8 characters)
flare_data['Sol'] = flare_data['Sol'].str[:8]

# add Sol column to verifications
# Unparseable or missing timestamps become NaT; warn so they are not dropped silently.
verifications['created_at'] = pandas.to_datetime(verifications['created_at'], errors='coerce')
if verifications['created_at'].isnull().any():
    print("Warning: Some rows have invalid datetime values!")
verifications['Sol'] = verifications['created_at'].dt.strftime('%Y%m%d')

# remove everything but Sol
verifications = verifications.drop(columns=['created_at', 'st_x', 'st_y', 'location_country'])

In [5]:
# Parse the three flare timestamps once, then derive two elapsed times in
# minutes: start -> end (flare_duration) and start -> peak (flare_peak).
flare_start = pandas.to_datetime(flare_data['JJJ Start'])
flare_peak_time = pandas.to_datetime(flare_data['JJJ Peak'])
flare_end = pandas.to_datetime(flare_data['JJJ End'])

elapsed_to_end = flare_end - flare_start
elapsed_to_peak = flare_peak_time - flare_start
flare_data['flare_duration'] = elapsed_to_end.dt.total_seconds() / 60
flare_data['flare_peak'] = elapsed_to_peak.dt.total_seconds() / 60

# The raw start / peak / end timestamps are no longer needed.
flare_data = flare_data.drop(columns=['JJJ Start', 'JJJ Peak', 'JJJ End'])

In [6]:
# Parse the YYYYMMDD Sol strings into datetimes so they can be grouped on.
flare_data['Sol'] = pandas.to_datetime(flare_data['Sol'], format='%Y%m%d')

# One row per Sol: mean flare duration, mean time to peak, and the number of
# flare rows recorded for that Sol.
sol_summary = (
    flare_data
    .groupby('Sol')
    .agg(
        avg_duration=('flare_duration', 'mean'),
        avg_peak=('flare_peak', 'mean'),
        flare_count=('flare_duration', 'size'),
    )
    .reset_index()
)

In [7]:
# Count verified tweets per Sol (the count Series is indexed by Sol date).
verifications['Sol'] = pandas.to_datetime(verifications['Sol'], format='%Y%m%d')
sol_aurora_counts = verifications['Sol'].value_counts()

# Look up each summary row's Sol in those counts; a Sol with no tweets gets 0.
tweets_per_sol = sol_summary['Sol'].map(sol_aurora_counts)
sol_summary['aurora_count'] = tweets_per_sol.fillna(0).astype(int)

# Binary target: 1 when at least one verified tweet exists for the Sol, else 0.
sol_summary['target'] = sol_summary['aurora_count'].gt(0).astype(int)

In [8]:
# save dataset
# NOTE: both export calls below are commented out, so this cell writes nothing.
# sol_summary.to_csv('../data_processed/aurorae_minimal_dataset.csv')
# np.save('../data_processed/aurorae_minimal_dataset', sol_summary)

In [20]:
# Column labels for the fixed-width OMNI listing, which is read without a header row.
column_names = [
    'YEAR', 'DOY', 'Hour',
    'BZ_nT_GSE', 'BZ_nT_GSM',
    'SW_Plasma_Speed_kms',
    'Kp_index', 'R_Sunspot_No', 'Dst_index_nT',
    'f10_7_index', 'AE_index_nT',
]
omni = pandas.read_fwf('nasaOMNI_data.lst', names=column_names, header=None)

# Build Sol from the year plus the zero-padded, three-digit day of year.
year_text = omni['YEAR'].astype(str)
day_of_year_text = omni['DOY'].astype(str).str.zfill(3)
omni['Sol'] = pandas.to_datetime(year_text + day_of_year_text, format='%Y%j')

# Only Sol is kept as the time key; the raw time columns are dropped.
omni = omni.drop(columns=['YEAR', 'DOY', 'Hour'])

In [21]:
# Display the OMNI frame after YEAR / DOY / Hour were replaced by Sol.
omni

Unnamed: 0,BZ_nT_GSE,BZ_nT_GSM,SW_Plasma_Speed_kms,Kp_index,R_Sunspot_No,Dst_index_nT,f10_7_index,AE_index_nT,Sol
0,-0.5,-0.6,498.0,13,104,-11,132.9,99,2015-01-01
1,-1.9,-2.1,419.0,23,122,-17,141.0,251,2015-01-02
2,1.4,1.4,447.0,23,102,-26,143.8,138,2015-01-03
3,-4.4,-4.4,410.0,30,105,-44,144.8,439,2015-01-04
4,0.6,0.3,494.0,27,88,-41,137.2,244,2015-01-05
...,...,...,...,...,...,...,...,...,...
726,-0.1,-0.0,586.0,23,16,-16,71.3,166,2016-12-27
727,-0.2,-0.2,462.0,13,12,-10,70.9,81,2016-12-28
728,-1.2,-1.1,389.0,10,11,-10,70.9,129,2016-12-29
729,-0.4,-0.2,338.0,7,11,-4,71.2,50,2016-12-30


In [22]:
# Attach the OMNI measurements to each flare-summary row, matching on Sol.
# The left join keeps every sol_summary row. validate="one_to_one" makes pandas
# raise a MergeError if Sol is repeated on either side, instead of silently
# duplicating summary rows.
omni_sol_summary = pandas.merge(sol_summary, omni, on="Sol", how="left", validate="one_to_one")

In [23]:
# Display the per-Sol flare summary with the OMNI columns joined on.
omni_sol_summary

Unnamed: 0,Sol,avg_duration,avg_peak,flare_count,aurora_count,target,BZ_nT_GSE,BZ_nT_GSM,SW_Plasma_Speed_kms,Kp_index,R_Sunspot_No,Dst_index_nT,f10_7_index,AE_index_nT
0,2015-01-01,13.333333,4.333333,3,1,1,-0.5,-0.6,498.0,13,104,-11,132.9,99
1,2015-01-02,4.500000,2.500000,6,7,1,-1.9,-2.1,419.0,23,122,-17,141.0,251
2,2015-01-03,18.000000,11.272727,11,2,1,1.4,1.4,447.0,23,102,-26,143.8,138
3,2015-01-04,27.666667,14.000000,6,6,1,-4.4,-4.4,410.0,30,105,-44,144.8,439
4,2015-01-05,12.000000,8.500000,6,2,1,0.6,0.3,494.0,27,88,-41,137.2,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,2016-12-26,14.000000,4.500000,4,1,1,-1.0,-0.9,673.0,33,0,-22,71.4,321
638,2016-12-27,20.333333,13.000000,3,0,0,-0.1,-0.0,586.0,23,16,-16,71.3,166
639,2016-12-29,9.000000,4.000000,1,0,0,-1.2,-1.1,389.0,10,11,-10,70.9,129
640,2016-12-30,11.500000,5.000000,2,0,0,-0.4,-0.2,338.0,7,11,-4,71.2,50


In [None]:
# Write the merged per-Sol table out twice: as CSV and through numpy's save.
# NOTE(review): the frame mixes a datetime Sol column with numeric columns, so
# np.save most likely stores a pickled object array without column names, and
# loading it back would need allow_pickle=True — confirm with downstream users.
csv_target = '../data_processed/nasaOMNI_with_flares.csv'
npy_target = '../data_processed/nasaOMNI_with_flares'
omni_sol_summary.to_csv(csv_target)
np.save(npy_target, omni_sol_summary)