# Data Processing
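Notebook that cleans the verified-tweet and flare datasets and aggregates them into one row per day (`Sol`), saved as the `aurorae_minimal_dataset` files.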

In [79]:
# dependencies
import os

import numpy as np
import pandas

In [80]:
# Load the raw inputs: two yearly verified-tweet files and the flare table.
# Only the columns used later in the notebook are read.
tweet_columns = ["st_y", "st_x", "location_country", "created_at"]
flare_columns = ["Sol", "JJJ Start", "JJJ Peak", "JJJ End", "JJJ Class", "HHH X-pos", "HHH y-pos"]

verifications_2015 = pandas.read_csv(
    "2015_pos_verified_tweets_cleaned.csv",
    encoding='latin-1',
    usecols=tweet_columns,
)
verifications_2016 = pandas.read_csv(
    "2016_pos_verified_tweets_cleaned.csv",
    encoding='latin-1',
    usecols=tweet_columns,
)
flare_data = pandas.read_csv(
    "flares_and_instruments_v2.csv",
    encoding='latin-1',
    usecols=flare_columns,
)

# Stack both years into a single frame with a fresh 0..n-1 index.
yearly_frames = [verifications_2015, verifications_2016]
verifications = pandas.concat(yearly_frames, ignore_index=True)

In [81]:
# Normalise the date-time strings of both datasets.
#   flare data has        (YYYY-MM-DD)'T'(HH:MM:SS)
#   verification data has (YYYY-MM-DD) (HH:MM:SS)'+00'
# NOTE: this cell reads verifications['created_at'] and drops it at the end,
# so re-run the loading cell before running this cell a second time.

# Strip the trailing three characters ('+00') from the verification timestamps.
# The vectorised .str accessor propagates missing values, whereas slicing each
# element inside .apply raises on a NaN.
verifications['created_at'] = verifications['created_at'].str[:-3]

# Replace the 'T' separator with a space in every flare timestamp column.
# regex=False makes the literal replacement explicit.
for flare_time_column in ('JJJ Start', 'JJJ Peak', 'JJJ End'):
    flare_data[flare_time_column] = flare_data[flare_time_column].str.replace("T", " ", regex=False)

# Reduce Sol to its first eight characters, i.e. the date in format YYYYMMDD.
flare_data['Sol'] = flare_data['Sol'].str[:8]

# Add a matching Sol (YYYYMMDD) column to the verifications. Timestamps that
# cannot be parsed become NaT (errors='coerce') and are reported, not fatal.
verifications['created_at'] = pandas.to_datetime(verifications['created_at'], errors='coerce')
if verifications['created_at'].isnull().any():
    print("Warning: Some rows have invalid datetime values!")
verifications['Sol'] = verifications['created_at'].dt.strftime('%Y%m%d')

# Remove everything but Sol in one call; errors='ignore' keeps the original
# tolerance for a column that is already absent.
verifications = verifications.drop(
    columns=['created_at', 'st_x', 'st_y', 'location_country'],
    errors='ignore',
)

In [82]:
# Parse the three flare timestamps into datetimes.
for time_column in ('JJJ Start', 'JJJ Peak', 'JJJ End'):
    flare_data[time_column] = pandas.to_datetime(flare_data[time_column])

# Express total duration and time-to-peak in minutes, both measured from the start.
flare_start = flare_data['JJJ Start']
start_to_end = flare_data['JJJ End'] - flare_start
start_to_peak = flare_data['JJJ Peak'] - flare_start
flare_data['flare_duration'] = start_to_end.dt.total_seconds() / 60
flare_data['flare_peak'] = start_to_peak.dt.total_seconds() / 60

# The raw start/peak/end timestamps are not needed once the derived columns exist.
flare_data = flare_data.drop(columns=['JJJ Start', 'JJJ Peak', 'JJJ End'])

In [85]:
# Parse Sol (YYYYMMDD) into datetimes so it can be matched against other date columns.
flare_data['Sol'] = pandas.to_datetime(flare_data['Sol'], format='%Y%m%d')

# One row per Sol: mean flare duration, mean time-to-peak and number of flares.
# Named aggregation builds and names all three columns in a single groupby
# pass, replacing a second groupby plus a positional column rename that would
# silently mislabel the columns if their order ever changed.
sol_summary = (
    flare_data
    .groupby('Sol')
    .agg(
        avg_duration=('flare_duration', 'mean'),
        avg_peak=('flare_peak', 'mean'),
        # 'size' counts every row in the group, including rows with missing values.
        flare_count=('flare_duration', 'size'),
    )
    .reset_index()
)

In [86]:
# Parse the verification dates so they share a dtype with sol_summary['Sol'].
verifications['Sol'] = pandas.to_datetime(verifications['Sol'], format='%Y%m%d')

# Number of verification rows per day, indexed by date.
sol_aurora_counts = verifications['Sol'].value_counts()

# Look up each summary day in the counts; days without any verification are
# missing from the lookup and therefore get a count of 0.
# Days that only appear in verifications (no flare rows) are not in
# sol_summary and so are not represented here.
matched_counts = sol_summary['Sol'].map(sol_aurora_counts)
sol_summary['aurora_count'] = matched_counts.fillna(0).astype(int)

# Binary target: 1 when the day has at least one verification, else 0.
sol_summary['target'] = sol_summary['aurora_count'].gt(0).astype(int)

In [87]:
# Inspect the per-day summary. Only the last expression of a cell is rendered,
# so the former bare `flare_data` line above it displayed nothing and was removed.
sol_summary

Unnamed: 0,Sol,avg_duration,avg_peak,flare_count,aurora_count,target
0,2015-01-01,13.333333,4.333333,3,1,1
1,2015-01-02,4.500000,2.500000,6,7,1
2,2015-01-03,18.000000,11.272727,11,2,1
3,2015-01-04,27.666667,14.000000,6,6,1
4,2015-01-05,12.000000,8.500000,6,2,1
...,...,...,...,...,...,...
637,2016-12-26,14.000000,4.500000,4,1,1
638,2016-12-27,20.333333,13.000000,3,0,0
639,2016-12-29,9.000000,4.000000,1,0,0
640,2016-12-30,11.500000,5.000000,2,0,0


In [None]:
# Inspect the verification table; after the clean-up above only 'Sol' remains.
verifications

Unnamed: 0,Sol
0,2015-01-01
1,2015-01-02
2,2015-01-02
3,2015-01-02
4,2015-01-02
...,...
1335,2016-12-23
1336,2016-12-23
1337,2016-12-26
1338,2016-12-31


In [None]:
# NOTE(review): disabled legacy aggregation, kept for reference only.
# If re-enabled as written, the 'sighting_count' line reads 'created_at',
# which the date-time cell removes from verifications, so it would fail.

# # group verifications by date
# verifications_unique = verifications.groupby('Sol').agg(
#     {col: lambda x: tuple(x) for col in verifications.columns if col != 'Sol'}
# ).reset_index()

# # add a column for the number of verifications per date
# verifications_unique['sighting_count'] = verifications_unique['created_at'].apply(len)

# # group flare_data by date
# flare_data_unique = flare_data.groupby('Sol').agg(
#     {col: lambda x: tuple(x) for col in flare_data.columns if col != 'Sol'}
# ).reset_index()

# # add a column for the number of flares per date
# flare_data_unique['Flare Count'] = flare_data_unique['JJJ Class'].apply(len)

In [None]:
# print(verifications_unique['sighting_count'])

In [None]:
# # merge dataframes
# aurorae = pandas.merge(flare_data_unique, verifications_unique, on='Sol', how='outer')

# # Add the 'aurora_visible' column
# aurorae['aurora_visible'] = np.where(aurorae['sighting_count'].notna() & (aurorae['sighting_count'] > 0), 1, 0)

# # Set 'sighting_count' to 0 where 'aurora_visible' is 0
# aurorae.loc[aurorae['aurora_visible'] == 0, 'sighting_count'] = 0

In [None]:
# save dataset
# Create the output folder first so a fresh checkout does not fail on the write.
os.makedirs('../data_processed', exist_ok=True)

# CSV copy: the DataFrame index is written as an unnamed first column.
sol_summary.to_csv('../data_processed/aurorae_minimal_dataset.csv')

# NumPy copy: np.save appends '.npy' and stores only the values, not the column names.
# NOTE(review): the frame mixes datetime and numeric columns, so this is
# presumably an object array that needs np.load(..., allow_pickle=True) — confirm.
np.save('../data_processed/aurorae_minimal_dataset', sol_summary)