In [1]:
# modules
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

%matplotlib inline

# ignore unnecessary
import warnings
warnings.filterwarnings("ignore")

In [2]:
# load the data
nasa_csv = pd.read_csv('nasa.csv', index_col=0)
space_csv = pd.read_csv('space.csv', index_col=0)

In [3]:
nasa_csv.head()

Unnamed: 0,Start_Datetime,End_Datetime,startFrequency,endFrequency,flare_Location,flare_region,importance,CME_Date,CME_Time,width,speed,CPA,is_halo,lower_bound
0,1997-04-01 14:00:00,1997-04-01 14:15:00,8000,4000,S25E16,8026,M1.3,04/01,15:18,79.0,312.0,74,False,False
1,1997-04-07 14:30:00,1997-04-07 17:30:00,11000,1000,S28E19,8027,C6.8,04/07,14:27,360.0,878.0,na,True,False
2,1997-05-12 05:15:00,1997-05-14 16:00:00,12000,80,N21W08,8038,C1.3,05/12,05:30,360.0,464.0,na,True,False
3,1997-05-21 20:20:00,1997-05-21 22:00:00,5000,500,N05W12,8040,M1.3,05/21,21:00,165.0,296.0,263,False,False
4,1997-09-23 21:53:00,1997-09-23 22:16:00,6000,2000,S29E25,8088,C1.4,09/23,22:02,155.0,712.0,133,False,False


In [4]:
len(nasa_csv)

482

In [5]:
space_csv.tail()

Unnamed: 0,Rank,X_class,Region,Start_time,Max_time,End_time
45,46,X2.7,2339,2015-05-05 22:05:00,2015-05-05 22:11:00,2015-05-05 22:15:00
46,47,X2.7,488,2003-11-03 01:09:00,2003-11-03 01:30:00,2003-11-03 01:45:00
47,48,X2.7,8210,1998-05-06 07:58:00,1998-05-06 08:09:00,1998-05-06 08:20:00
48,49,X2.6,720,2005-01-15 22:25:00,2005-01-15 23:02:00,2005-01-15 23:31:00
49,50,X2.6,9632,2001-09-24 09:32:00,2001-09-24 10:38:00,2001-09-24 11:09:00


In [6]:
len(space_csv)

50

## Getting Data Ready for Analysis
- Removing non-numerical items from numerical columns
- Types conversion 
- unifying attributes
 - NASA flare region has 5 digits. e.g. 10486
 - Space flare region has 4 digits. e.g. 0486

### Seperate importance 
- Seperate the importance column into 2 columns
 - importance_1: character representing the **Solar Flare** class
 - importance_2: float representing the class value

In [7]:
def seperateImportance(df, col_name='importance', numbers=True, letters=True):
    if col_name == 'X_class':
        # remove the char in the last
        df['X_class'] = df['X_class'].apply(lambda x: x[:-1] if x[-1] == '+' else x)

    if letters:
        df['importance_1'] = df[col_name].str.slice(start=0, stop=1)
    if numbers:
        df['importance_2'] = df[col_name].str.slice(start=1).astype('float')
    return df

In [8]:
# Cast attributes to datetime stamp 
def to_datetime(df, nasa=False, space=False):
    if nasa:
        df.Start_Datetime = pd.to_datetime(df.Start_Datetime)
        df.End_Datetime = pd.to_datetime(df.End_Datetime)
        df.CME_Time = pd.to_datetime(df.CME_Time)
    if space:
        df.Start_time = pd.to_datetime(df.Start_time)
        df.Max_time = pd.to_datetime(df.Max_time)
        df.End_time = pd.to_datetime(df.End_time)
    
    return df

In [9]:
# Cast time attributes to datetime stamp
nasa_csv = to_datetime(nasa_csv, nasa=True)
space_csv = to_datetime(space_csv, space=True)

In [10]:
# replacing 'non-numerical values' with nan in the importance column
nasa_csv = nasa_csv[nasa_csv.importance != 'FILA']

# replacing 'non-numerical values' with nan in the region column
nasa_csv.flare_region = nasa_csv.flare_region.str.extract('(\d+)', expand=False)

In [11]:
# cast flare region to float
nasa_csv.flare_region = nasa_csv.flare_region.astype('float')
# change NASA's flare region from 5 to 4 digits
nasa_csv.flare_region = nasa_csv.flare_region.apply(lambda x:x-10000 if x>10000 else x)

# preprocess space flare region
space_csv.Region = space_csv.Region.astype('float')

In [12]:
# show data types
space_csv.dtypes

Rank                   int64
X_class               object
Region               float64
Start_time    datetime64[ns]
Max_time      datetime64[ns]
End_time      datetime64[ns]
dtype: object

In [13]:
nasa_csv.dtypes

Start_Datetime    datetime64[ns]
End_Datetime      datetime64[ns]
startFrequency            object
endFrequency              object
flare_Location            object
flare_region             float64
importance                object
CME_Date                  object
CME_Time          datetime64[ns]
width                    float64
speed                    float64
CPA                       object
is_halo                     bool
lower_bound                 bool
dtype: object

# Part 2 Q1: Replication  

In [14]:
# seperate importance
nasa_csv = seperateImportance(nasa_csv, 'importance')

### Sorting the dataframe according to the importance of the Solar Flare
- First, it sorts using `importance_1` values so the character `X` is put at the top of the table
- Second, it sorts using `importance_2` values so the flares with the highes value are put at the top

In [15]:
nasa_csv = nasa_csv.sort_values(['importance_1', 'importance_2'], ascending=False)

In [16]:
# drop the importance_1, importance_2 columns
nasa_csv.drop(['importance_1', 'importance_2'], axis=1, inplace=True)

In [17]:
# showing top 3
nasa_csv[:3]

Unnamed: 0,Start_Datetime,End_Datetime,startFrequency,endFrequency,flare_Location,flare_region,importance,CME_Date,CME_Time,width,speed,CPA,is_halo,lower_bound
242,2003-11-04 20:00:00,2003-11-05 00:00:00,10000,200,S19W83,486.0,X28.,11/04,2020-03-26 19:54:00,360.0,2657.0,na,True,False
119,2001-04-02 22:05:00,2001-04-03 02:30:00,14000,250,N19W72,9393.0,X20.,04/02,2020-03-26 22:06:00,244.0,2505.0,261,False,False
234,2003-10-28 11:10:00,2003-10-30 00:00:00,14000,40,S16E08,486.0,X17.,10/28,2020-03-26 11:30:00,360.0,2459.0,na,True,False


### Replication Analysis

In [18]:
# get the top 50 of nasa
nasa_50 = nasa_csv[:50]

### Replication Criteria 
- in Both NASA and SPACE dataframes
 1. Get all flares that happened in the same region
 2. Get all flares that have the same starting time 
 3. Check the number of rows
 4. Check the mean of the flare `X_class` and `importance`

In [19]:
dicOfMatch={}
for s_row in space_csv.itertuples(index=False):
    same_region = []
    # checking the same region
    for n_row in nasa_50.itertuples(index=False):
        if s_row.Region == n_row.flare_region:
             same_region.append(n_row)

    # chacking the same starting date[year, month, day]
    for n_row in same_region:
        if n_row.Start_Datetime.date()==s_row.Start_time.date():
            dicOfMatch[s_row]=n_row
print(f'Number of matching rows: {len(dicOfMatch)}')

Number of matching rows: 32


In [20]:
match_nasa = pd.DataFrame(dicOfMatch.values())
match_nasa[:5]

Unnamed: 0,Start_Datetime,End_Datetime,startFrequency,endFrequency,flare_Location,flare_region,importance,CME_Date,CME_Time,width,speed,CPA,is_halo,lower_bound
0,2003-11-04 20:00:00,2003-11-05 00:00:00,10000,200,S19W83,486.0,X28.,11/04,2020-03-26 19:54:00,360.0,2657.0,na,True,False
1,2001-04-02 22:05:00,2001-04-03 02:30:00,14000,250,N19W72,9393.0,X20.,04/02,2020-03-26 22:06:00,244.0,2505.0,261,False,False
2,2003-10-28 11:10:00,2003-10-30 00:00:00,14000,40,S16E08,486.0,X17.,10/28,2020-03-26 11:30:00,360.0,2459.0,na,True,False
3,2001-04-15 14:05:00,2001-04-16 13:00:00,14000,40,S20W85,9415.0,X14.,04/15,2020-03-26 14:06:00,167.0,1199.0,245,False,False
4,2003-10-29 20:55:00,2003-10-30 00:00:00,11000,500,S15W02,486.0,X10.,10/29,2020-03-26 20:54:00,360.0,2029.0,na,True,False


In [21]:
match_space = pd.DataFrame(dicOfMatch.keys())
match_space[:5]

Unnamed: 0,Rank,X_class,Region,Start_time,Max_time,End_time
0,1,X28+,486.0,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00
1,2,X20+,9393.0,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00
2,3,X17.2+,486.0,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00
3,5,X14.4,9415.0,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00
4,6,X10,486.0,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00


In [22]:
match_nasa = seperateImportance(match_nasa, letters=False)
match_space = seperateImportance(match_space, 'X_class', letters=False)

In [23]:
# getting the mean
print("Mean:", (abs(match_nasa.importance_2 - match_space.importance_2)).mean())

Mean: 0.19374999999999998


### Conclusion Analysis 
- When doing the matching using the region where the flare happened and the date of the region
 - we only get 32 matching rows
 - the mean differnce between the NASA's `importance` and SPACE `X_class` is: `0.19374999999999998`
- So based on that the 2 datasets have slightly different values for the same Flare event, so we won't be able to replicate the whole data with high accuracy

# Part 2 Q2: Integration  

- get the common attributes
 - Start Time, End Stime, Region, CME Time, Importance
- define the matching criteria

In [24]:
# get smaller version of the 2 dataframes with only the common columns
nasa_50_small = nasa_50[['importance', 'flare_region', 'Start_Datetime', 'End_Datetime', 'CME_Time']]

In [25]:
nasa_50_small.head()

Unnamed: 0,importance,flare_region,Start_Datetime,End_Datetime,CME_Time
242,X28.,486.0,2003-11-04 20:00:00,2003-11-05 00:00:00,2020-03-26 19:54:00
119,X20.,9393.0,2001-04-02 22:05:00,2001-04-03 02:30:00,2020-03-26 22:06:00
234,X17.,486.0,2003-10-28 11:10:00,2003-10-30 00:00:00,2020-03-26 11:30:00
128,X14.,9415.0,2001-04-15 14:05:00,2001-04-16 13:00:00,2020-03-26 14:06:00
235,X10.,486.0,2003-10-29 20:55:00,2003-10-30 00:00:00,2020-03-26 20:54:00


### Criteria
1. Region -> Matching(Equality)
2. get the row with the minimum starting date

In [44]:
def bestMatching(df1=space_csv, df2=nasa_50_small):
    # seperate the value of the X-class/Importance from space
    df1 = seperateImportance(df1, 'X_class', letters=False)
    df2 = seperateImportance(df2, letters=False)

    nasa_csv['Space_Rank'] = ""

    rows = []
    rank = {}
    for row in df1.itertuples(index=True):
        near_x = {}
        for r in df2.itertuples(index=True):
            near_x[abs(r.importance_2 - row.importance_2)] = r
        if len(near_x) > 0:
            best_match = near_x[min(near_x.keys())] 
            rows.append(best_match)
            nasa_csv.Space_Rank.loc[best_match.Index] = nasa_csv.Space_Rank.loc[best_match.Index]+str(row.Rank)+' '
        # calc mean error
        mean_error = abs(pd.DataFrame(rows).importance_2 - df1.importance_2).mean()

    
    
    return pd.DataFrame(rows).drop('importance_2', axis=1), mean_error

In [45]:
nasa_best_matching, mean_error = bestMatching(space_csv, nasa_50)
print(f'Mean Error: {mean_error}')
nasa_best_matching

Mean Error: 0.020000000000000007


Unnamed: 0,Index,Start_Datetime,End_Datetime,startFrequency,endFrequency,flare_Location,flare_region,importance,CME_Date,CME_Time,width,speed,CPA,is_halo,lower_bound
0,242,2003-11-04 20:00:00,2003-11-05 00:00:00,10000,200,S19W83,486.0,X28.,11/04,2020-03-26 19:54:00,360.0,2657.0,na,True,False
1,119,2001-04-02 22:05:00,2001-04-03 02:30:00,14000,250,N19W72,9393.0,X20.,04/02,2020-03-26 22:06:00,244.0,2505.0,261,False,False
2,234,2003-10-28 11:10:00,2003-10-30 00:00:00,14000,40,S16E08,486.0,X17.,10/28,2020-03-26 11:30:00,360.0,2459.0,na,True,False
3,234,2003-10-28 11:10:00,2003-10-30 00:00:00,14000,40,S16E08,486.0,X17.,10/28,2020-03-26 11:30:00,360.0,2459.0,na,True,False
4,128,2001-04-15 14:05:00,2001-04-16 13:00:00,14000,40,S20W85,9415.0,X14.,04/15,2020-03-26 14:06:00,167.0,1199.0,245,False,False
5,235,2003-10-29 20:55:00,2003-10-30 00:00:00,11000,500,S15W02,486.0,X10.,10/29,2020-03-26 20:54:00,360.0,2029.0,na,True,False
6,8,1997-11-06 12:20:00,1997-11-07 08:30:00,14000,100,S18W63,8100.0,X9.4,11/06,2020-03-26 12:10:00,360.0,1556.0,na,True,False
7,8,1997-11-06 12:20:00,1997-11-07 08:30:00,14000,100,S18W63,8100.0,X9.4,11/06,2020-03-26 12:10:00,360.0,1556.0,na,True,False
8,330,2006-12-05 10:50:00,2006-12-05 20:00:00,14000,250,S07E68,930.0,X9.0,,NaT,,,,False,False
9,238,2003-11-02 17:30:00,2003-11-03 01:00:00,12000,250,S14W56,486.0,X8.3,11/02,2020-03-26 17:30:00,360.0,2598.0,na,True,False


In [46]:
nasa_csv

Unnamed: 0,Start_Datetime,End_Datetime,startFrequency,endFrequency,flare_Location,flare_region,importance,CME_Date,CME_Time,width,speed,CPA,is_halo,lower_bound,Space_Rank
242,2003-11-04 20:00:00,2003-11-05 00:00:00,10000,200,S19W83,486.0,X28.,11/04,2020-03-26 19:54:00,360.0,2657.0,na,True,False,1
119,2001-04-02 22:05:00,2001-04-03 02:30:00,14000,250,N19W72,9393.0,X20.,04/02,2020-03-26 22:06:00,244.0,2505.0,261,False,False,2
234,2003-10-28 11:10:00,2003-10-30 00:00:00,14000,40,S16E08,486.0,X17.,10/28,2020-03-26 11:30:00,360.0,2459.0,na,True,False,3 4
128,2001-04-15 14:05:00,2001-04-16 13:00:00,14000,40,S20W85,9415.0,X14.,04/15,2020-03-26 14:06:00,167.0,1199.0,245,False,False,5
235,2003-10-29 20:55:00,2003-10-30 00:00:00,11000,500,S15W02,486.0,X10.,10/29,2020-03-26 20:54:00,360.0,2029.0,na,True,False,6
8,1997-11-06 12:20:00,1997-11-07 08:30:00,14000,100,S18W63,8100.0,X9.4,11/06,2020-03-26 12:10:00,360.0,1556.0,na,True,False,7 8
330,2006-12-05 10:50:00,2006-12-05 20:00:00,14000,250,S07E68,930.0,X9.0,,NaT,,,,False,False,9
238,2003-11-02 17:30:00,2003-11-03 01:00:00,12000,250,S14W56,486.0,X8.3,11/02,2020-03-26 17:30:00,360.0,2598.0,na,True,False,10 11
290,2005-01-20 07:15:00,2005-01-20 16:30:00,14000,25,N14W61,720.0,X7.1,01/20,2020-03-26 06:54:00,360.0,882.0,na,True,False,12
360,2011-08-09 08:20:00,2011-08-09 08:35:00,16000,4000,N17W69,1263.0,X6.9,08/09,2020-03-26 08:12:00,360.0,1610.0,na,True,False,13


# Plotting 

In [None]:
colors_list = ['#5cb85c','#d9534f']

## Preprocessing 

In [None]:
# LOAD NASA CSV
nasa_csv = pd.read_csv('nasa.csv', index_col=0)

In [None]:
nasa_csv.head()

In [None]:
nasa_csv.dtypes

# Halo Proportion: Nasa vs Nasa Top 50


In [None]:
nasa_50 = to_datetime(nasa_50)

In [None]:
nasa_50.dtypes

In [None]:
nasa_50.head()

In [None]:
# no 2
sums = [nasa_50.is_halo.sum(), nasa_csv.is_halo.sum()]
fig = plt.figure(figsize=(7, 5))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(['Top 50', 'Nasa'], sums, color=colors_list)
ax.set_title('Number of Halos', fontsize=20)
ax.set_ylabel('# Halos', fontsize=16)
ax.set_xlabel('Data Frame', fontsize=16)
plt.show()

In [None]:
nasa_50.Start_Datetime.iloc[0].month

In [None]:
flares_num = {}
for t in nasa_50.Start_Datetime:
    if t.month in flares_num:
        flares_num[t.month] += 1
    else:
        flares_num[t.month] = 1

In [None]:
def flares_per_month(df):
    flares_num = {}
    for t in df.Start_Datetime:
        if t.month in flares_num:
            flares_num[t.month] += 1
        else:
            flares_num[t.month] = 1
            
    return flares_num

In [None]:
flares_num_50 = flares_per_month(nasa_50)

In [None]:
flares_num_50

In [None]:
len(flares_num_50)

In [None]:
plt.bar(flares_num_50.keys(), flares_num_50.values(), color=colors_list[0])
plt.xticks(range(1,13))
plt.show()

In [None]:
flares_num_all = flares_per_month(nasa_csv)
len(flares_num_all)

In [None]:
flares_num_all

In [None]:
plt.bar(flares_num_all.keys(), flares_num_all.values() ,color=colors_list[1])
plt.xticks(range(1,13))
plt.show()

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(flares_num_all.keys(), flares_num_all.values(), color=colors_list[1], width=0.35)
ax.bar(np.array(list(flares_num_50.keys())) + .35, flares_num_50.values(), color=colors_list[0], width=0.35)
ax.set_ylabel('Number of Flares', fontsize=16)
ax.set_xlabel('Month', fontsize=16)
ax.set_title('Number of Flares per months', fontsize=20)
ax.set_xticks(range(1, 13))
ax.legend(labels=['Full Nasa', 'Top 50'], fontsize=16)
ax.grid(True, alpha=0.5, ls='--')
plt.show()