In [520]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

%matplotlib inline

In [521]:
# Read the raw launch records; a literal "?" in the file is parsed as a missing value.
data = pd.read_csv("dataset.csv", na_values=["?"])

# Data Preparation (Categorical Data)

In [522]:
# Print the column names, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longitude       90 non-null     float64
 16  Latitude        90 non-null     float64
 17  Class           90 non-null     int64

## Find Categorical Data

### Data Format Correction

In [523]:
# Store the target column "Class" as a boolean so that it is picked up
# together with the other categorical (object / bool) columns.
data = data.astype({"Class": "bool"})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longitude       90 non-null     float64
 16  Latitude        90 non-null     float64
 17  Class           90 non-null     bool 

In [524]:
# Keep only the object- and bool-typed columns; these are treated as the
# categorical features in the rest of this section.
categorical_dtypes = [object, bool]
cat_data = data.select_dtypes(include=categorical_dtypes)
cat_data.head()

Unnamed: 0,Date,BoosterVersion,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
0,2010-06-04,Falcon 9,LEO,CCAFS SLC 40,None None,False,False,False,,B0003,False
1,2012-05-22,Falcon 9,LEO,CCAFS SLC 40,None None,False,False,False,,B0005,False
2,2013-03-01,Falcon 9,ISS,CCAFS SLC 40,None None,False,False,False,,B0007,False
3,2013-09-29,Falcon 9,PO,VAFB SLC 4E,False Ocean,False,False,False,,B1003,False
4,2013-12-03,Falcon 9,GTO,CCAFS SLC 40,None None,False,False,False,,B1004,False


## Selecting Data

In [525]:
# Count the distinct values in every categorical column.
cat_data.nunique()

Date              90
BoosterVersion     1
Orbit             11
LaunchSite         3
Outcome            8
GridFins           2
Reused             2
Legs               2
LandingPad         5
Serial            53
Class              2
dtype: int64

### Select Low Variance Data

Since the BoosterVersion feature takes only one value across all records, it carries no information for our data mining goal, so we decided to drop it.

In [560]:
# BoosterVersion holds a single value for every record, so it is removed.
# Rebinding the name (instead of `inplace=True`) together with
# errors="ignore" keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
cat_data = cat_data.drop(columns="BoosterVersion", errors="ignore")
cat_data.head(5)

Unnamed: 0,Date,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
0,2010-06-04,LEO,CCAFS SLC 40,None None,False,False,False,,B0003,False
1,2012-05-22,LEO,CCAFS SLC 40,None None,False,False,False,,B0005,False
2,2013-03-01,ISS,CCAFS SLC 40,None None,False,False,False,,B0007,False
3,2013-09-29,PO,VAFB SLC 4E,False Ocean,False,False,False,,B1003,False
4,2013-12-03,GTO,CCAFS SLC 40,None None,False,False,False,,B1004,False


## Cleaning Data

### Missing Value

Identify and calculate the percentage of the missing value of each feature

In [527]:
# Percentage of missing values per column: null count / number of rows * 100.
cat_data.isna().sum().div(len(cat_data)).mul(100)

Date           0.000000
Orbit          0.000000
LaunchSite     0.000000
Outcome        0.000000
GridFins       0.000000
Reused         0.000000
Legs           0.000000
LandingPad    28.888889
Serial         0.000000
Class          0.000000
dtype: float64

The result shows that 28.89% of the LandingPad values are missing, which is lower than our 80% threshold. Therefore, we neither drop the whole column nor remove the records with missing values; instead, we replace the missing values with the most frequent value.

In [528]:
# Impute LandingPad with its most frequent value (the mode).
# The column is selected by name rather than by position, so the result does
# not depend on the current column order, and the dict form of fillna
# restricts the replacement to LandingPad instead of every column.
landing_pad_mode = cat_data["LandingPad"].mode().iloc[0]
new_cat_data = cat_data.fillna({"LandingPad": landing_pad_mode})
new_cat_data.head(5)

Unnamed: 0,Date,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
0,2010-06-04,LEO,CCAFS SLC 40,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B0003,False
1,2012-05-22,LEO,CCAFS SLC 40,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B0005,False
2,2013-03-01,ISS,CCAFS SLC 40,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B0007,False
3,2013-09-29,PO,VAFB SLC 4E,False Ocean,False,False,False,5e9e3032383ecb6bb234e7ca,B1003,False
4,2013-12-03,GTO,CCAFS SLC 40,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B1004,False


In [529]:
# Confirm that no missing values remain after the imputation.
new_cat_data.isna().sum()

Date          0
Orbit         0
LaunchSite    0
Outcome       0
GridFins      0
Reused        0
Legs          0
LandingPad    0
Serial        0
Class         0
dtype: int64

### Outliers

No outlier detection for categorical data

### Duplicate Data

In [530]:
# Count fully duplicated records in the cleaned categorical data.
# `output` is only created further down (one-hot encoding cell), so using it
# here fails with NameError on a fresh kernel; `new_cat_data` is the frame
# that exists at this point in the notebook.
new_cat_data.duplicated().sum()

0

There is no duplicate record in the data set

## Handling Categorical Attributes

Before handling the categorical attributes, we identify the number of categories in each feature.

In [531]:
# Number of different values per category
def _dummy_shape(column):
    """Return the (rows, columns) shape of the one-hot expansion of one column."""
    return pd.get_dummies(new_cat_data[column]).shape


# The second entry of each shape is the number of distinct categories.
orbit_cat, launch_site_cat, landing_pad_cat, serial_cat = map(
    _dummy_shape, ["Orbit", "LaunchSite", "LandingPad", "Serial"]
)

print("Number of category in Orbit: %s"%(orbit_cat[1]))
print("Number of category in LaunchSite: %s"%(launch_site_cat[1]))
print("Number of category in LandingPad: %s"%(landing_pad_cat[1]))
print("Number of category in Serial: %s"%(serial_cat[1]))

Number of category in Orbit: 11
Number of category in LaunchSite: 3
Number of category in LandingPad: 5
Number of category in Serial: 53


Based on the result, the feature 'Serial' contains 53 different values, so one-hot encoding it would add 53 columns to the data set. Considering the small number of records in the data set, the number of features should be limited to avoid overfitting. Therefore, the feature 'Serial' will not be one-hot encoded and will instead be dropped, because it contributes less to the decision than the other features, as discussed in the Categorical Feature Analysis section.

Generate dummy variables (one-hot encoding via `pd.get_dummies`) for the categorical columns Orbit, LaunchSite, and LandingPad.

In [532]:
# One-hot encode the three low-cardinality columns, then swap the original
# columns for their indicator columns (appended on the right).
columns = ["Orbit", "LaunchSite", "LandingPad"]
dummies = pd.get_dummies(new_cat_data.loc[:, columns])
remaining = new_cat_data.drop(columns=columns)
output = remaining.join(dummies)
output.head(5)

Unnamed: 0,Date,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,...,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc
0,2010-06-04,None None,False,False,False,B0003,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,2012-05-22,None None,False,False,False,B0005,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,2013-03-01,None None,False,False,False,B0007,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,2013-09-29,False Ocean,False,False,False,B1003,False,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,2013-12-03,None None,False,False,False,B1004,False,0,0,1,...,0,0,1,0,0,0,0,1,0,0


In [533]:
# Compare the number of columns before and after the one-hot encoding step.
n_features_before = len(new_cat_data.columns)
n_features_after = len(output.columns)
print("Number of categorical features before applied OneHotEncoder: %s"%(n_features_before))
print("Number of categorical features after applied OneHotEncoder: %s"%(n_features_after))

Number of categorical features before applied OneHotEncoder: 10
Number of categorical features after applied OneHotEncoder: 26


## Feature Selection

### Feature Subset Selection

Identify and calculate the pairwise correlation of each feature pair. Identify and consider whether to drop the feature subset with high pairwise correlation or keep it.

In [534]:
# Pairwise correlation of the boolean features only.
# The frame also holds string columns (Date, Orbit, Serial, ...); since
# pandas 2.0 DataFrame.corr() no longer drops those silently and raises
# instead, so the bool columns are selected explicitly.
new_cat_data.select_dtypes(include="bool").corr()

Unnamed: 0,GridFins,Reused,Legs,Class
GridFins,1.0,0.229341,0.902302,0.64254
Reused,0.229341,1.0,0.155552,0.207582
Legs,0.902302,0.155552,1.0,0.673825
Class,0.64254,0.207582,0.673825,1.0


The result above shows that the features 'GridFins' and 'Legs' have a very high correlation of 0.90, which exceeds the 0.80 threshold we use to flag very high correlation between two attributes. However, based on the feature definitions, the two features are conceptually distinct and one does not imply the other. Therefore, we decided not to drop either feature.

### Construct Data

The feature 'Date' itself is not considered relevant for predicting the future landing outcome of a SpaceX rocket. Instead, a new feature 'Month' is constructed by extracting the month from 'Date', as we consider it a useful feature for predicting y.

In [535]:
# Replace the full launch date with its month, keeping the column in place.
# - The month is extracted with the vectorised datetime accessor instead of a
#   Python loop, and is stored as an integer (1-12) rather than a zero-padded
#   string such as "06".
# - The guard plus rebinding (no `inplace=True`) makes the cell safe to
#   re-run: once 'Date' has been converted, a second execution is a no-op
#   instead of a KeyError.
if "Date" in output.columns:
    flight_month = pd.to_datetime(output["Date"]).dt.month
    output = output.assign(Date=flight_month).rename(columns={"Date": "Month"})
output.head(5)

Unnamed: 0,Month,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,...,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc
0,6,None None,False,False,False,B0003,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5,None None,False,False,False,B0005,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,3,None None,False,False,False,B0007,False,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,9,False Ocean,False,False,False,B1003,False,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,12,None None,False,False,False,B1004,False,0,0,1,...,0,0,1,0,0,0,0,1,0,0


### Remove Feature

Drop the feature 'Outcome' because it is redundant with the y variable 'Class': it contains True to indicate y = 1 and None or False to indicate y = 0. It also records the landing region, but that information is not needed for our data mining goal, which is only to predict whether the rocket lands successfully or not.

In [536]:
# Remove 'Outcome', which encodes the same information as the target 'Class'.
# Rebinding with errors="ignore" (instead of `inplace=True`) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
output = output.drop(columns="Outcome", errors="ignore")
output.head(5)

Unnamed: 0,Month,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,Orbit_HEO,...,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc
0,6,False,False,False,B0003,False,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5,False,False,False,B0005,False,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,3,False,False,False,B0007,False,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,9,False,False,False,B1003,False,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,12,False,False,False,B1004,False,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0


Drop the feature 'Serial' because it has too many categories to be suitable for one-hot encoding in this case, and its values are so diverse that they contribute little to the prediction of y.

In [537]:
# Remove the high-cardinality 'Serial' column (not one-hot encoded above).
# Rebinding with errors="ignore" (instead of `inplace=True`) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
output = output.drop(columns="Serial", errors="ignore")
output.head(5)

Unnamed: 0,Month,GridFins,Reused,Legs,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,Orbit_HEO,Orbit_ISS,...,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc
0,6,False,False,False,False,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5,False,False,False,False,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,3,False,False,False,False,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
3,9,False,False,False,False,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,12,False,False,False,False,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0


## Feature Scaling

No feature scaling for categorical data

## Split Data

Split data into training and testing data set.

In [538]:
# `train_test_split` is imported in the notebook's import cell at the top.

# Separate the predictors from the target column 'Class'.
x = output.drop(columns="Class")
y = output["Class"]

# Split the dataset into training and testing data set:
# 20% of the records are held out, and the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

# Sanity check: every record ends up in exactly one of the two subsets.
assert len(x_train) + len(x_test) == len(x)

print('Original Dataset Shape: ', x.shape)
print('Training Dataset Shape: ', x_train.shape)
print('Testing Dataset Shape:', x_test.shape)

Original Dataset Shape:  (90, 23)
Training Dataset Shape:  (72, 23)
Testing Dataset Shape: (18, 23)
