# SpaceX Assignment
---

# Import Python Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

# Data Collection

In [None]:
data = pd.read_csv("dataset.csv", na_values = "?")

# Data Exploration

In [None]:
data.head(5)

In [None]:
data.isnull().sum()

In [None]:
#Shape of dataset
print("Number of Features: ", data.shape[1])
print("Number of Instances: ", data.shape[0])

In [None]:
data.info()

In [None]:
data.groupby('Class').size()

In [None]:
data.columns

## Categorical Features Analysis

In [None]:
cat_data = data.select_dtypes(include= object)
cat_data.head()

In [None]:
cat_data.select_dtypes('object').nunique()

As we can observe, the date feature has been stored as object data type and later needs to be converted into date type. Also, since the BoosterVersion feature only has one value, we can remove this feature entirely.

In [None]:
for col in cat_data.columns:
        print(col,":")
        print(cat_data[col].value_counts(), "\n")

From observing the categorical data, we can see that there are only nominal features in this dataset. For nominal features, we can simply use one hot encoding as we do not need to reserve the order unlike ordinal features. For the feature Serial, it has 53 unique value which would drastically increase the size after we apply one hot encoding.

In [None]:
df = cat_data.drop(['Date'], axis=1)

fig, axes =plt.subplots(2,3, figsize=(20,10), sharex=True)
axes = axes.flatten()

for ax, col in zip(axes, df):
    sns.countplot(y=col, hue='Class', data=data, ax=ax, order = df[col].value_counts().index, palette='Set2')

plt.tight_layout()  
plt.show()

From the visualization above, we can observe that most of the features except Serial have quite a good correlation with the output class. Upon taking a closer look at Serial, it is very diverse and many of them hold less amount of decision as compared to other features.

## Numerical Features Analysis

In [None]:
data.describe()

## Correlation

1. Gives quantifiable relationship between features
2. Corr is calculated using various methods like 'pearson', 'spearman', etc.
3. Corr can be represented using correlation heatmap

In [None]:
data.corr()

In [None]:
plt.figure(figsize =(10,8))
corrmat = data.corr()
sns.heatmap(corrmat, square=True, annot=True, cbar=True, cmap="Blues");  #Blues
plt.title('- Correlation matrix and Heatmap -',fontsize = 25)

FlightNumber and Block, GridFins and Legs both have strong positive correlation. Hence, we can choose either one feature from each pair and drop it.

---
# Data Visualization

In [None]:
# Visualize the landing outcome of each launch site to know which launch side had the most successful landings 

plt.figure(figsize = (20,8), dpi = 80)
sns.countplot(x = "LaunchSite", data = data, hue = "Outcome", palette = sns.hls_palette(9,l=.7,s=.8));
plt.title("Landing Outcome Of Each Launch Site", fontsize = 15)
plt.xlabel("Launch Site")
plt.ylabel("Landing Outcome")
plt.show()

In [None]:
# Visualize the relationship between payload mass and landing outcome

x = data.loc[:,"PayloadMass"]
y = data.loc[:,"Outcome"]

plt.figure(figsize = (20,8), dpi = 80)
# sns.palplot(sns.hls_palette(9,l=.7,s=.8))
# plt.scatter(x,y)
# sns.catplot(x='PayloadMass',y='Orbit',data=data,hue='Class')

sns.catplot(x = 'PayloadMass',y = 'Outcome',data = data,hue = 'Class')

plt.title("Payload Mass vs Landing Outcome", fontsize = 15)
plt.xlabel("Payload Mass")
plt.ylabel("Landing Outcome")
plt.show()

In [None]:
# Visualize the relationship between the flight month and the landing outcome

date = data.loc[:, "Date"]
flight_month = []

for i in date:
    flight_month.append(i.split("-")[1])
flight_month_df = pd.DataFrame(flight_month, columns = ["Month"])

data_copy = data.copy(deep = False)
data_copy["Date"] = flight_month_df
plt.figure(figsize = (20,8), dpi = 80)
sns.countplot(x = "Date", data = data_copy, hue = "Class", order = ['01','02','03','04','05','06','07','08','09','10','11','12'])
plt.title("Landing Outcome of Each Flight Month", fontsize = 15)
plt.xlabel("Flight Month")
plt.ylabel("Landing Outcome")
plt.show()


In [None]:
visual = data.groupby(['LaunchSite','Longitude','Latitude']).count().reset_index()
visual['TotalMass'] = data.groupby(['LaunchSite','Longitude','Latitude']).sum().reset_index()['PayloadMass'] 
visual['text'] = 'Launch Site: ' + visual['LaunchSite'] + '<br>Number of Flights: ' + visual['Flights'].astype(str) + '<br>Total Payload Mass: ' + visual['TotalMass'].astype(int).astype(str)

fig = go.Figure(data=go.Scattergeo(
        locationmode = 'USA-states',
        lon = visual['Longitude'],
        lat = visual['Latitude'],
        text = visual['text'],
        mode = 'markers',
        marker = dict(
            size = 8,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'square',
            line = dict(
                width=1,
                color='rgba(102, 102, 102)'
            ),
            colorscale = 'Blues',
            cmin = 0,
            color = visual['Flights'],
            cmax = visual['Flights'].max(),
            colorbar_title="Number of Flights"
        )))

fig.update_layout(
        title = 'Location, Number of Flights and Total Payload Mass from each Launch Site<br>(Hover for more details)',
        geo = dict(
            scope='usa',
            projection_type='albers usa',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )

fig.show()

In [None]:
data['Status'] = data['Class'].apply(lambda x: 'Success' if x == 1 else 'Failure')

plt.figure(figsize=(12,6))

ax = sns.barplot(x='Status',
                 y='ReusedCount',
                 data=data, 
                 hue='LaunchSite')

plt.title('Relationship between Number of Reused Parts on Flight Success', fontsize=15)
plt.ylabel('Number of Reused Parts')

In [None]:
mpl.style.use(['ggplot'])
df_new = data.groupby(['Orbit', 'Class'])['Class'].count()
df_new.plot(kind='bar', figsize=(20, 5))
plt.title('Total Number of Class (0 or 1) vs Orbit')
plt.ylabel('Total Number of Class (0 or 1)')
plt.xlabel('Orbits')

In [None]:
df_new1 = data.groupby(['Orbit'])['PayloadMass'].sum()
df_new1.plot(kind='bar', figsize=(20, 5))
plt.title('Total Payload Mass vs Orbit')
plt.xlabel('Orbits')
plt.ylabel('Payload Mass')

In [None]:
#Visualization of the number of flights launched by each launch site in each year

#Obtain "Year" values from the date of each flight
def get_year():
    for i in df['Date']:
        year.append(i.split("-")[0])
    return year
   
year = []
df = data.copy()
year = get_year()
df['Date'] = year
#df.head()


plt.figure(figsize = (12,6))
sns.set(style="whitegrid")
sns.countplot(x = 'Date', 
              data = df,
              hue = 'LaunchSite', 
              palette = sns.hls_palette(9,l=.7,s=.8), saturation = 1.5)


plt.title('Number of Flights Launched by Each Launch Site from the year of 2010 to 2020', fontsize = 15)
plt.ylabel('Number of flights launched')
plt.xlabel('Year')
#plt.legend(loc='upper left', frameon=True)
leg = plt.legend(loc='upper left')
leg.get_frame().set_edgecolor('black')
plt.show()

In [None]:
sns.catplot(x='ReusedCount', 
            y='PayloadMass', 
            data=data, 
            hue='Class',  
            col='Class',  
            kind='swarm', 
            palette = 'magma')

#sns.displot(x = 'ReusedCount', data = data[data.ReusedCount.isin([1,2,3,4,5])],
            #col = 'Class', row = 'ReusedCount',
            #hue = 'Orbit', palette = 'colorblind')
            #facet_kws={'despine': True}

plt.subplots_adjust(top = 0.85)
plt.suptitle('Correlation between Payload Mass & Reused Count to Successful/Failed Landing', fontsize = 15)
plt.ylabel('Payload Mass')
plt.xlabel('Reused Count')
#plt.legend(loc='lower right')
plt.show()

In [None]:
data1 = df.loc[df['Outcome'].isin(['True Ocean','True ASDS','True RTLS'])]
#data1.head()
#total = data1.GridFins.count()
sns.set(style="whitegrid")

plt.figure(figsize=(10,5))

vs1 = sns.countplot( data=data1,x='Outcome',hue='GridFins')

plt.title('Number of Successful Landing with GridFins')
plt.ylabel('Total Number')

In [None]:
#Visualization of Successful Landing on Landing Pad
sns.set(style="darkgrid")

plt.figure(figsize=(15,5))

vs1 = sns.countplot(x = 'LandingPad',data=data1,hue='Outcome')

plt.title('Total Number of Success Landing On LandingPad',fontsize = 16)
plt.ylabel('Total Number')
plt.xlabel('Landing Pad Code')

---
# Object Queries

1) To find out how many times the core with specified serial number has been reused

In [None]:
data.groupby(['Serial']).sum()['ReusedCount']

2) To determine the total payload mass for each launch site

In [None]:
#data.groupby(['LaunchSite']).sum()['PayloadMass']
df = data.copy()
oq2 = df.groupby(['LaunchSite']).sum()
print(oq2[['PayloadMass']])

3) To find the outcome with using grid fins

In [None]:
data.groupby(['Outcome']).sum()['GridFins']

4) To determine the total number of successful landing for each orbit type

In [None]:
data.groupby(['Orbit']).sum()['Class']

# Data Preparation (Categorical Data)

In [1141]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longitude       90 non-null     float64
 16  Latitude        90 non-null     float64
 17  Class           90 non-null     int64

## Find Categorical Data

### Data Format Correction

In [1142]:
data["Class"] = data["Class"].astype("bool")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   FlightNumber    90 non-null     int64  
 1   Date            90 non-null     object 
 2   BoosterVersion  90 non-null     object 
 3   PayloadMass     90 non-null     float64
 4   Orbit           90 non-null     object 
 5   LaunchSite      90 non-null     object 
 6   Outcome         90 non-null     object 
 7   Flights         90 non-null     int64  
 8   GridFins        90 non-null     bool   
 9   Reused          90 non-null     bool   
 10  Legs            90 non-null     bool   
 11  LandingPad      64 non-null     object 
 12  Block           90 non-null     float64
 13  ReusedCount     90 non-null     int64  
 14  Serial          90 non-null     object 
 15  Longitude       90 non-null     float64
 16  Latitude        90 non-null     float64
 17  Class           90 non-null     bool 

In [1143]:
cat_data = data.select_dtypes(include = {object, bool})
cat_data.head()

Unnamed: 0,Date,BoosterVersion,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
0,2010-06-04,Falcon 9,LEO,CCAFS SLC 40,None None,False,False,False,,B0003,False
1,2012-05-22,Falcon 9,LEO,CCAFS SLC 40,None None,False,False,False,,B0005,False
2,2013-03-01,Falcon 9,ISS,CCAFS SLC 40,None None,False,False,False,,B0007,False
3,2013-09-29,Falcon 9,PO,VAFB SLC 4E,False Ocean,False,False,False,,B1003,False
4,2013-12-03,Falcon 9,GTO,CCAFS SLC 40,None None,False,False,False,,B1004,False


## Selecting Data

In [1144]:
cat_data.nunique()

Date              90
BoosterVersion     1
Orbit             11
LaunchSite         3
Outcome            8
GridFins           2
Reused             2
Legs               2
LandingPad         5
Serial            53
Class              2
dtype: int64

### Select Low Variance Data

Since the BoosterVersion feature has only one value, we decided to drop the BoosterVersion feature which is not relevant to our data mining goal

In [1172]:
cat_data.drop("BoosterVersion", axis = 1, inplace = True)
cat_data.head(5)

Unnamed: 0,Date,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
0,2010-06-04,LEO,CCAFS SLC 40,None None,False,False,False,,B0003,False
1,2012-05-22,LEO,CCAFS SLC 40,None None,False,False,False,,B0005,False
2,2013-03-01,ISS,CCAFS SLC 40,None None,False,False,False,,B0007,False
3,2013-09-29,PO,VAFB SLC 4E,False Ocean,False,False,False,,B1003,False
4,2013-12-03,GTO,CCAFS SLC 40,None None,False,False,False,,B1004,False


## Split Data

Split data into training and testing data set.

In [1146]:
# Load the python library of sklearn
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing data set
train, test = train_test_split(cat_data, test_size = 0.2, random_state = 2)

print('Original Dataset Shape: ', cat_data.shape)
print('Training Dataset Shape: ', train.shape)
print('Testing Dataset Shape:', test.shape)

Original Dataset Shape:  (90, 10)
Training Dataset Shape:  (72, 10)
Testing Dataset Shape: (18, 10)


## Cleaning Data (Training Dataset)

### Missing Value In Training Dataset

Identify and calculate the percentage of the missing value of each feature in training dataset

In [1147]:
train.isnull().sum()/len(train)*100

Date           0.000000
Orbit          0.000000
LaunchSite     0.000000
Outcome        0.000000
GridFins       0.000000
Reused         0.000000
Legs           0.000000
LandingPad    31.944444
Serial         0.000000
Class          0.000000
dtype: float64

The result shows the percentage of the missing value of the LandingPad feature in the training dataset is 31.94% which is lower than 80%. Therefore, we are not consider to drop the whole column or all records with missing value but decided to replace the missing value with the most frequent value.

In [1148]:
# cat_data.mode().loc[0,"LandingPad"]
new_cat_data = train.fillna(train.mode().iloc[0,7])
new_cat_data.head(5).sort_index()

Unnamed: 0,Date,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
27,2017-03-16,GTO,KSC LC 39A,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B1030,False
30,2017-05-15,GTO,KSC LC 39A,None None,False,False,False,5e9e3032383ecb6bb234e7ca,B1034,False
35,2017-08-14,ISS,KSC LC 39A,True RTLS,True,False,True,5e9e3032383ecb267a34e7c7,B1039,True
44,2018-01-31,GTO,CCAFS SLC 40,True Ocean,True,True,True,5e9e3032383ecb6bb234e7ca,B1032,True
57,2018-11-15,GTO,KSC LC 39A,True ASDS,True,True,True,5e9e3032383ecb6bb234e7ca,B1047,True


In [1149]:
new_cat_data.isnull().sum()

Date          0
Orbit         0
LaunchSite    0
Outcome       0
GridFins      0
Reused        0
Legs          0
LandingPad    0
Serial        0
Class         0
dtype: int64

### Outliers

No outlier detection for categorical data

### Duplicate Data

In [1150]:
new_cat_data.duplicated().sum()

0

There is no duplicate record in the training dataset

## Handling Categorical Attributes (Training Dataset)

Before handling the categorical attributes, the number of category of each feature is identified.

In [1151]:
# Number of different values per category
orbit_cat = pd.get_dummies(new_cat_data["Orbit"]).shape
launch_site_cat = pd.get_dummies(new_cat_data["LaunchSite"]).shape
landing_pad_cat = pd.get_dummies(new_cat_data["LandingPad"]).shape
serial_cat = pd.get_dummies(new_cat_data["Serial"]).shape

print("Number of category in Orbit: %s"%(orbit_cat[1]))
print("Number of category in LaunchSite: %s"%(launch_site_cat[1]))
print("Number of category in LandingPad: %s"%(landing_pad_cat[1]))
print("Number of category in Serial: %s"%(serial_cat[1]))

Number of category in Orbit: 10
Number of category in LaunchSite: 3
Number of category in LandingPad: 4
Number of category in Serial: 45


Based on the result, we can see that the feature 'Serial' in the training set contains 45 different values which will increase 45 columns to the data set after applied the OneHotEncoder. Consider the low amount of records in the data set, the number of features should be limited to avoid the problem of overfitting. Therefore, the feature 'Serial' will not be applied the OneHotDecoder and considered to be dropped due to it hold less amount of decision compared to other features as said in the Categorical Feature Analysis section.

Generate dummies variable to apply OneHotEncoder to the categorical columns Orbits, LaunchSite, and LandingPad.

In [1152]:
columns = ["Orbit", "LaunchSite", "LandingPad"]
dummies = pd.get_dummies(new_cat_data[columns])
output = pd.concat([new_cat_data.drop(columns = columns), dummies], axis = 1)
output.head(5).sort_index()

Unnamed: 0,Date,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,...,Orbit_SO,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc
27,2017-03-16,None None,False,False,False,B1030,False,0,0,1,...,0,0,0,0,1,0,0,0,1,0
30,2017-05-15,None None,False,False,False,B1034,False,0,0,1,...,0,0,0,0,1,0,0,0,1,0
35,2017-08-14,True RTLS,True,False,True,B1039,True,0,0,0,...,0,0,0,0,1,0,1,0,0,0
44,2018-01-31,True Ocean,True,True,True,B1032,True,0,0,1,...,0,0,0,1,0,0,0,0,1,0
57,2018-11-15,True ASDS,True,True,True,B1047,True,0,0,1,...,0,0,0,0,1,0,0,0,1,0


In [1153]:
print("Number of categorical features before applied OneHotEncoder: %s"%(new_cat_data.shape[1]))
print("Number of categorical features after applied OneHotEncoder: %s"%(output.shape[1]))

Number of categorical features before applied OneHotEncoder: 10
Number of categorical features after applied OneHotEncoder: 24


## Feature Selection (Training Dataset)

### Feature Subset Selection

Identify and calculate the pairwise correlation of each feature pair. Identify and consider whether to drop the feature subset with high pairwise correlation or keep it.

In [1154]:
new_cat_data.corr()

Unnamed: 0,GridFins,Reused,Legs,Class
GridFins,1.0,0.21924,0.882778,0.647496
Reused,0.21924,1.0,0.127442,0.198341
Legs,0.882778,0.127442,1.0,0.685061
Class,0.647496,0.198341,0.685061,1.0


From above result, it shows the feature 'GridFins' and 'Legs' has a very high correlation of 0.88 which is larger than the value of 0.80 to consider very high correlation between two attributes. However, based on the feature definition, there is a low correlation between the two features. Therefore, we decided to not drop the features.

### Construct Data

Consider feature 'Date' is not a relevant feature to predict the future landing outcome of the SpaceX rocket. New feature of 'Month' are decided to be constructed by extracting feature from the feature 'Date' as consider it is a useful feature to the prediction of y.

In [1155]:
date = output.loc[:, "Date"]
flight_month = []

for i in date:
    flight_month.append(i.split("-")[1])
flight_month_df = pd.DataFrame(flight_month, columns = ["Month"])

output["Date"] = flight_month_df
output.rename(columns = {'Date':'Month'}, inplace = True)
output.head(5).sort_index()

Unnamed: 0,Month,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,...,Orbit_SO,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc
27,12,None None,False,False,False,B1030,False,0,0,1,...,0,0,0,0,1,0,0,0,1,0
30,9,None None,False,False,False,B1034,False,0,0,1,...,0,0,0,0,1,0,0,0,1,0
35,1,True RTLS,True,False,True,B1039,True,0,0,0,...,0,0,0,0,1,0,1,0,0,0
44,12,True Ocean,True,True,True,B1032,True,0,0,1,...,0,0,0,1,0,0,0,0,1,0
57,5,True ASDS,True,True,True,B1047,True,0,0,1,...,0,0,0,0,1,0,0,0,1,0


### Remove Feature

Drop the redundant feature 'Outcome' due to it is highly correlated to the y variable 'Class' by it contains True to indicate y = 1 and None or False to indicate y = 0. Also, it contains another information to indicate the landing region but the information is considered useless for our data mining goal to only predict whether the rocket can successfully landed or not.

In [1156]:
output.drop("Outcome", axis = 1, inplace = True)
output.head(5)

Unnamed: 0,Month,GridFins,Reused,Legs,Serial,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,Orbit_ISS,...,Orbit_SO,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc
30,9,False,False,False,B1034,False,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
27,12,False,False,False,B1030,False,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
44,12,True,True,True,B1032,True,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
35,1,True,False,True,B1039,True,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
57,5,True,True,True,B1047,True,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


Drop the feature 'Serial' due to it contains too many number of category which is not suitable to apply OneHotEncoder in this case and the data is very diverse to contribute less to the y prediction.

In [1157]:
output.drop("Serial", axis = 1, inplace = True)
output.head(5)

Unnamed: 0,Month,GridFins,Reused,Legs,Class,Orbit_ES-L1,Orbit_GEO,Orbit_GTO,Orbit_ISS,Orbit_LEO,...,Orbit_SO,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb554034e7c9,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc
30,9,False,False,False,False,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
27,12,False,False,False,False,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
44,12,True,True,True,True,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
35,1,True,False,True,True,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
57,5,True,True,True,True,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


## Feature Scaling

No feature scaling for categorical data

## Cleaning Data (Testing Dataset)

### Missing Value In Testing Dataset

Identify and calculate the percentage of the missing value of each feature in training dataset

In [1158]:
test.isnull().sum()/len(test)*100

Date           0.000000
Orbit          0.000000
LaunchSite     0.000000
Outcome        0.000000
GridFins       0.000000
Reused         0.000000
Legs           0.000000
LandingPad    16.666667
Serial         0.000000
Class          0.000000
dtype: float64

The same method to replace missing value in the training dataset will be applied to the testing dataset. There is 16.67% of missing values in the feature 'LandingPad' in the testing dataset and will be replaced by the most frequent value in the same column of training dataset, which is the value of '5e9e3032383ecb6bb234e7ca'

In [1159]:
new_test_data = test.fillna(train.mode().iloc[0,7])
new_test_data.head(5).sort_index()

Unnamed: 0,Date,Orbit,LaunchSite,Outcome,GridFins,Reused,Legs,LandingPad,Serial,Class
36,2017-08-24,SSO,VAFB SLC 4E,True ASDS,True,False,True,5e9e3033383ecbb9e534e7cc,B1038,True
48,2018-04-18,HEO,CCAFS SLC 40,True ASDS,True,False,True,5e9e3032383ecb6bb234e7ca,B1045,True
77,2020-04-22,VLEO,KSC LC 39A,True ASDS,True,True,True,5e9e3032383ecb6bb234e7ca,B1051,True
79,2020-06-04,VLEO,CCAFS SLC 40,True ASDS,True,True,True,5e9e3033383ecbb9e534e7cc,B1049,True
81,2020-06-30,MEO,CCAFS SLC 40,True ASDS,True,False,True,5e9e3033383ecbb9e534e7cc,B1060,True


In [1160]:
new_test_data.isnull().sum()

Date          0
Orbit         0
LaunchSite    0
Outcome       0
GridFins      0
Reused        0
Legs          0
LandingPad    0
Serial        0
Class         0
dtype: int64

### Duplicate Data

In [1161]:
new_test_data.duplicated().sum()

0

There is no duplicate record in the testing dataset.

## Handling Categorical Attributes (Testing Dataset)

In [1162]:
# Number of different values per category
orbit_cat = pd.get_dummies(new_test_data["Orbit"]).shape
launch_site_cat = pd.get_dummies(new_test_data["LaunchSite"]).shape
landing_pad_cat = pd.get_dummies(new_test_data["LandingPad"]).shape
serial_cat = pd.get_dummies(new_test_data["Serial"]).shape

print("Number of category in Orbit: %s"%(orbit_cat[1]))
print("Number of category in LaunchSite: %s"%(launch_site_cat[1]))
print("Number of category in LandingPad: %s"%(landing_pad_cat[1]))
print("Number of category in Serial: %s"%(serial_cat[1]))

Number of category in Orbit: 8
Number of category in LaunchSite: 3
Number of category in LandingPad: 4
Number of category in Serial: 16


The feature 'Serial' will not be applied the OneHotDecoder as same as in the training dataset. Generate dummies variable to apply OneHotEncoder to the categorical columns Orbits, LaunchSite, and LandingPad.

In [1163]:
columns = ["Orbit", "LaunchSite", "LandingPad"]
dummies = pd.get_dummies(new_test_data[columns])
test_output = pd.concat([new_test_data.drop(columns = columns), dummies], axis = 1)
test_output.head(5).sort_index()

Unnamed: 0,Date,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_GTO,Orbit_HEO,Orbit_ISS,...,Orbit_PO,Orbit_SSO,Orbit_VLEO,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3032383ecb761634e7cb,LandingPad_5e9e3033383ecbb9e534e7cc
36,2017-08-24,True ASDS,True,False,True,B1038,True,0,0,0,...,0,1,0,0,0,1,0,0,0,1
48,2018-04-18,True ASDS,True,False,True,B1045,True,0,1,0,...,0,0,0,1,0,0,0,1,0,0
77,2020-04-22,True ASDS,True,True,True,B1051,True,0,0,0,...,0,0,1,0,1,0,0,1,0,0
79,2020-06-04,True ASDS,True,True,True,B1049,True,0,0,0,...,0,0,1,1,0,0,0,0,0,1
81,2020-06-30,True ASDS,True,False,True,B1060,True,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [1164]:
print("Number of categorical features before applied OneHotEncoder: %s"%(new_test_data.shape[1]))
print("Number of categorical features after applied OneHotEncoder: %s"%(test_output.shape[1]))

Number of categorical features before applied OneHotEncoder: 10
Number of categorical features after applied OneHotEncoder: 22


Compare the features those had applied OneHotDecoder in the training dataset and the testing dataset and ensure the testing dataset has the same features with the training dataset

In [1165]:
# Drop the features those are in the testing dataset but not in the training data set

train_column = output.iloc[:, 5:].columns
test_column = test_output.iloc[:, 7:].columns

drop_column = []
for i in test_column:
    if not (i in train_column):
        drop_column.append(i)
test_output.drop(columns = drop_column, axis = 1, inplace = True)
test_output.iloc[:, 7:].shape

(18, 13)

In [1166]:
# Add the features those are in the training dataset but not in the testing dataset, replace the missing values with 0

train_column = output.iloc[:, 5:].columns
test_column = test_output.iloc[:, 7:].columns

add_column = []
for i in train_column:
    if not (i in test_column):
        add_column.append(i)
for j in add_column:
    test_output["%s"%j] = 0
test_output.iloc[:, 7:].shape

(18, 17)

In [1167]:
# Compare is the training dataset and the testing dataset contain the same features after applied the OneHotEncoder
train_test = pd.concat([output.iloc[:, 5:], test_output.iloc[:, 7:]], axis = 0)
train_test.shape

(90, 17)

In [1168]:
print("Number of categorical features before applied OneHotEncoder: %s"%(new_test_data.shape[1]))
print("Number of categorical features after applied OneHotEncoder: %s"%(test_output.shape[1]))

Number of categorical features before applied OneHotEncoder: 10
Number of categorical features after applied OneHotEncoder: 24


## Feature Selection (Testing Dataset)

In [1169]:
# Drop the features those had dropped in the training dataset
# Drop the feature 'Date' and add new feature of 'Month' extracted from Date

test_date = test_output.loc[:, "Date"]
test_flight_month = []

for i in test_date:
    test_flight_month.append(i.split("-")[1])
test_flight_month_df = pd.DataFrame(test_flight_month)

test_output.rename(columns = {'Date':'Month'}, inplace = True)
test_output["Month"] = test_flight_month_df.values
test_output.head(5).sort_index()

Unnamed: 0,Month,Outcome,GridFins,Reused,Legs,Serial,Class,Orbit_GTO,Orbit_ISS,Orbit_LEO,...,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc,Orbit_ES-L1,Orbit_GEO,Orbit_SO,LandingPad_5e9e3032383ecb554034e7c9
36,8,True ASDS,True,False,True,B1038,True,0,0,0,...,0,0,1,0,0,1,0,0,0,0
48,4,True ASDS,True,False,True,B1045,True,0,0,0,...,1,0,0,0,1,0,0,0,0,0
77,4,True ASDS,True,True,True,B1051,True,0,0,0,...,0,1,0,0,1,0,0,0,0,0
79,6,True ASDS,True,True,True,B1049,True,0,0,0,...,1,0,0,0,0,1,0,0,0,0
81,6,True ASDS,True,False,True,B1060,True,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [1170]:
# Drop the feature 'Outcome' and 'Serial'

test_output.drop("Outcome", axis = 1, inplace = True)
test_output.drop("Serial", axis = 1, inplace = True)
test_output.head(5)

Unnamed: 0,Month,GridFins,Reused,Legs,Class,Orbit_GTO,Orbit_ISS,Orbit_LEO,Orbit_MEO,Orbit_PO,...,LaunchSite_CCAFS SLC 40,LaunchSite_KSC LC 39A,LaunchSite_VAFB SLC 4E,LandingPad_5e9e3032383ecb267a34e7c7,LandingPad_5e9e3032383ecb6bb234e7ca,LandingPad_5e9e3033383ecbb9e534e7cc,Orbit_ES-L1,Orbit_GEO,Orbit_SO,LandingPad_5e9e3032383ecb554034e7c9
48,4,True,False,True,True,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
77,4,True,True,True,True,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
81,6,True,False,True,True,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
79,6,True,True,True,True,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
36,8,True,False,True,True,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


## Finalize Training Dataset And Testing Dataset

In [1171]:
cat_training_dataset = output
cat_testing_dataset = test_output

print("Shape of training dataset: {}".format(cat_training_dataset.shape))
print("Shape of testing dataset: {}".format(cat_testing_dataset.shape))

Shape of training dataset: (72, 22)
Shape of testing dataset: (18, 22)
