In [2]:
import pandas as pd
from xgboost import XGBClassifier
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OrdinalEncoder, TargetEncoder, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score

In [84]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#https://stackoverflow.com/questions/59254662/sklearn-columntransformer-with-multilabelbinarizer

class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        X_df = pd.DataFrame(X)
        for i in range(X_df.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X_df.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame, y=None):
        X = pd.DataFrame(X)
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

In [4]:
data_raw = pd.read_csv('UCS-Satellite-Database.csv')
data = data_raw[data_raw.columns[2:26]].drop(['Power (watts)','Dry Mass (kg.)','COSPAR Number','Detailed Purpose', 'Date of Launch'], axis=1)
data['Country of Contractor'] = data['Country of Contractor'].str.strip()
data['Country of Operator/Owner'] = data['Country of Operator/Owner'].str.strip()
data['Operator/Owner'] = data['Operator/Owner'].str.strip()
data['Users'] = data['Users'].str.strip()
data.loc[data['Country of Contractor'] == 'Swizerland', 'Country of Contractor'] = 'Switzerland'
data.loc[data['Country of Operator/Owner'] == 'Sinapore', 'Country of Operator/Owner'] = 'Singapore'
data.loc[data['Country of Operator/Owner'] == 'United Kingdom', 'Country of Operator/Owner'] = 'United Kingdom'
data.loc[data['Country of Operator/Owner'] == 'Poland/UK', 'Country of Operator/Owner'] = 'Poland/United Kingdom'
data.loc[data['Operator/Owner'] == 'Spacex', 'Operator/Owner'] = 'SpaceX'
data['Country/Org of UN Registry'] = data['Country/Org of UN Registry'].replace(to_replace=np.nan, value='Unknown')
data['Type of Orbit'] = data['Type of Orbit'].replace(to_replace=np.nan, value='Unknown')
#data.loc[data['Date of Launch'] == '11/29/018','Date of Launch'] = '11/29/2018'
#data.loc[data['Date of Launch'] == '1/9//2023','Date of Launch'] = '1/9/2023'
#data['Date of Launch'] = pd.to_datetime(data['Date of Launch'])
data = data.drop(240)

In [5]:
df = data
df['Country of Operator/Owner'] = df['Country of Operator/Owner'].str.split('/')
df['Country of Contractor'] = df['Country of Contractor'].str.split('/')
df['Contractor'] = df['Contractor'].str.split('/')

value_counts = df['Purpose'].value_counts()
usable_purposes = value_counts[value_counts >= 8].index
df = df[df['Purpose'].isin(usable_purposes)]

In [26]:
num_cols = list(df.columns.to_series().groupby(df.dtypes).groups[np.dtype('float64')])
cat_cols = list(df.columns.to_series().groupby(df.dtypes).groups[np.dtype('object')])

num_mode = ['Longitude of GEO (degrees)', 'Inclination (degrees)']

cat_multi = ['Country of Operator/Owner','Contractor','Country of Contractor']

cat_one = [col for col in cat_cols if col not in cat_multi]
cat_one.remove('Purpose')

num_mean = [col for col in num_cols if col not in num_mode]

X = df.drop(columns=['Purpose'])
y = df[['Purpose']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

targ_enc = OneHotEncoder(sparse_output = False, dtype = np.int32)
targ_enc.fit(y)

y_train_enc = pd.DataFrame(targ_enc.transform(y_train), columns = targ_enc.categories_)
y_test_enc = pd.DataFrame(targ_enc.transform(y_test), columns = targ_enc.categories_)

In [47]:
train_dict = {}
test_dict = {}
for i, j in enumerate(y['Purpose'].unique()):
    train_dict[j] = y_train_enc[j]
    
for i, j in enumerate(y['Purpose'].unique()):
    test_dict[j] = y_test_enc[j]

In [82]:
mean_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])
mode_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent'))
])
multi_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'constant', fill_value = ['Unknown'])),
    ('MultiLabel_Binarizer', MultiLabelBinarizerWrapper())
])
cat_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'constant', fill_value = 'Unknown')),
    ('target_encoder', TargetEncoder())
])

col_trans = ColumnTransformer(transformers = [
    ('mean_pipe', mean_pipe, num_mean),
    ('mode_pipe', mode_pipe, num_mode),
    #('date_time', date_pipe, date_time),
    ('multi_pipe_1', multi_pipe, cat_multi),
    ('cat_pipe', cat_pipe, cat_one)
])

final_pipe = Pipeline(steps = [
    ('col_trans', col_trans),
    ('clf', XGBClassifier(random_state = 42))
])

search_space = {
    'clf__max_depth': Integer(2, 10),
    'clf__learning_rate': Real(0.001, 1.0, prior = 'log-uniform'),
    'clf__subsample': Real(0.01,1),
    'clf__colsample_bytree': Real(0.01, 1.0),
    'clf__colsample_bylevel': Real(0.01, 1.0),
    'clf__colsample_bynode': Real(0.01, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}

In [85]:
pipelines = {}

for i, j in enumerate(y['Purpose'].unique()):
    temp_pipe = BayesSearchCV(final_pipe, search_space, cv=3, random_state = 42)
    pipelines[j] = temp_pipe.fit(X_train, train_dict[j])
    print('Finished')

Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished


In [87]:
for i, j in enumerate(pipelines.keys()):
    print(pipelines[j].best_score_)

0.9677258895422668
0.9665616345940172
0.989020212196675
0.9953419006553377
0.9888538781980056
0.994842566489587
0.998502744884669
1.0
0.9988354959244433
0.998669161925774
0.9976709918488867
0.998502744884669


In [54]:
test_pipe.best_params_

OrderedDict([('clf__colsample_bylevel', 0.5),
             ('clf__colsample_bynode', 1.0),
             ('clf__colsample_bytree', 1.0),
             ('clf__gamma', 0.0),
             ('clf__learning_rate', 0.138263845331543),
             ('clf__max_depth', 10),
             ('clf__reg_alpha', 0.0),
             ('clf__reg_lambda', 8.036159241145596),
             ('clf__subsample', 0.5)])

In [90]:
predictions = {}
for i, j in enumerate(pipelines.keys()):
    predictions[j] = pipelines[j].predict(X_test)

In [93]:
for i, j in enumerate(pipelines.keys()):
    print(j)
    display((test_dict[j].squeeze() == predictions[j]).value_counts())

Earth Observation


(Earth Observation,)
True     1433
False      70
Name: count, dtype: int64

Technology Development


(Technology Development,)
True     1452
False      51
Name: count, dtype: int64

Communications


(Communications,)
True     1472
False      31
Name: count, dtype: int64

Earth Science


(Earth Science,)
True    1503
Name: count, dtype: int64

Space Science


(Space Science,)
True     1484
False      19
Name: count, dtype: int64

Technology Demonstration


(Technology Demonstration,)
True     1496
False       7
Name: count, dtype: int64

Unknown


(Unknown,)
True     1502
False       1
Name: count, dtype: int64

Navigation/Global Positioning


(Navigation/Global Positioning,)
True    1503
Name: count, dtype: int64

Earth Observation/Navigation


(Earth Observation/Navigation,)
True     1501
False       2
Name: count, dtype: int64

Space Observation


(Space Observation,)
True     1502
False       1
Name: count, dtype: int64

Surveillance


(Surveillance,)
True     1497
False       6
Name: count, dtype: int64

Navigation/Regional Positioning


(Navigation/Regional Positioning,)
True     1499
False       4
Name: count, dtype: int64

In [79]:
test = test_dict['Communications'].squeeze()

In [81]:
(test == answers).value_counts()

True     1473
False      30
Name: count, dtype: int64