In [1]:
import pandas as pd
from xgboost import XGBClassifier
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OrdinalEncoder, TargetEncoder, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score

In [2]:
#https://stackoverflow.com/questions/59254662/sklearn-columntransformer-with-multilabelbinarizer

class MultiLabelBinarizerWrapper(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        X_df = pd.DataFrame(X)
        for i in range(X_df.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X_df.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame, y=None):
        X = pd.DataFrame(X)
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

In [3]:
data_raw = pd.read_csv('UCS-Satellite-Database.csv')
data = data_raw[data_raw.columns[2:26]].drop(['Power (watts)','Dry Mass (kg.)','COSPAR Number','Detailed Purpose', 'Date of Launch'], axis=1)
data['Country of Contractor'] = data['Country of Contractor'].str.strip()
data['Country of Operator/Owner'] = data['Country of Operator/Owner'].str.strip()
data['Operator/Owner'] = data['Operator/Owner'].str.strip()
data['Users'] = data['Users'].str.strip()
data.loc[data['Country of Contractor'] == 'Swizerland', 'Country of Contractor'] = 'Switzerland'
data.loc[data['Country of Operator/Owner'] == 'Sinapore', 'Country of Operator/Owner'] = 'Singapore'
data.loc[data['Country of Operator/Owner'] == 'United Kingdom', 'Country of Operator/Owner'] = 'United Kingdom'
data.loc[data['Country of Operator/Owner'] == 'Poland/UK', 'Country of Operator/Owner'] = 'Poland/United Kingdom'
data.loc[data['Operator/Owner'] == 'Spacex', 'Operator/Owner'] = 'SpaceX'
data['Country/Org of UN Registry'] = data['Country/Org of UN Registry'].replace(to_replace=np.nan, value='Unknown')
data['Type of Orbit'] = data['Type of Orbit'].replace(to_replace=np.nan, value='Unknown')
#data.loc[data['Date of Launch'] == '11/29/018','Date of Launch'] = '11/29/2018'
#data.loc[data['Date of Launch'] == '1/9//2023','Date of Launch'] = '1/9/2023'
#data['Date of Launch'] = pd.to_datetime(data['Date of Launch'])
data = data.drop(240)

In [4]:
df = data
df['Country of Operator/Owner'] = df['Country of Operator/Owner'].str.split('/')
df['Country of Contractor'] = df['Country of Contractor'].str.split('/')
df['Contractor'] = df['Contractor'].str.split('/')

value_counts = df['Purpose'].value_counts()
usable_purposes = value_counts[value_counts >= 8].index
df = df[df['Purpose'].isin(usable_purposes)]

In [48]:
num_cols = list(df.columns.to_series().groupby(df.dtypes).groups[np.dtype('float64')])
cat_cols = list(df.columns.to_series().groupby(df.dtypes).groups[np.dtype('object')])

num_mode = ['Longitude of GEO (degrees)', 'Inclination (degrees)']

cat_multi = ['Country of Operator/Owner','Contractor','Country of Contractor']

#date_time = ['Date of Launch']

cat_one = [col for col in cat_cols if col not in cat_multi]
cat_one.remove('Purpose')

num_mean = [col for col in num_cols if col not in num_mode]

X = df.drop(columns=['Purpose'])
y = df[['Purpose']]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

mean_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])
mode_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent'))
])
multi_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'constant', fill_value = ['Unknown'])),
    ('MultiLabel_Binarizer', MultiLabelBinarizerWrapper())
])
cat_pipe = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'constant', fill_value = 'Unknown')),
    ('target_encoder', TargetEncoder())
])

col_trans = ColumnTransformer(transformers = [
    ('mean_pipe', mean_pipe, num_mean),
    ('mode_pipe', mode_pipe, num_mode),
    #('date_time', date_pipe, date_time),
    ('multi_pipe_1', multi_pipe, cat_multi),
    ('cat_pipe', cat_pipe, cat_one)
])

final_pipe = Pipeline(steps = [
    ('col_trans', col_trans),
    ('clf', XGBClassifier(random_state = 42))
])

search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior = 'log-uniform'),
    'clf__subsample': Real(0.5,1),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}

opt_pipe = BayesSearchCV(final_pipe, search_space, cv=3, n_iter=10)

In [49]:
lbl = LabelEncoder()
lbl.fit(y_train['Purpose'])
y_train_clean = pd.DataFrame(lbl.transform(y_train['Purpose']), columns = ['Purpose'])

In [50]:
opt_pipe.fit(X_train, y_train_clean)































In [55]:
answers = opt_pipe.predict(X_test)



In [56]:
test = lbl.transform(y_test['Purpose'])

In [57]:
answers = pd.Series(answers)

In [58]:
(test == answers).value_counts()

True     1389
False     114
Name: count, dtype: int64