In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
def fill(data, col, imp):
    """Impute the missing values of a single column with the given imputer.

    The imputer is fitted on the column and then used to transform it, so the
    fill value is derived from that same column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : hashable
        Label of the column to impute.
    imp : object
        Imputer exposing ``fit(X)`` and ``transform(X)`` for a one-column
        frame (e.g. ``sklearn.impute.SimpleImputer``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so calls can be written as
        ``df = fill(df, col, imp)``.
    """
    # Operate on the frame that was passed in. The previous version read and
    # wrote the notebook-level ``df`` and silently ignored ``data``.
    X = data[[col]]  # imputers expect 2-D input, hence the one-column frame
    imp.fit(X)
    data[col] = imp.transform(X)
    return data

In [3]:
# Load the raw table and keep columns 2..25, minus the ones not used below.
data_raw = pd.read_csv('UCS-Satellite-Database.csv')
kept_columns = data_raw.columns[2:26]
unused_columns = ['Power (watts)', 'Dry Mass (kg.)', 'COSPAR Number', 'Detailed Purpose']
data = data_raw[kept_columns].drop(columns=unused_columns)

# Trim surrounding whitespace so identical names compare equal.
for column in ('Country of Contractor', 'Country of Operator/Owner', 'Operator/Owner'):
    data[column] = data[column].str.strip()

# (column, value as found, corrected value) -- applied as exact matches.
corrections = [
    ('Country of Contractor', 'Swizerland', 'Switzerland'),
    ('Country of Operator/Owner', 'Sinapore', 'Singapore'),
    # NOTE(review): both strings are identical as written, so this entry
    # changes nothing -- confirm the spelling it was meant to fix.
    ('Country of Operator/Owner', 'United Kingdom', 'United Kingdom'),
    ('Country of Operator/Owner', 'Poland/UK', 'Poland/United Kingdom'),
    ('Operator/Owner', 'Spacex', 'SpaceX'),
    # Malformed launch dates.
    ('Date of Launch', '11/29/018', '11/29/2018'),
    ('Date of Launch', '1/9//2023', '1/9/2023'),
]
for column, found, corrected in corrections:
    data.loc[data[column] == found, column] = corrected

# Missing categorical values become an explicit 'Unknown' category.
for column in ('Country/Org of UN Registry', 'Type of Orbit'):
    data[column] = data[column].replace(to_replace=np.nan, value='Unknown')

# Remove the row labelled 240 (reason not recorded here -- TODO document).
data = data.drop(index=240)

In [4]:
# Work on an independent copy: the steps below overwrite columns, and with a
# plain alias (``df = data``) they would also rewrite ``data``, so re-running
# this cell would try to split columns that already hold lists.
df = data.copy()

# Operators/owners that appear 3 times or fewer are pooled into 'Other'.
counts = df['Operator/Owner'].value_counts()
unique = counts[counts <= 3].index
df['Operator/Owner'] = df['Operator/Owner'].mask(df['Operator/Owner'].isin(unique), 'Other')

# '/'-separated entries become lists of individual names, which is the
# per-row iterable form consumed by the multi-label encoding step.
for column in ('Country of Operator/Owner', 'Country of Contractor', 'Contractor'):
    df[column] = df[column].str.split('/')

In [5]:
# One binarizer per list-valued column; each yields one 0/1 indicator column
# per distinct name, aligned to df's index.
mlb1, mlb2, mlb3 = (MultiLabelBinarizer() for _ in range(3))

# Operator/owner countries -> "<country> Operator/Owner" indicator columns.
operator_country_flags = mlb1.fit_transform(df['Country of Operator/Owner'])
country_operator_owner = pd.DataFrame(
    operator_country_flags,
    index=df.index,
    columns=mlb1.classes_ + ' Operator/Owner',
)

# Contractor countries -> "<country> Contractor" indicator columns.
contractor_country_flags = mlb2.fit_transform(df['Country of Contractor'])
contractor_country = pd.DataFrame(
    contractor_country_flags,
    index=df.index,
    columns=mlb2.classes_ + ' Contractor',
)

# Contractors keep their own names as column labels (no suffix).
contractor_flags = mlb3.fit_transform(df['Contractor'])
contractor = pd.DataFrame(contractor_flags, index=df.index, columns=mlb3.classes_)

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Names of all float64 columns, in frame order.
columns_by_dtype = df.columns.to_series().groupby(df.dtypes).groups
cols = np.array(columns_by_dtype[np.dtype('float64')])

# The float columns at positions 0 and 4 are filled with their most frequent
# value; every other float column is filled with its mean.
# NOTE(review): these positions depend on the column order of the CSV --
# confirm which columns they select.
mode_positions = [0, 4]
modes = cols[mode_positions]
means = np.delete(cols, mode_positions)

# Mean-imputed columns first, then mode-imputed ones.
for imputer, targets in ((imp_mean, means), (imp_mode, modes)):
    for column in targets:
        df = fill(df, column, imputer)
    
# Columns to integer-encode, selected by position in the current frame.
# NOTE(review): positional selection depends on the column order -- confirm
# these positions still point at the intended columns if the input changes.
label_positions = [0, 2, 3, 4, 5, 6, 18, 19]
label = np.array(df.keys())[label_positions]
for column in label:
    # A fresh encoder per column; only the last one remains bound to `lab`.
    lab = LabelEncoder()
    df[column] = lab.fit_transform(df[column])
    
# Attach the indicator frames by index, in the same order as they were built.
for indicator_frame in (country_operator_owner, contractor_country, contractor):
    df = df.merge(indicator_frame, left_index=True, right_index=True)

# The list-valued source columns are now represented by the indicator columns.
df = df.drop(columns=['Country of Operator/Owner', 'Contractor', 'Country of Contractor'])

# Parse launch dates into datetime values.
launch_dates = pd.to_datetime(df['Date of Launch'])
df['Date of Launch'] = launch_dates

In [6]:
# Print a summary of the engineered frame (index range, column count,
# dtypes, memory usage) as a sanity check after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7559 entries, 0 to 7559
Columns: 765 entries, Country/Org of UN Registry to iQPS
dtypes: datetime64[ns](1), float64(8), int64(756)
memory usage: 44.2 MB
