# Feature Selection

Feature selection based on the feature importances reported by the XGBoost model.

Re-train the model using only the selected features and compare the results.


In [None]:
# Column indices of the features kept for the re-trained model.
select_col = [38, 39, 43, 44]

# The last column of the array is used as the label; all other columns are
# features.
dataset = np.load('data/satellite_state.npy')
X = dataset[:, :-1]
y = dataset[:, -1]  # label *column* (``dataset[-1]`` would be the last row)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Restrict both splits to the selected feature columns.
x_train = x_train[:, select_col]
x_test = x_test[:, select_col]

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Data sets whose metric is reported after every boosting round.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 200
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

# Time only the training call so that plotting (and a possibly blocking
# plt.show()) does not inflate the reported training time.
since = time.time()
xgb_model = xgb.train(param, dtrain, num_boost_round=num_round,
                      evals=evallist)
time_elapsed = time.time() - since
print('Training complete in {:.2f}s'.format(time_elapsed))

xgb.plot_tree(xgb_model, num_trees=0)
xgb.plot_importance(xgb_model)
plt.show()

# binary:logistic yields probabilities; threshold at 0.5 to get class labels.
predictions = xgb_model.predict(dtest) > 0.5

cm_perf = confusion_matrix(y_test, predictions)
print("Confusion matrix: \n", cm_perf)
perf = perf_parse(cm_perf)

## Different models

The selected features are used to train several other classification methods:

- Logistic regression (LR)
- Support vector machine (SVM)
- Neural network (multilayer perceptron)

In [None]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from test_xgboost import perf_parse
    
# The clock starts before preprocessing, so the reported time covers both
# the scaling and the model fit.
since = time.time()

# Fit the standardiser on the training split only, then apply it to both
# splits.
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

lr_model = LogisticRegression(max_iter=20000).fit(x_train_s, y_train)

time_elapsed = time.time() - since
print('Training complete in {:.2f}s'.format(time_elapsed))

# NOTE(review): this confusion matrix is computed on the *training* split.
predictions = lr_model.predict(x_train_s)
cm_perf = confusion_matrix(y_train, predictions)
print("Confusion matrix: \n", cm_perf)
perf = perf_parse(cm_perf)

# Accuracy of the fitted model on each split.
print("LR train score: {0:.3f}".format(lr_model.score(x_train_s, y_train)))
print("LR test score: {0:.3f}".format(lr_model.score(x_test_s, y_test)))

In [None]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from test_xgboost import perf_parse


# The clock starts before preprocessing, so the reported time covers both
# the scaling and the model fit.
since = time.time()

# Fit the standardiser on the training split only, then apply it to both
# splits.
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

# Support vector classifier with the library's default settings.
svm_model = svm.SVC().fit(x_train_s, y_train)

time_elapsed = time.time() - since
print('Training complete in {:.2f}s'.format(time_elapsed))

# NOTE(review): this confusion matrix is computed on the *training* split.
predictions = svm_model.predict(x_train_s)
cm_perf = confusion_matrix(y_train, predictions)
print("Confusion matrix: \n", cm_perf)
perf = perf_parse(cm_perf)

# Accuracy of the fitted model on each split.
print("SVM train score: {0:.3f}".format(svm_model.score(x_train_s, y_train)))
print("SVM test score: {0:.3f}".format(svm_model.score(x_test_s, y_test)))

In [None]:

import numpy as np
import pandas as pd
import copy
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler

# Accuracy
def get_accuracy(y_pred, y_target):
    """Return the percentage (0-100) of entries of y_pred equal to y_target.

    Both arguments are tensors of class labels; the comparison is
    element-wise and the result is a plain Python float.
    """
    matches = y_pred.eq(y_target).sum().item()
    return matches / len(y_pred) * 100

# Multilayer Perceptron
class MLP(nn.Module):
    """Fully connected network with two ReLU hidden layers.

    Args:
        input_size: number of input features.
        hidden_size: sequence whose first two entries give the widths of
            the first and second hidden layers.
        num_classes: number of output units (one logit per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        first_width = hidden_size[0]
        second_width = hidden_size[1]
        self.fc1 = nn.Linear(input_size, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, num_classes)

    def forward(self, x_in, apply_softmax=False):
        """Return class logits, or class probabilities if apply_softmax."""
        hidden = F.relu(self.fc1(x_in))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        if not apply_softmax:
            return logits
        # Normalise across the class dimension.
        return F.softmax(logits, dim=1)


# Standardise the features with statistics fitted on the training split only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# NOTE: from here on x_train / y_train / x_test / y_test are torch tensors,
# no longer NumPy arrays.
x_train = torch.from_numpy(x_train).float()
y_train = torch.from_numpy(y_train).long()
x_test = torch.from_numpy(x_test).float()
y_test = torch.from_numpy(y_test).long()
score = 100  # placeholder; overwritten with the F1 score at the end

# Device configuration: prefer the second GPU as before, but fall back to the
# first GPU (and then to the CPU) so the cell also runs on hosts with fewer
# than two CUDA devices.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Model configuration
[m_train, n_train] = x_train.shape
input_size = n_train
hidden_size = [60, 30]
num_classes = 2
# Train configuration
num_epochs = 1000
learning_rate = 0.01
dropout_p = 0.5  # NOTE(review): not referenced anywhere in this cell
step_size = 500

model = MLP(input_size=input_size,
            hidden_size=hidden_size,
            num_classes=num_classes)

model = model.to(device)
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)

# Optimization: Adam with the learning rate halved every `step_size` epochs.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.5)

since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
# Training (full batch: every epoch is one pass over the whole training set)
for t in range(num_epochs):
    # Forward pass
    y_pred = model(x_train)

    # Accuracy
    _, predictions = y_pred.max(dim=1)
    accuracy = get_accuracy(y_pred=predictions.long(), y_target=y_train)

    # Loss
    loss = loss_fn(y_pred, y_train)

    if t % 10 == 0:
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            _, pred_test = model(x_test, apply_softmax=True).max(dim=1)
        test_acc = get_accuracy(y_pred=pred_test, y_target=y_test)
        # Keep a copy of the weights with the best test accuracy so far.
        # NOTE(review): the checkpoint is selected on the test split, so the
        # test accuracy reported afterwards is optimistically biased.
        if test_acc > best_acc:
            best_acc = test_acc
            best_model_wts = copy.deepcopy(model.state_dict())
        if t % 500 == 0:
            print("epoch: {0:4d} | loss: {1:.4f} | Train accuracy: {2:.1f}% | Test accuracy: {3:.1f}%"
                  .format(t, loss.item(), accuracy, test_acc))

    # Zero all gradients
    optimizer.zero_grad()
    # Backward pass
    loss.backward()
    # Update weights
    optimizer.step()

    scheduler.step()

time_elapsed = time.time() - since
print('Training complete in {:.2f}s'.format(time_elapsed))
# Restore the best checkpoint seen during training.
model.load_state_dict(best_model_wts)

# Predictions (inference only, so gradients are disabled)
with torch.no_grad():
    _, pred_train = model(x_train, apply_softmax=True).max(dim=1)
    _, pred_test = model(x_test, apply_softmax=True).max(dim=1)

# Train and test accuracies
train_acc = get_accuracy(y_pred=pred_train, y_target=y_train)
test_acc = get_accuracy(y_pred=pred_test, y_target=y_test)
print("train acc: {0:.1f}%, test acc: {1:.1f}%".format(
    train_acc, test_acc))

y_true = y_test.cpu().numpy()
y_pred = pred_test.cpu().numpy()

# Metrics derived from the test-set confusion matrix, treating class 1 as the
# positive class (rows are true labels, columns are predicted labels).
cm_perf = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred)
print(cm_perf)
acc = (cm_perf[1, 1] + cm_perf[0, 0]) / np.sum(cm_perf)
recall = cm_perf[1, 1] / (cm_perf[1, 0] + cm_perf[1, 1])
precision = cm_perf[1, 1] / (cm_perf[0, 1] + cm_perf[1, 1])
# F1 score: harmonic mean of precision and recall.
score = 2 / ((1 / recall) + (1 / precision))
model_perf = torch.tensor([acc, precision, recall, score])