# Iris Case Study: Encoding string target variables

## Load Libraries

In [2]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Load Data

In [4]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
data = read_csv(url, header = None)

## Split data into X/Y

In [5]:
X = data.values[:,0:4]
Y = data.values[:,4]

## Encode String Class (target variable) as integers

In [6]:
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)

## Split into train/test sets

In [7]:
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(X, label_encoded_y,
                                                    test_size = 0.33, 
                                                    random_state=seed)

## Fit Model on Training Set

In [8]:
model = XGBClassifier()
model.fit(X_train, Y_train)
print(model)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


## Make predictions and evaluate

In [12]:
#Predict
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
#Evaluate
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 92.00%


# Breast Cancer Case Study: categorical datasets
Some datasets only contain categorical data, for example the breast cancer dataset. This dataset describes the technical details of breast cancer biopsies and the prediction task is to predict whether or not the patient has a recurrence of cancer, or not.

## Load Libraries

In [35]:
import numpy
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Load Data

In [26]:
filename = 'breastcancer.csv'
data = read_csv(filename, header = None, delimiter=",", skiprows = 105, nrows = 286)
data.shape

(286, 10)

In [28]:
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [51]:
data.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
6    object
7    object
8    object
9    object
dtype: object

All 9 input variables are categorical and described in string format. The problem is a binary classification prediction problem and the output class values are also described in string format. They need to be encoded.

XGBoost may assume that encoded integer values for each input variable have an ordinal relationship. For example that left-up encoded as 0 and left-low encoded as 1 for the breast-quad variable have a meaningful relationship as integers. In this case, this assumption is untrue. Instead, we must map these integer values onto new binary variables, one new variable for each categorical value.

This can be modeled as 5 binary variables in a process called ONE HOT ENCODING using the OneHotEncoder class in sklearn. 

## Encode variables

In [58]:
from sklearn.preprocessing import OneHotEncoder
from numpy import column_stack

#Split input/target
X = data.values[:,1:9]
X = X.astype(str)
Y = data.values[:,9]

#Encode string input variables
columns = []
for i in range(0, X.shape[1]):
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:,i])
    feature = feature.reshape(X.shape[0], 1)
    onehot_encoder = OneHotEncoder(sparse=False)
    feature = onehot_encoder.fit_transform(feature)
    columns.append(feature)

#Collapse columns into array
encoded_x = column_stack(columns)
print("X shape: : ", encoded_x.shape)

#Encode string target variable
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
encoded_y = label_encoder.transform(Y)

X shape: :  (286, 37)


## Split into Train/Test sets; fit model, predict and evaluate

In [62]:
# Split Train/Test
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(encoded_x, encoded_y, 
                                                   test_size = test_size, random_state = seed)

# Fit Model
model = XGBClassifier()
model.fit(X_train, Y_train)
print(model)

# Predict and Evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
Accuracy: 71.58%


# Horse Colic case study: XGBoost Support for Missing Data
XGBoost can automatically learn how to best handle missing data. In fact, XGBoost was designed to work with sparse data, like the one hot encoded data from the previous section, and missing data is handled the same way that sparse or zero values are handled, by minimizing the loss function.

The Horse Colic dataset is a good example to demonstrate this capability as it contains a
large percentage of missing data, approximately 30%.

In [69]:
#Load Libraries
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

#Load Data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data"
data = read_csv(url, delim_whitespace = True, header = None)
data.shape

(300, 28)

In [70]:
print(data.head(5))

  0   1        2      3    4   5  6  7  8  9  ...     18    19 20    21 22 23  \
0  2   1   530101  38.50   66  28  3  3  ?  2 ...  45.00  8.40  ?     ?  2  2   
1  1   1   534817   39.2   88  20  ?  ?  4  1 ...     50    85  2     2  3  2   
2  2   1   530334  38.30   40  24  1  1  3  1 ...  33.00  6.70  ?     ?  1  2   
3  1   9  5290409  39.10  164  84  4  1  6  2 ...  48.00  7.20  3  5.30  2  1   
4  2   1   530255  37.30  104  35  ?  ?  6  2 ...  74.00  7.40  ?     ?  2  2   

      24 25 26 27  
0  11300  0  0  2  
1   2208  0  0  2  
2      0  0  0  1  
3   2208  0  0  1  
4   4300  0  0  2  

[5 rows x 28 columns]


In [71]:
data.dtypes

0     object
1      int64
2      int64
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
13    object
14    object
15    object
16    object
17    object
18    object
19    object
20    object
21    object
22    object
23     int64
24     int64
25     int64
26     int64
27     int64
dtype: object

In [73]:
#Split X/Y
X = data.values[:, 0:27]
Y = data.values[:, 27]

#Set Missing values to 0; convert columns to numeric
X[X == '?'] = 0
X = X.astype('float32')

#Encode the target class as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
encoded_y = label_encoder.transform(Y)

#Split Train/Test sets
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(X, encoded_y, test_size = test_size, random_state = seed)

#Fit model
model = XGBClassifier()
model.fit(X_train, Y_train)
print(model)

#Predict
pred_y = model.predict(X_test)
predictions = [round(value) for value in pred_y]

#Evaluate
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
Accuracy: 83.84%


## Marking missing values as NaN
Doing this actually demonstrates a lift in accuracy for the model

In [75]:
#Set Missing values to 0; convert columns to numeric
X[X == '?'] = numpy.nan
X = X.astype('float32')

#Encode the target class as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
encoded_y = label_encoder.transform(Y)

#Split Train/Test sets
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(X, encoded_y, test_size = test_size, random_state = seed)

#Fit model
model = XGBClassifier()
model.fit(X_train, Y_train)
print(model)

#Predict
pred_y = model.predict(X_test)
predictions = [round(value) for value in pred_y]

#Evaluate
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
Accuracy: 84.85%


  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


## Imputing missing data

In [79]:
from sklearn.preprocessing import Imputer

#Set Missing values to 0; convert columns to numeric
X[X == '?'] = numpy.nan
X = X.astype('float32')

#Impute Missing valuses 
imputer = Imputer()
imputed_x = imputer.fit_transform(X)

#Encode the target class as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
encoded_y = label_encoder.transform(Y)

#Split Train/Test sets
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(imputed_x, encoded_y, test_size = test_size, random_state = seed)

#Fit model
model = XGBClassifier()
model.fit(X_train, Y_train)
print(model)

#Predict
pred_y = model.predict(X_test)
predictions = [round(value) for value in pred_y]

#Evaluate
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
Accuracy: 86.87%


