In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide plotting defaults: seaborn dark-grid style and 12x8 figures
sns.set_style(style='darkgrid')
mpl.rcParams.update({'figure.figsize': [12, 8]})

### Introducing XGBoost
- Optimized gradient-boosting machine-learning library
- Originally written in C++
- Has APIs in several languages:
    - Python, R, Scala, Julia, Java

#### What makes XGBoost so popular?
- Speed and performance
- Core algorithm is parallelizable
- Consistently outperforms single-algorithm methods
- State of the art performance in many ML tasks

#### XGBoost: a quick example
```python
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

class_data = pd.read_csv('classification.csv')
X, y = class_data.iloc[:,:-1], class_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                   test_size=0.2,
                                   random_state=123)

xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                          n_estimators=10,
                          seed=123)

xg_cl.fit(X_train, y_train)

preds = xg_cl.predict(X_test)
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]

print('accuracy: %f' % (accuracy))
```

In [3]:
def encode_objects(df, column):
    """
    Simple label encoding for a dataframe column, applied in place.

    A new column named ``<column>_enc`` holding the label-encoded values is
    added to ``df``, and the original ``column`` is then dropped from ``df``.

    Parameters:
    df : pandas DataFrame
        Frame to encode; it is modified in place.
    column : column label
        Name of the column to be encoded.

    Returns:
    df[<column>_enc] : pandas Series
        The newly added column with the encoded values.

    """
    from sklearn.preprocessing import LabelEncoder

    encoded_name = '{}_enc'.format(column)
    df[encoded_name] = LabelEncoder().fit_transform(df[column])
    # Existing callers ignore the return value and rely on `df` being mutated,
    # so the source column is still removed in place.
    df.drop(column, axis=1, inplace=True)
    return df[encoded_name]

In [14]:
# Load the telco churn data and label-encode every object-typed column in place
churn_data = pd.read_csv('data/telco_churn.csv')
churn_data_obj_columns = churn_data.select_dtypes(include=['object'])
for col in churn_data_obj_columns.columns:
    encode_objects(churn_data, col)

# Discard the encoded customer ID, then separate the features from the target
churn_data = churn_data.drop(columns='customerID_enc')
X = churn_data.drop(columns=["Churn_enc"])
y = churn_data["Churn_enc"].values

In [16]:
# Import xgboost
import xgboost as xgb

# Create arrays for the features and the target: X, y
# (assumes the target, Churn_enc, is the last column of churn_data -- TODO confirm)
X, y = churn_data.iloc[:,:-1], churn_data.iloc[:,-1]

# Create the training and test sets.
# This split must run in this cell: without it X_train / X_test / y_train /
# y_test are undefined on a fresh kernel and the cell fails on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the accuracy (fraction of test labels predicted correctly): accuracy
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))

accuracy: 0.784244


### Decision trees as base learners
- Base learner - Individual learning algorithm in an ensemble algorithm
- Composed of a series of binary questions
- Predictions happen at the leaves of the tree
- Constructed iteratively (one decision at a time)
    - Until a stopping criterion is met

### CART: Classification and Regression Trees
- Each leaf always contains a real-valued score
- Can later be converted into categories 


In [18]:
from sklearn.datasets import load_breast_cancer

# Load the dataset once and unpack the feature matrix and target vector together
X, y = load_breast_cancer(return_X_y=True)

In [22]:
# Import the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the classifier: dt_clf_4
# random_state is fixed so the fitted tree (and the accuracy below) is
# reproducible across runs; the recorded accuracy may change after this fix.
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training set
dt_clf_4.fit(X_train, y_train)

# Predict the labels of the test set: y_pred_4
y_pred_4 = dt_clf_4.predict(X_test)

# Compute the accuracy of the predictions: accuracy
accuracy = float(np.sum(y_pred_4==y_test))/y_test.shape[0]
print("accuracy:", accuracy)

accuracy: 0.9736842105263158


### Boosting overview
- Not a specific machine learning algorithm 
- Concept that can be applied to a set of machine learning models
    - "Meta-algorithm"
- Ensemble meta-algorithm used to convert many weak learners into a strong learner

### Weak learners and strong learners
- Weak learner: ML algorithm that is slightly better than chance
    - Example: Decision tree whose predictions are slightly better than 50%
- Boosting converts a collection of weak learners into a strong learner
- Strong learner: Any algorithm that can be tuned to achieve good performance

### How boosting is accomplished
- Iteratively learning a set of weak models on subsets of the data
- Weighing each weak prediction according to each weak learner's performance
- Combine the weighted predictions to obtain a single weighted prediction

### Model evaluation through cross-validation
- Cross validation: Robust method for estimating the performance of a model on unseen data 
- Generates many non-overlapping train/test splits on training data
- Reports the average test set performance across all data splits

```python
import xgboost as xgb
import pandas as pd

churn_data = pd.read_csv('classification_data.csv')

churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
                            label=churn_data.month_5_still_here)

params = {'objective':'binary:logistic', 'max_depth':4}

cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
                    num_boost_round=10, metrics = 'error', as_pandas=True)

print("Accuracy: %f" %((1-cv_results['test-error-mean']).iloc[-1]))
```

In [23]:
# Reload and re-encode the churn data: X and y were reassigned to the
# breast-cancer arrays in an earlier cell, so they are rebuilt here.
churn_data = pd.read_csv('data/telco_churn.csv')
churn_data_obj_columns = churn_data.select_dtypes(include=['object'])
for col in churn_data_obj_columns:
    encode_objects(churn_data, col)
churn_data = churn_data.drop(labels = 'customerID_enc', axis=1)
y = churn_data["Churn_enc"].values
X = churn_data.drop(labels = ["Churn_enc"],axis = 1)

# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
# Churn is a binary classification target, so use the binary classification
# objective, consistent with the XGBClassifier cell earlier in the notebook.
params = {"objective":"binary:logistic", "max_depth":3}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, 
                  nfold=3, num_boost_round=5, 
                  metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy (1 - mean test error after the final boosting round)
print(((1-cv_results["test-error-mean"]).iloc[-1]))

   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.217876         0.003688         0.220787        0.010877
1          0.202968         0.005986         0.205311        0.008783
2          0.200838         0.004909         0.204033        0.006955
3          0.199205         0.002299         0.206730        0.006264
4          0.197430         0.001693         0.204743        0.007826
0.7952573333333334


In [24]:
# Run the same 3-fold, 5-round cross-validation, scored with AUC: cv_results
cv_results = xgb.cv(
    params=params,
    dtrain=churn_dmatrix,
    num_boost_round=5,
    nfold=3,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

# Show the per-round results table
print(cv_results)

# Show the mean test AUC after the final boosting round
print(cv_results["test-auc-mean"].iat[-1])

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.812836       0.003338       0.807141      0.005519
1        0.834186       0.004590       0.826842      0.006371
2        0.840441       0.002271       0.830180      0.006882
3        0.844604       0.001277       0.833729      0.007002
4        0.846892       0.000626       0.834883      0.007649
0.8348826666666667


### When should I use XGBoost?
- You have a large number of training examples
    - More than 1000 training samples and fewer than 100 features
    - The number of features < number of training samples
- You have a mixture of categorical and numeric features
    - Or just numeric features
### When to NOT use XGBoost
- Image recognition
- Computer vision
- Natural language processing and understanding problems