# Classification with XGBoost 

In [11]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [9]:
from sklearn.model_selection import train_test_split

In [5]:
churn_data = pd.read_csv('churn_data.txt')

In [6]:
churn_data.head()

Unnamed: 0.1,Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_inc_price,inc_pct,weekday_pct,fancy_car_user,city_Carthag,city_Harko,phone_iPhone,first_month_cat_more_1_trip,first_month_cat_no_trips,month_5_still_here
0,0,3.67,5.0,4.7,1.1,15.4,46.2,True,0,1,1,1,0,1
1,1,8.26,5.0,5.0,1.0,0.0,50.0,False,1,0,0,0,1,0
2,2,0.77,5.0,4.3,1.0,0.0,100.0,False,1,0,1,1,0,0
3,3,2.36,4.9,4.6,1.14,20.0,80.0,True,0,1,1,1,0,1
4,4,3.13,4.9,4.4,1.19,11.8,82.4,False,0,0,0,1,0,0


In [7]:
churn_data.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:,:-1], churn_data.iloc[:,-1]

# Create the training and test sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=.2, random_state=123)

In [13]:
X_train.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_inc_price,inc_pct,weekday_pct,fancy_car_user,city_Carthag,city_Harko,phone_iPhone,first_month_cat_more_1_trip,first_month_cat_no_trips
2660,3.5,5.0,5.0,1.0,0.0,100.0,False,0,0,0,0,0
32815,4.51,4.6,5.0,1.0,0.0,75.0,False,0,0,0,1,0
35141,16.38,5.0,,1.0,0.0,100.0,True,0,0,1,0,1
19390,6.6,5.0,5.0,1.0,0.0,0.0,False,0,0,1,0,0
34846,5.62,5.0,,1.0,0.0,0.0,False,1,0,0,0,0


In [14]:
y_train.head()

2660     0
32815    0
35141    0
19390    0
34846    0
Name: month_5_still_here, dtype: int64

In [12]:
# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the accuracy: accuracy
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))

accuracy: 0.758200


## Decision tree

In [15]:
dx = pd.read_csv('decision_tree_x.csv')

In [16]:
dy = pd.read_csv('decision_tree_y.csv')

In [20]:
dx.drop(columns=['Unnamed: 0'], inplace=True)
dy.drop(columns=['Unnamed: 0'], inplace=True)

In [21]:
dx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [29]:
# Import the necessary modules
from sklearn.tree import DecisionTreeClassifier

# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(dx.values, dy.values, test_size=.2, random_state=123)

# Instantiate the classifier: dt_clf_4
dt_clf_4 = DecisionTreeClassifier(max_depth=4)

# Fit the classifier to the training set
dt_clf_4.fit(X_train, y_train)

# Predict the labels of the test set: y_pred_4
y_pred_4 = dt_clf_4.predict(X_test)

# Compute the accuracy of the predictions: accuracy
accuracy = float(np.sum(y_pred_4==y_test))/y_test.shape[0]
print("accuracy:", accuracy)

accuracy: 61.771929824561404


## Measuring accuracy

In [30]:
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:,:-1], churn_data.iloc[:,-1]

# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:logistic", "max_depth":3}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, 
                  nfold=3, num_boost_round=5, 
                  metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
print(((1-cv_results["test-error-mean"]).iloc[-1]))

   train-error-mean  train-error-std  test-error-mean  test-error-std
0           0.28232         0.002366          0.28378        0.001932
1           0.26951         0.001855          0.27190        0.001932
2           0.25605         0.003213          0.25798        0.003963
3           0.25090         0.001845          0.25434        0.003827
4           0.24654         0.001981          0.24852        0.000934
0.75148


## Measuring AUC

In [31]:
# Perform cross_validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, 
                  nfold=3, num_boost_round=5, 
                  metrics="auc", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the AUC
print((cv_results["test-auc-mean"]).iloc[-1])

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.768893       0.001544       0.767863      0.002820
1        0.790864       0.006758       0.789156      0.006847
2        0.815872       0.003899       0.814476      0.005997
3        0.822959       0.002018       0.821682      0.003912
4        0.827528       0.000769       0.826191      0.001938
0.8261913333333334
