# Prediction using XGBoost

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

## Telco churn data

### Data loading and pre-processing

In [None]:
df = pd.read_csv('Telco_customer_churn.csv', delimiter=';')

In [None]:
df.head()

Drop unnecessary or unique columns

In [None]:
# Remove the columns the notebook treats as unnecessary or unique per row.
unused_columns = [
    'Count', 'Country', 'State', 'CustomerID', 'Lat Long',
    'Churn Label', 'Churn Score', 'CLTV', 'Churn Reason',
]
df = df.drop(columns=unused_columns)

Remove whitespace in city names and column names

In [None]:
# Assign the result back instead of calling replace(inplace=True) on the
# column selection: the in-place form mutates an intermediate object, which
# pandas warns about and which does not update df under copy-on-write.
df['City'] = df['City'].replace(' ', '_', regex=True)

In [None]:
# Column labels get the same treatment: every space becomes an underscore.
df = df.rename(columns=lambda label: label.replace(' ', '_'))

Look at the datatypes

In [None]:
df.dtypes

Set empty strings / spaces to 0

In [None]:
# Blank charges are written as the string '0', not the integer 0: the next
# cell runs .str.replace on this column, and the .str accessor turns
# non-string entries into NaN, which would silently undo the zero fill.
df.loc[df['Total_Charges'] == ' ', 'Total_Charges'] = '0'

In [None]:
# Swap ',' for '.' so the text parses as numbers (presumably decimal commas
# in the CSV — confirm against the raw file), then convert the column.
total_charges_text = df['Total_Charges'].str.replace(',', '.')
df['Total_Charges'] = pd.to_numeric(total_charges_text)

In [None]:
# Same ','-to-'.' conversion for the remaining numeric text columns.
for numeric_column in ('Latitude', 'Longitude', 'Monthly_Charges'):
    df[numeric_column] = pd.to_numeric(df[numeric_column].str.replace(',', '.'))

Replace spaces in whole dataframe with _

In [None]:
# Replace every space inside string values of the frame with an underscore.
df = df.replace(to_replace=' ', value='_', regex=True)

In [None]:
df

### Split data into dependent and independent variables

In [None]:
# Target: the churn flag. Features: every other column. Both are copies so
# later edits do not touch df.
y = df['Churn_Value'].copy()
X = df.drop(columns=['Churn_Value']).copy()

### One-hot encoding

In [None]:
X.dtypes

All object columns need to be inspected and categorical data has to be encoded

In [None]:
# Columns that are expanded into one indicator column per distinct value.
categorical_columns = [
    'City', 'Gender', 'Partner', 'Dependents', 'Phone_Service',
    'Multiple_Lines', 'Internet_Service', 'Online_Security',
    'Online_Backup', 'Device_Protection', 'Tech_Support', 'Streaming_TV',
    'Streaming_Movies', 'Contract', 'Paperless_Billing', 'Payment_Method',
    'Senior_Citizen',
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)

In [None]:
X_encoded.head()

y should only contain 1s and 0s

In [None]:
y.unique()

### Build preliminary XGBoost Model

In [None]:
sum(y)/len(y)

--> percentage of people that left the company

--> we need to use stratification to ensure that the same percentage is present in both the train and test set

In [None]:
# Stratify on y so the train and test parts keep the same share of churners;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=42,
)

Verify that stratify worked:

In [None]:
# Share of positive labels in each split; the two numbers should be close.
train_churn_rate = sum(y_train) / len(y_train)
test_churn_rate = sum(y_test) / len(y_test)
print(train_churn_rate, test_churn_rate)

In [None]:
# missing=np.nan (XGBoost's default marker): the previous missing=1 made
# XGBoost treat every feature value equal to 1 as missing, which includes the
# active one-hot indicators and genuine numeric values of 1.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', missing=np.nan, seed=42, use_label_encoder=False)

In [None]:
# Fit with early stopping: training stops once 'aucpr' on the eval set has not
# improved for 10 consecutive rounds.
# NOTE(review): the eval set is the test split, so the stopping point is chosen
# on the same data that is later used for evaluation — consider a separate
# validation split.
clf_xgb.fit(X_train, y_train, verbose=True, early_stopping_rounds=10, eval_metric='aucpr', eval_set=[(X_test, y_test)])

In [None]:
# Confusion matrix of the fitted classifier on the test split, with integer counts.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# scikit-learn version this notebook is pinned to.
plot_confusion_matrix(clf_xgb, X_test, y_test, values_format='d', display_labels=['Did not leave', 'Left'])

--> Not good because dataset is imbalanced!

Use the `scale_pos_weight` parameter to add a penalty for incorrectly labeling the positive class (the customers who left).

### Optimization using Cross Validation and GridSearch

In [None]:
#TODO: later

### Draw first tree

In [None]:
# Single-tree model, built only so that the first tree can be drawn.
# A classifier is used because the objective is binary:logistic and the target
# is the 0/1 churn flag; use_label_encoder is a classifier option and is set
# to False as in the main model. missing=np.nan is XGBoost's default marker
# (None is not a valid marker value in every XGBoost release).
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', missing=np.nan, seed=42, use_label_encoder=False, n_estimators=1)
clf_xgb.fit(X_train, y_train)

In [None]:
xgb.to_graphviz(clf_xgb, num_trees=0, size='10,10')

## Wikipedia data

In [None]:
df_pop = pd.read_csv('../../data/popular_data.csv')

In [None]:
# Keep only the four columns used in the rest of this section.
wanted_columns = ['infobox_key', 'property_name', 'template', 'days_diff']
df_pop = df_pop[wanted_columns]

In [None]:
df_pop

In [None]:
# Number of non-null property_name entries per infobox_key, as a two-column
# frame with the columns 'infobox_key' and 'count'.
rows_per_key = df_pop.groupby('infobox_key')['property_name'].count()
df_pop_many = rows_per_key.reset_index().rename(columns={'property_name': 'count'})

In [None]:
# Keys that have more than 10 counted rows.
has_many_rows = df_pop_many['count'] > 10
pop_keys = df_pop_many.loc[has_many_rows, 'infobox_key'].tolist()

In [None]:
# Keep only the rows of the selected keys. The explicit copy makes df_pop an
# independent frame: the following cells add and overwrite columns on it, which
# on a filtered slice triggers chained-assignment warnings and unreliable writes.
df_pop = df_pop[df_pop['infobox_key'].isin(pop_keys)].copy()

In [None]:
# Within each (infobox_key, property_name) group, take the days_diff of the
# following row (shift(-1)); the last row of every group gets NaN.
df_pop.loc[:,'time_til_next_change'] = df_pop.groupby(['infobox_key', 'property_name'])['days_diff'].shift(-1)

In [None]:
# Missing values in time_til_next_change are replaced by 0.
df_pop.loc[:,'time_til_next_change'] = df_pop.loc[:,'time_til_next_change'].fillna(0)

In [None]:
# Parse every value as a Timedelta and express it in days (86400 s per day).
# NOTE(review): pd.Timedelta reads a bare number as nanoseconds — this assumes
# days_diff holds timedelta strings/objects rather than plain day counts; confirm.
df_pop.loc[:, 'time_til_next_change'] = df_pop.loc[:, 'time_til_next_change'].apply(
    lambda delta: pd.Timedelta(delta).total_seconds() / 86400
)

In [None]:
# Round each day value to the nearest whole number.
df_pop.loc[:, 'time_til_next_change'] = df_pop.loc[:, 'time_til_next_change'].apply(round)

In [None]:
# Replace every space inside string values of the frame with an underscore.
df_pop = df_pop.replace(to_replace=' ', value='_', regex=True)

In [None]:
# Strip the dashes from the keys. The result is assigned back instead of using
# replace(inplace=True) on the column selection, which mutates an intermediate
# object and does not update df_pop under copy-on-write.
df_pop['infobox_key'] = df_pop['infobox_key'].replace('-', '', regex=True)

In [None]:
# Convert the dash-free keys to numbers.
# NOTE(review): pd.to_numeric raises on values it cannot parse — this assumes
# the keys consist of digits once the dashes are gone; confirm against the data.
df_pop['infobox_key'] = pd.to_numeric(df_pop['infobox_key'])

In [None]:
# days_diff has been turned into time_til_next_change and is no longer needed.
df_pop = df_pop.drop(columns=['days_diff'])

In [None]:
df_pop.dtypes

In [None]:
df_pop.groupby('infobox_key')['property_name'].count()

In [None]:
def __groupby_slice( _grp, start=0, stop=None, step=1):
    '''
    Apply the positional slice start:stop:step to every group of a GroupBy object.

    Parameters
    ----------
    _grp : pandas GroupBy object whose groups are sliced.
    start, stop, step : slice bounds handed to ``.iloc`` inside each group;
        negative values count from the end of the group.

    Returns
    -------
    The selected rows of all groups stacked in group iteration order, with the
    grouping column(s) kept and a fresh 0..n-1 index.
    '''
    # Iterate the groups instead of calling GroupBy.apply: apply special-cases
    # some results, and recent pandas releases warn about / stop passing the
    # grouping columns to the applied function, which would drop the key
    # column from the output.
    pieces = [group.iloc[start:stop:step] for _, group in _grp]
    if not pieces:
        # pd.concat rejects an empty list; return an empty slice of the source.
        return _grp.obj.iloc[0:0].reset_index(drop=True)
    return pd.concat(pieces).reset_index(drop=True)

In [None]:
__groupby_slice(df_pop.groupby('infobox_key'), -2)

In [None]:
# Training rows: for every infobox_key, all rows except the last two.
train_set = __groupby_slice(df_pop.groupby('infobox_key'), 0, -2)

In [None]:
train_set.shape

In [None]:
# Test rows: for every infobox_key, the last two rows.
test_set = __groupby_slice(df_pop.groupby('infobox_key'), -2)

In [None]:
test_set.shape

In [None]:
# Separate the regression target from the features in both splits.
target_column = 'time_til_next_change'
y_pop_train = train_set[target_column].copy()
X_pop_train = train_set.drop(columns=[target_column]).copy()
y_pop_test = test_set[target_column].copy()
X_pop_test = test_set.drop(columns=[target_column]).copy()

In [None]:
# One-hot encode the three categorical columns in both splits.
dummy_columns = ['property_name', 'template', 'infobox_key']
X_pop_train = pd.get_dummies(X_pop_train, columns=dummy_columns)
X_pop_test = pd.get_dummies(X_pop_test, columns=dummy_columns)
# The two splits are encoded separately, so a category present in only one of
# them yields different column sets. Align the test frame to the training
# columns: indicators missing from the test split become all-zero columns, and
# indicators the model never saw during training are dropped.
X_pop_test = X_pop_test.reindex(columns=X_pop_train.columns, fill_value=0)

In [None]:
X_pop_train

### Build model

In [None]:
#X_pop_train, X_pop_test, y_pop_train, y_pop_test = train_test_split(X_pop_encoded, y_pop, random_state=42)

In [None]:
# Squared-error regressor with 100 boosting rounds and a fixed seed.
# NOTE(review): missing=0 tells XGBoost to treat feature values equal to 0 as
# missing; every feature here is a one-hot indicator, so all "absent" zeros are
# handled as missing values — confirm this is intended.
clf_xgb_pop = xgb.XGBRegressor(objective='reg:squarederror', missing=0, seed=42, n_estimators=100)

In [None]:
# Fit with early stopping: training stops once the eval-set metric has not
# improved for 10 consecutive rounds.
# NOTE(review): the eval set is the test split, so the stopping point is chosen
# on the data that is also used to judge the predictions — consider a separate
# validation split.
clf_xgb_pop.fit(X_pop_train, y_pop_train, verbose=True, early_stopping_rounds=10, eval_set=[(X_pop_test, y_pop_test)])

In [None]:
xgb.to_graphviz(clf_xgb_pop, num_trees=0, size='10,10')

In [None]:
clf_xgb_pop.predict(X_pop_test.iloc[:2])

In [None]:
y_pop_test.iloc[:2]