# Description

The goal of this educational project is to predict customer churn for a telecom company. This problem is extremely important in practice, and real telecom companies use ML algorithms to solve it: if a company knows that a client is about to stop using its services, it can offer extra bonuses to keep that client.

**Contact me in telegram:** @lawrence_grig

**Name on leaderboard:** Лаврентий_Григорян

# Import of modules

In [None]:
# Necessary libraries (for data analysis)
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Plotly for interactive visualization
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as ff
init_notebook_mode(connected=True)

# Some libraries and objects and functions for ML
import xgboost
import lightgbm
from catboost import CatBoostClassifier, cv, Pool
from sklearn.preprocessing import (RobustScaler, StandardScaler, MinMaxScaler,
                                   OneHotEncoder, LabelEncoder, PolynomialFeatures)
from sklearn.model_selection import (train_test_split, GridSearchCV, RandomizedSearchCV,
                                     cross_validate, cross_val_score, cross_val_predict)
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier)
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix,
                             roc_auc_score, roc_curve)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA

# Some display options
from tqdm.notebook import tqdm
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = 8, 4
# %config InlineBackend.figure_format = 'svg'
%matplotlib inline

# Print the full path of every file found under /kaggle/input
import os
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Data overview and primary data analysis

Firstly, let's read the training and test datasets together with the sample submission, and make a working copy of the training set for convenience:

In [None]:
# Getting train and test samples.
# All three files sit in the same competition folder, so every path is built
# from one absolute root instead of mixing absolute and relative
# ('../input/...') paths, which resolve only from a directory directly
# under /kaggle.
DATA_DIR = '/kaggle/input/advanced-dls-fall-2020'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Sample submission
sub = pd.read_csv(f'{DATA_DIR}/submission.csv')

# Working copy of train; the raw frame stays untouched for later queries
df = train.copy(deep=True)

In [None]:
# Train, test and dataset sizes
df.shape, test.shape

Let's take a look at our data and get some basic info about it:

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Column groups used throughout the notebook

# Numeric features
num_cols = ['ClientPeriod', 'MonthlySpending', 'TotalSpent']

# Categorical features (kept as a list: entries are removed later on)
cat_cols = [
    'Sex', 'IsSeniorCitizen', 'HasPartner', 'HasChild',
    'HasPhoneService', 'HasMultiplePhoneNumbers',
    'HasInternetService', 'HasOnlineSecurityService',
    'HasOnlineBackup', 'HasDeviceProtection',
    'HasTechSupportAccess', 'HasOnlineTV',
    'HasMovieSubscription', 'HasContractPhone',
    'IsBillingPaperless', 'PaymentMethod',
]

# All features: numeric first, then categorical
feature_cols = [*num_cols, *cat_cols]

# Target variable
target_col = 'Churn'

In [None]:
# Visualizing missing values for every column of df with missingno.
# NOTE(review): presumably only real NaN cells are detected; whitespace-only
# strings would not appear as missing here - confirm.
msno.matrix(df, figsize=(14, 7))
plt.title('Missing values visualization', size=15);

At this step we can see that:

1. The dataset contains 20 columns: 19 predictors and the target named "**Churn**"

2. There are 7029 observations

3. From the first view, the dataset contains **no missing values** which is the positive news

4. Obviously, "**TotalSpent**" is not a categorical feature - its data type requires a change

5. "**IsSeniorCitizen**" is already binary, let's convert it to "category" for convenient data analysis

In [None]:
# IsSeniorCitizen
df['IsSeniorCitizen'] = df['IsSeniorCitizen'].map({1: 'Yes', 0: 'No'})

In [None]:
# Checking TotalSpent values
df['TotalSpent'].value_counts()

Since we don't yet know what these missing values (stored as single spaces) mean, let's substitute them with 0 for now:

In [None]:
# Replacing missing values and changing TotalSpent dtype.
# Series.replace swaps only cells that are exactly ' ' (the blank marker),
# whereas .str.replace would rewrite every space inside any value and would
# fail if the column were already numeric (e.g. when the cell is re-run).
df['TotalSpent'] = df['TotalSpent'].replace(' ', '0').astype('float64')

In [None]:
# # Imputing TotalSpent with KNN - not correct
# imputer = KNNImputer()
# df['TotalSpent'] = imputer.fit_transform(pd.get_dummies(df.drop(['sample'], axis=1)))[:, 2]

In [None]:
print(f'Number of NaNs: {df.isna().sum().sum()}')

* So all the NaNs are imputed

In [None]:
# Cast all categorical features to the pandas "category" dtype in one call
df = df.astype({col: 'category' for col in cat_cols})

Let's check the data for duplicated rows/columns:

In [None]:
# Boolean flags for repeated rows, and for repeated columns
# (columns are checked on the transposed frame)
dup_rows = df.duplicated()
dup_cols = df.T.duplicated()
n_dup_cols, n_dup_rows = dup_cols.sum(), dup_rows.sum()
print(f'Number of duplicated columns: {n_dup_cols}')
print(f'Number of duplicated rows: {n_dup_rows}')

In [None]:
# Duplicated rows
df[df.duplicated()]

* There are 14 duplicates

* We will decide later whether to delete them or not - this can be just the nature of the data

In [None]:
# # Dropping duplicates - reduced the model quality
# df.drop_duplicates(inplace=True)

**Descriptive statistics:**

In [None]:
# Numeric features
df.describe().iloc[:, :-1]

In [None]:
# Categorical features
df.describe(include=['category'])

> No super important conclusions for descriptive statistics.

Let's check our target variable and see whether we are dealing with a class imbalance problem:

In [None]:
# Class counts of the target, computed once and reused for both axes
churn_counts = df['Churn'].value_counts()

trace_0 = go.Bar(x=churn_counts.index,
                 y=churn_counts.values,
                 marker_color=['green', 'crimson'])
data = [trace_0]

layout = dict(title='Churn bar plot',
              xaxis=dict(title='churn category'),
              yaxis=dict(title='number of observations'))

fig = go.Figure(data=data, layout=layout)
fig.update_layout(width=800, height=600)
iplot(fig, show_link=False)

* As expected, there is a class imbalance in favor of non-churn clients, which means that metrics such as **accuracy** will be less informative when we evaluate ML models later

* Probably we'll handle this problem a little bit later during ML-modelling.

# Exploratory Data Analysis

Now it makes sense to go over each feature and preprocess it separately.

In [None]:
# Training set
df.info()

For the purposes of EDA let's build some basic functions for convenience:

In [None]:
def show_unique(data, column: str):
    """
    Prints how many distinct values
    the given column of data holds,
    followed by its value counts
    """
    values = data[column]
    n_unique = values.nunique()
    counts = values.value_counts()

    print(f'Number of unique values for "{column}": {n_unique}\n')
    print(f'Value counts for {column}:')
    print(counts)

In [None]:
def boxplot(data, column: str):
    """
    Draws an interactive Plotly box plot
    of a numeric column, with the values
    grouped along x by the "Churn" column
    """
    box = go.Box(x=data['Churn'],
                 y=data[column],
                 name=f'{column}')
    fig = go.Figure(data=[box],
                    layout={'title': f'Boxplot for "{column}" depending on Churn'})

    # Box styling
    fig.update_traces(opacity=1,
                      marker_color='rgb(158,202,225)',
                      marker_line_color='rgb(8,48,107)',
                      marker_line_width=1.5)
    # Fixed canvas size
    fig.update_layout(autosize=False, width=800, height=500)
    iplot(fig, show_link=False)

In [None]:
def distplot(data, column: str):
    """
    Draws an interactive Plotly distplot
    (figure_factory) for a single
    numeric column
    """
    values = data[column].values
    fig = ff.create_distplot([values],
                             group_labels=[column],
                             colors=['rgb(0, 200, 200)'])

    # Title plus a fixed canvas size
    fig.update_layout(title_text=f'Distplot for "{column}"',
                      autosize=False, width=800, height=500)
    fig.show()

In [None]:
def norm_dist(data, column: str):
    """
    Plots the original, the log(x + 1)-transformed
    and the box-cox transformed distribution of a
    numeric column side by side, to check visually
    which one is closest to a normal distribution.

    If scipy cannot apply box-cox (it raises
    ValueError, e.g. when not all values are
    positive), a message is printed and the third
    panel is removed.
    """
    fig, axes = plt.subplots(1, 3, figsize=(20, 5))

    # Original distribution
    sns.distplot(data[column], ax=axes[0])
    axes[0].set_title('Original distribution')

    # Log-normal distribution check (vectorized log(x + 1))
    sns.distplot(np.log(data[column] + 1), ax=axes[1],
                 color='green')
    axes[1].set_title('Log-normal distribution')

    # Distribution after box-cox.
    # Only the transformation itself is guarded, and only against ValueError,
    # so plotting errors are no longer misreported as "data is not positive".
    try:
        transformed = ss.boxcox(data[column])[0]
    except ValueError:
        print('Not all the data is positive => no box-cox transformation is possible')
        axes[2].remove()
    else:
        sns.distplot(transformed, ax=axes[2],
                     color='m')
        axes[2].set_title('Distribution after box-cox')

    plt.show()

In [None]:
def barplot(data, column: str):
    """
    Plots interactive barplot of the value
    counts of the given categorical feature.

    The title and the x-axis label are derived
    from the column name (they used to be
    hard-coded to "Churn" for every column).
    """
    # Count once and reuse for both axes
    counts = data[column].value_counts()
    trace_0 = go.Bar(x=counts.index,
                     y=counts.values,
                     marker_color='violet')

    traces = [trace_0]
    layout = {'title': f'Bar plot for "{column}"',
              'xaxis': {'title': column},
              'yaxis': {'title': 'number of observations'}}

    fig = go.Figure(data=traces, layout=layout)
    fig.update_layout(height=600, width=800)
    iplot(fig, show_link=False)

## Numeric features

Conducting EDA for numeric features

### ClientPeriod

In [None]:
show_unique(df, 'ClientPeriod')

In [None]:
boxplot(df, 'ClientPeriod')

In [None]:
distplot(df, 'ClientPeriod')

According to "**ClientPeriod**" boxplot:

* There are some outliers in churn group 1


* No normal distribution can be observed

Let's check if the distribution becomes log-normal as we use logarithms:

In [None]:
norm_dist(df, 'ClientPeriod')

* The distribution is still not normal but at least a little closer to it

* We'll use log-normal distribution for "**ClientPeriod**" later

### MonthlySpending

In [None]:
show_unique(df, 'MonthlySpending')

In [None]:
boxplot(df, 'MonthlySpending')

In [None]:
distplot(df, 'MonthlySpending')

* We have no outliers according to IQR

* No normal distribution

In [None]:
# Plotting different normal dist transformations
norm_dist(df, 'MonthlySpending')

* After logarithmic transformation we got something that looks similar to bimodal distribution but not to normal one

* Box-cox transformation isn't really helpful in this case as well

* We'll also try using log-normal distribution for this feature later

### TotalSpent

In [None]:
show_unique(df, 'TotalSpent')

In [None]:
boxplot(df, 'TotalSpent')

In [None]:
distplot(df, 'TotalSpent')

* According to IQR the column doesn't contain any outliers

* The distribution of this feature is extremely right-skewed (it has a long tail towards large values)

* So in general most people spend up to 4000 (conventional units) but there are many of those who spend much more

In [None]:
# Plotting different normal dist transformations
norm_dist(df, 'TotalSpent')

* So after the box-cox transformation the distribution of **TotalSpent** became much closer to a normal one


* We will try to use this one for ML model and check if it performs better

As we remember, this feature contained nan-values which were replaced by zeros. Let's take a closer look to these observations:

In [None]:
train.query('TotalSpent == " " and ClientPeriod == 0')

In [None]:
# Rows of the raw train frame with ClientPeriod == 0 whose TotalSpent is not the blank marker
zero_period_rows = train.query('ClientPeriod == 0 and TotalSpent != " "')
zeros = len(zero_period_rows)
print(f'Number of observations with zero ClientPeriod and non-zero TotalSpent: {zeros}')

* What is noticeable - all clients with missing **TotalSpent** have zero **ClientPeriod** and zero **Churn**

* Probably the first idea of imputing **TotalSpent** with zeros was logically correct

### Dealing with outliers:

Earlier we revealed that there are outliers in some numeric columns in Churn 1 group

In [None]:
# Getting rid of outliers using IQR, computed inside the Churn == 1 group.
# Outlier counts are printed for every numeric column, but rows are dropped
# only for ClientPeriod.
for num_col in num_cols:
    churned = df.loc[df['Churn'] == 1, num_col]
    Q1 = churned.quantile(0.25)
    Q3 = churned.quantile(0.75)
    IQR = Q3 - Q1
    left_lim = Q1 - 1.5*IQR
    right_lim = Q3 + 1.5*IQR
    # Keep the index LABELS of the outlying rows. np.where(...)[0] would give
    # positions inside the Churn == 1 subset, and dropping those numbers from
    # the full frame removes unrelated rows.
    outliers = churned.index[(churned > right_lim) | (churned < left_lim)]
    print(f'Number of outliers fo {num_col}: {len(outliers)}')
    if num_col == 'ClientPeriod':
        df.drop(index=outliers, inplace=True)

In [None]:
df.info()

In [None]:
# Log-transform TotalSpent as log(x + 1), vectorized over the whole column
# instead of calling a Python lambda per element.
# NOTE: not idempotent - re-running this cell applies the logarithm again.
df['TotalSpent'] = np.log(df['TotalSpent'] + 1)

# Categorical features

EDA for categorical features:

Some basic functions for categorical features:

In [None]:
def pie_bar(data, column):
    """
    Plots a pie chart and a bar plot of the value
    counts of the given categorical column,
    side by side on one figure.
    """
    # Count once, so that bar labels and bar heights always come from the same
    # series. Counting the raw column and its astype(str) copy separately can
    # order ties differently, or differ in length when the column has unused
    # categories.
    counts = data[column].value_counts()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].pie(counts,
                labels=counts.index,
                autopct='%1.1f%%', shadow=True, startangle=90)
    axes[0].set_title(f'Pie chart for "{column}"')

    sns.barplot(x=counts.index.astype(str),
                y=counts.values,
                palette='Spectral', ax=axes[1])
    axes[1].set_title(f'Bar plot for "{column}"')
    plt.xticks(rotation=90)
    plt.subplots_adjust(wspace=0.6);

## Sex

In [None]:
show_unique(df, 'Sex')

In [None]:
pie_bar(df, 'Sex')

* We have almost an equal number of male and female clients

In [None]:
# Differences between spendings among male and female clients
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].set_title('Monthly spending by sex', size=16)
axes[1].set_title('Total spending by sex', size=16)
# Columns are passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as x.
sns.boxplot(x='MonthlySpending', y='Sex', data=df, ax=axes[0], palette="Set3")
sns.boxplot(x='TotalSpent', y='Sex', data=df, ax=axes[1], palette="magma")
plt.subplots_adjust(wspace=0.6);

* Visually there are no significant differences between spendings of men and women

It seems also quite important to check if **Sex** affects the chance that a client will go away:

In [None]:
# Number of clients per sex, split by Churn.
# Keyword arguments are used because newer seaborn releases no longer accept
# the plotted variable as a positional argument.
sns.countplot(x='Sex', hue='Churn', data=df,
              palette='Blues')
plt.title('Churn by sex');

* Well, here we also have no significant difference

* We'll decide later whether to include this feature in ML model or not

## IsSeniorCitizen

In [None]:
show_unique(df, 'IsSeniorCitizen')

In [None]:
pie_bar(df, 'IsSeniorCitizen')

* Most clients are not seniors

## HasPartner

In [None]:
show_unique(df, 'HasPartner')

In [None]:
pie_bar(df, 'HasPartner')

* Almost equal amounts of people have and don't have a partner (i. e. a girlfriend/wife, a boyfriend/husband)

## HasChild

In [None]:
show_unique(df, 'HasChild')

In [None]:
pie_bar(df, 'HasChild')

* About twice as many people have no children

## HasPhoneService

In [None]:
show_unique(df, 'HasPhoneService')

In [None]:
pie_bar(df, 'HasPhoneService')

* The majority of people have phone service - quite an obvious result

**Too much time spent on plotting each cat-feature one by one.**

Let's go over the other features in a loop:

In [None]:
# cat_cols[5:] skips the first five categorical features (Sex, IsSeniorCitizen,
# HasPartner, HasChild, HasPhoneService), which were plotted one by one above.
# For each remaining feature: print its value counts and draw the pie/bar pair.
for cat_col in cat_cols[5:]:
    show_unique(df, cat_col)
    pie_bar(df, cat_col)
    print()

* Nothing extraordinary was noticed for the other categorical features

# Correlation analysis

Conducting correlation analysis of dataset will help us to check if there is a **multicollinearity** problem and to understand better **which features are correlated with our target variable**:

## Numeric features

In [None]:
# Scatter plots and dists
sns.pairplot(df[num_cols + [target_col]], hue=target_col);

* TotalSpent and ClientPeriod are quite dependent

* In general, the more a person pays monthly and the less his/her client period is the more is the chance that he/she will stop using telecom's services

In [None]:
# One panel per numeric feature, each split into the Churn groups
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, num_col in zip(axes, num_cols):
    sns.boxplot(x="Churn", y=num_col, data=df, palette='husl', ax=ax)
    ax.set_title(f'Boxplots for {num_col} by Churn groups')

* Visually we can see clear differences between groups

There are several statistical ways to check the dependency between a categorical and a numeric feature, but we'll use the **non-parametric Mann-Whitney test** (since the assumption of normal distribution is not met) to check whether the distribution of each numeric feature differs between the Churn groups:

In [None]:
# Mann-Whitney U test per numeric feature: churned vs. retained clients.
# Raises ValueError when either group has 20 observations or fewer.
for num_col in num_cols:
    sample_1 = df.loc[df['Churn'] == 1, num_col]
    sample_2 = df.loc[df['Churn'] == 0, num_col]
    if sample_1.shape[0] <= 20 or sample_2.shape[0] <= 20:
        raise ValueError('At least one of the samples is too small statistically')
    # The alternative is stated explicitly: the question is whether the groups
    # differ in either direction, and the default has changed between SciPy
    # releases.
    mwu = ss.mannwhitneyu(sample_1, sample_2, alternative='two-sided')
    p_value = mwu.pvalue
    if p_value < 0.01:
        print(f'With 99% of confidence significant differences were found for "{num_col}" Churn groups')

* So all the numeric features are important, we'll include them in ML-model

In [None]:
# Correlation matrix of the numeric features plus the target, drawn as an
# annotated heatmap (values shown with 3 decimals).
# NOTE(review): DataFrame.corr() is called with its defaults - presumably
# Pearson correlation; confirm.
plt.figure(figsize=(12, 6))
sns.heatmap(df[num_cols + [target_col]].corr(), cmap='Blues', annot=True, fmt='.3f', linewidth=0.6)
plt.title('Correlation matrix for numeric features and target');

* Two columns (**TotalSpent** and **ClientPeriod**) correlate a lot -> it's close to **multicollinearity** (corr_coef > 0.8)

* It seems quite natural that the bigger a client's period is, the more this client has already spent

* It is reasonable to conclude that all the numeric features explain our target variable well enough

In [None]:
# # # Deleting TotalSpent to avoid multicollinearity
# df.drop(['TotalSpent'], axis=1, inplace=True)
# num_cols.remove('TotalSpent')

## Categorical features

To check dependencies between categorical features and target, we'll use **Chi2-test**:

In [None]:
# Chi2 test between every categorical feature and Churn.
# stats collects the statistic of every feature; imp_stats / imp_cat_cols
# keep only the features with p-value below 0.01.
stats, imp_stats, imp_cat_cols = [], [], []
for cat_col in cat_cols:
    contingency_table = pd.crosstab(df[cat_col], df['Churn'])
    test_result = ss.chi2_contingency(observed=contingency_table)
    chi2_stat, p_value = test_result[0], test_result[1]
    stats.append(chi2_stat)
    if p_value < 0.01:
        print(f'With 99% of confidence significant differences found for {cat_col}')
        imp_cat_cols.append(cat_col)
        imp_stats.append(chi2_stat)

In [None]:
print(f'Number of significant categorical features: {len(imp_stats)}')

In [None]:
print('Significant categorical columns:\n\n', np.array(imp_cat_cols))

In [None]:
# Checking chi2 statistics: one horizontal bar per categorical feature,
# sorted ascending by the statistic collected in `stats`.
plt.figure(figsize=(12, 6))
series = pd.Series(stats, index=cat_cols).sort_values(ascending=True)
series.plot(kind='barh')
plt.title('Chi2 statistics for categorical features');

* Two columns were considered insignificant: **Sex**, **HasPhoneService**

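* So my earlier assumption that **Sex** plays no role in defining **Churn** is supported statistically: the test found no significant association at the 1% level (an absence of evidence rather than a proof of independence)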

In [None]:
# Categorical columns that the chi2 test did not flag as significant
insignificant_cols = ['Sex', 'HasPhoneService']

# Remove them from the frame ...
df.drop(columns=insignificant_cols, inplace=True)

# ... and from the bookkeeping lists
for col_name in insignificant_cols:
    cat_cols.remove(col_name)
feature_cols = num_cols + cat_cols

# Feature Engineering

### Polynomial features

* Probably there is a more complicated (non-linear) dependency between features and target

* Polynomials and interactions between numeric features might help to predict the target better in this case

In [None]:
# # Trying out polynomial features and interactions of features
# poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
# poly.fit(df[num_cols])
# columns = poly.get_feature_names(num_cols)
# df_poly = pd.DataFrame(poly.transform(df[num_cols]),
#                        columns=columns)
# df = pd.concat([df, df_poly], axis=1)
# num_cols.extend(df_poly.columns)

> Nothing positive came out of this experiment 

> Polynomials are useless in this case - lead to overfitting

# Machine Learning

Now let's train a basic ML-model and then try to improve it by feature engineering and other interesting stuff

In [None]:
df.info()

In [None]:
df.head()

**Preprocessing for ML:**

In [None]:
# Target vector and the raw (non-encoded) feature frame
y = df[target_col]
X_df = df[feature_cols]

# One-hot encode the categorical features, dropping the first level of each
dummies = pd.get_dummies(df[cat_cols], drop_first=True)

# Numeric features first, dummy columns after them
design_parts = [df[num_cols], dummies]
X = pd.concat(design_parts, axis=1)

In [None]:
# Splitting data: 80% train / 20% validation, fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

# Scaling: the scaler is fitted on the training split only and then applied
# to the validation split.
# NOTE(review): X holds the dummy columns as well, so RobustScaler is applied
# to every column here, not just to the numeric ones.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# Sample sizes
X.shape, y.shape, X_train.shape, y_train.shape, X_val.shape, y_val.shape

### Logistic Regression:

In [None]:
# Logistic regression with built-in 5-fold search over C (scored by ROC-AUC)
c_grid = np.linspace(1, 10, 10)
lr_cv = LogisticRegressionCV(Cs=c_grid, cv=5, scoring='roc_auc',
                             solver='lbfgs', max_iter=500, refit=True)
lr_cv.fit(X_train_scaled, y_train)

# Positive-class probabilities on both splits, hard labels on validation
y_train_pred_proba = lr_cv.predict_proba(X_train_scaled)[:, 1]
y_pred_proba = lr_cv.predict_proba(X_val_scaled)[:, 1]
y_pred = lr_cv.predict(X_val_scaled)

lr_val_auc = roc_auc_score(y_val, y_pred_proba)
lr_val_f1 = f1_score(y_val, y_pred)
print(f'ROC-AUC LR CV: {lr_val_auc}\nF1-Score LR CV: {lr_val_f1}')

In [None]:
# Plotting roc-auc curve for the LogisticRegressionCV model on both splits
train_auc = roc_auc_score(y_train, y_train_pred_proba)
val_auc = roc_auc_score(y_val, y_pred_proba)

plt.figure(figsize=(10,5))
# roc_curve(...)[:2] supplies the first two returned arrays as x and y
# (presumably false-positive and true-positive rates - confirm).
plt.plot(*roc_curve(y_train, y_train_pred_proba)[:2], label='train AUC={:.4f}'.format(train_auc))
# NOTE: the curve labelled 'test AUC' is computed on the validation split (y_val)
plt.plot(*roc_curve(y_val, y_pred_proba)[:2], label='test AUC={:.4f}'.format(val_auc))
legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
legend_box.set_facecolor("white")
legend_box.set_edgecolor("black")
# Diagonal reference line from (0, 0) to (1, 1); drawn after the legend, so it has no legend entry
plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))
plt.title('ROC-AUC for train and validation set')
plt.show()

# Pipeline

### Pipeline structure:

1) Use custom transformer to add new features

2) Use **ColumnTransformer**: to drop the columns that are not required for model training, to scale numeric features using *RobustScaler()*


3) Use ML model to predict **Churn**

In [None]:
# # Step 1 - Preprocess TotalSpent - use for input data
# class ColumnFormatter(BaseEstimator):
#     """
#     Changes TotalSpent column values
#     to logarithmic values
#     """
#     def __init__(self):
#         pass
    
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         # Formatting TotalSpent
#         X['TotalSpent'] = X['TotalSpent'].replace(' ', '0')
#         X['TotalSpent'] = X['TotalSpent'].astype('float64')
#         X['TotalSpent'] = X['TotalSpent'].apply(lambda x: np.log(x + 1))
        
#         return X

In [None]:
# Step 2 - preprocess the other data

# Categorical columns: one-hot encoded with the first level dropped
onehot_columns = [
    'IsSeniorCitizen',
    'HasMultiplePhoneNumbers',
    'HasPartner',
    'HasChild',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod',
]

# Numeric columns: scaled with RobustScaler
scaled_columns = ['ClientPeriod', 'MonthlySpending', 'TotalSpent']

# Any column not listed above is passed through unchanged
preprocess = ColumnTransformer(
    transformers=[
        ('binary_data', OneHotEncoder(drop='first'), onehot_columns),
        ('scale_data', RobustScaler(), scaled_columns),
    ],
    remainder='passthrough',
)

In [None]:
# Pipeline: shared preprocessing step followed by a logistic regression
log_reg = LogisticRegression(max_iter=1000, random_state=123)
pip_log = Pipeline(steps=[('pre_processing', preprocess),
                          ('logistic_regression', log_reg)])

# Hold-out split of the raw (non-encoded) features; encoding and scaling
# happen inside the pipeline
split = train_test_split(X_df, y, test_size=0.2, random_state=123)
X_train, X_val, y_train, y_val = split

In [None]:
# Grid search for pipeline model (5-fold CV, scored by ROC-AUC).
# Two invalid candidates are removed from the grid, since every fit using them
# fails and is scored as NaN:
#   * C must be strictly positive, so the leading 0 of the linspace is dropped
#     (the remaining nine values are unchanged);
#   * "no class weighting" is the None object, not the string 'none'.
params = {'logistic_regression__C': np.linspace(0, 10, 10)[1:],
          'logistic_regression__class_weight': [None, 'balanced'],
          'logistic_regression__solver': ['newton-cg', 'lbfgs', 'sag']}

grid = GridSearchCV(pip_log, params, cv=5, n_jobs=-1, scoring='roc_auc', verbose=1)
grid.fit(X_train, y_train)

In [None]:
print(f'Best score for pipeline: {grid.best_score_}')
print(f'Best pipeline: {grid.best_estimator_}')

In [None]:
# Plotting roc-auc curve for the best pipeline found by the grid search
# (grid.predict_proba is called on the fitted search object)
grid_train_pred_proba = grid.predict_proba(X_train)[:, 1]
grid_val_pred_proba = grid.predict_proba(X_val)[:, 1]

train_auc = roc_auc_score(y_train, grid_train_pred_proba)
val_auc = roc_auc_score(y_val, grid_val_pred_proba)

plt.figure(figsize=(10,5))
# roc_curve(...)[:2] supplies the first two returned arrays as x and y
# (presumably false-positive and true-positive rates - confirm).
plt.plot(*roc_curve(y_train, grid_train_pred_proba)[:2], label='train AUC={:.4f}'.format(train_auc))
# NOTE: the curve labelled 'test AUC' is computed on the validation split (y_val)
plt.plot(*roc_curve(y_val, grid_val_pred_proba)[:2], label='test AUC={:.4f}'.format(val_auc))
legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
legend_box.set_facecolor("white")
legend_box.set_edgecolor("black")
# Diagonal reference line from (0, 0) to (1, 1); drawn after the legend, so it has no legend entry
plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))
plt.title('ROC-AUC for train and validation set')
plt.show()

* Seems like this model predicts a little better on the validation set

In [None]:
# Defining best logistic regression model
log_reg_best = grid.best_estimator_

### Gradient boosting

We'll use the Yandex CatBoost library, which handles categorical features conveniently and gives accurate results even without hyperparameter tuning.

In [None]:
# Splitting the data with no dummies
X_train, X_val, y_train, y_val = train_test_split(X_df, y, test_size=0.2,
                                                    random_state=123)

In [None]:
# Baseline CatBoost classifier: categorical columns are passed as-is, and AUC
# is tracked as an additional metric
cbc = CatBoostClassifier(cat_features=cat_cols,
                         custom_loss='AUC',
                         verbose=False)
cbc.fit(X_train, y_train, eval_set=(X_val, y_val), plot=True)

# Positive-class probabilities on both splits
cbc_train_pred_proba = cbc.predict_proba(X_train)[:, 1]
cbc_val_pred_proba = cbc.predict_proba(X_val)[:, 1]

# Checking roc-auc metrics (validation first, then training)
cbc_val_auc = roc_auc_score(y_val, cbc_val_pred_proba)
cbc_train_auc = roc_auc_score(y_train, cbc_train_pred_proba)
print(f'Validation ROC-AUC: {cbc_val_auc}')
print(f'Training ROC-AUC: {cbc_train_auc}')

In [None]:
# # catboost grid search cv
# cbc_grid = CatBoostClassifier(n_estimators=500, silent=True,
#                               cat_features=cat_cols, eval_metric='AUC')

# cbc_grid.grid_search({'l2_leaf_reg': np.linspace(0, 10, 20),
#                       'depth': np.arange(2, 10, 2)},
#                      X_df, y, cv=5,
#                      plot=True, refit=True)

In [None]:
# # Best parameters
# best_params = cbc_grid.get_params()

# Hyperparameters of the final CatBoost model.
# cat_features reuses cat_cols instead of a hard-coded copy of the same
# fourteen names: at this point cat_cols holds exactly that list (Sex and
# HasPhoneService were removed earlier), and a duplicate list would silently
# diverge if the feature selection ever changed.
params = {'silent': True,
          'eval_metric': 'AUC',
          'n_estimators': 500,
          'cat_features': list(cat_cols),
          'depth': 4,
          'l2_leaf_reg': 2}


# Best model, trained on the train split with the validation split as eval set
cbc_best = CatBoostClassifier(**params)
cbc_best.fit(X_train, y_train, eval_set=(X_val, y_val), plot=True)

In [None]:
# Checking the metric on validation set
best_metric = roc_auc_score(y_val, cbc_best.predict_proba(X_val)[:, 1])
print(f"Best gradient boosting score: {best_metric}")

In [None]:
# Preprocessing testing set - mirrors the transformations applied to train
test['TotalSpent'] = test['TotalSpent'].replace(' ', '0').astype('float64')
test['TotalSpent'] = test['TotalSpent'].apply(lambda x: np.log(x + 1))
# Train encodes IsSeniorCitizen as 'Yes'/'No' before casting to category.
# A bare astype('category') would keep 1/0 here, i.e. category values the
# model never saw for this feature during training.
test['IsSeniorCitizen'] = test['IsSeniorCitizen'].map({1: 'Yes', 0: 'No'}).astype('category')
test.drop(['Sex', 'HasPhoneService'], axis=1, inplace=True)
test.head()

In [None]:
# Final predictions.
# Columns are selected by name in the order the model was trained on
# (X_df = df[feature_cols]), so the result does not depend on the column
# order of the test file.
sub['Churn'] = cbc_best.predict_proba(test[feature_cols])[:, 1]
sub.to_csv('submission.csv', index=False)
display(sub)