In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import tqdm 
import matplotlib.pyplot as plt
import math
import scipy

In [4]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [5]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder,OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [6]:
from sklearn.metrics import roc_auc_score, roc_curve,confusion_matrix

# Loading Raw Data

In [7]:
# Location of the zipped churn CSV, relative to the notebook's working directory.
RAW_DATA_PATH = 'telecomChurn.zip'
raw_data = pd.read_csv(RAW_DATA_PATH)

In [36]:
# Peek at the first rows (head() defaults to 5) to sanity-check the load.
raw_data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


## Handling column descriptions

Since there are roughly a hundred columns in this dataset, let's take a general look at what kind of data is available. 

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import OPTICS, KMeans

In [98]:
# Load the column-name -> description mapping; with orient='index' each JSON
# key becomes a row label and the description text lands in column 0.
descriptions_frame = pd.read_json('columnDescriptions.json', orient='index')

# Keep only the description text, as a one-column frame named 'col_descriptions'.
col_descriptions = (
    descriptions_frame[0]
    .rename('col_descriptions')
    .to_frame()
)

In [99]:
# Preview the first few descriptions (head() defaults to 5 rows).
col_descriptions.head()

Unnamed: 0,col_descriptions
rev_Mean,Mean monthly revenue (charge amount)
mou_Mean,Mean number of monthly minutes of use
totmrc_Mean,Mean total monthly recurring charge
da_Mean,Mean number of directory assisted calls
ovrmou_Mean,Mean overage minutes of use


In [153]:
# TF-IDF vectorizer for the free-text column descriptions; scikit-learn's
# built-in English stop-word list is removed before weighting.
tfidf = TfidfVectorizer(stop_words='english')

In [157]:
# Learn the vocabulary from the description texts and encode each description
# as one TF-IDF row of X.
description_texts = col_descriptions['col_descriptions']
X = tfidf.fit_transform(description_texts)

In [171]:
# Inspect the vocabulary the vectorizer learned from the descriptions.
tfidf.get_feature_names_out()

array(['10', '11', '15', '16', '17', '31', '60', 'account', 'active',
       'adjusted', 'adults', 'age', 'area', 'assisted', 'attempted',
       'average', 'billing', 'blocked', 'calls', 'capability', 'card',
       'care', 'cell', 'change', 'charge', 'child', 'churn', 'class',
       'code', 'completed', 'credit', 'current', 'custcare_mean',
       'customer', 'data', 'date', 'days', 'directory', 'dropped',
       'dualband', 'dummy', 'dwelling', 'equipment', 'estimated',
       'ethnicity', 'failed', 'foreign', 'forwarding', 'geogrpahic',
       'group', 'handset', 'handsets', 'home', 'household', 'inbound',
       'income', 'indicator', 'infobase', 'instance', 'issued', 'known',
       'length', 'letter', 'life', 'limit', 'marital', 'match', 'mean',
       'minute', 'minutes', 'models', 'month', 'monthly', 'months', 'new',
       'number', 'observation', 'outbound', 'overage', 'owner', 'peak',
       'percentage', 'phone', 'placed', 'premier', 'previous', 'price',
       'received'

### OPTICS grouping

In [199]:
# Density-based grouping of the descriptions on their TF-IDF vectors.
optics = OPTICS(min_samples=3)

# Densify the sparse TF-IDF matrix before clustering, then store the label
# assigned to each description next to its text.
description_vectors = X.toarray()
col_descriptions['optics_class'] = optics.fit_predict(description_vectors)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


In [200]:
# How many descriptions ended up under each OPTICS label.
col_descriptions['optics_class'].value_counts()

-1     50
 6      6
 3      6
 11     5
 5      5
 2      5
 0      5
 1      3
 10     3
 9      3
 8      3
 7      3
 4      3
Name: optics_class, dtype: int64

In [201]:
# One array of description texts per OPTICS label, in order of each label's
# first appearance in the frame.
ls = [
    col_descriptions.loc[
        col_descriptions['optics_class'] == label, 'col_descriptions'
    ].to_numpy()
    for label in col_descriptions['optics_class'].unique()
]

In [204]:
# Display the per-label arrays of description texts collected in `ls`.
ls

[array(['Mean monthly revenue (charge amount)',
        'Mean total monthly recurring charge', ' N', ' N', 'N'],
       dtype=object),
 array(['Mean number of monthly minutes of use',
        'Mean number of directory assisted calls',
        'Mean overage minutes of use', 'Mean number of roaming calls',
        'Percentage change in monthly minutes of use vs previous three month average',
        ' Percentage change in monthly revenue vs previous three month average',
        'Mean number of unanswered voice calls',
        'Mean number of unanswered data calls',
        'Mean number of received voice calls',
        'Mean number of inbound calls less than one minute',
        'Mean number of three way calls ',
        'Mean unrounded minutes of use of received voice calls ',
        'Mean number of inbound and outbound peak voice calls',
        'Mean number of peak data calls',
        'Mean number of off-peak voice calls',
        'Mean number of off-peak data calls',
        ' Mea

### KMeans grouping

In [172]:
# K-means grouping of the descriptions on their TF-IDF vectors.
#
# KMeans starts from random centroids, so without a seed the cluster ids (and
# possibly the memberships) change on every kernel restart. Cells that select a
# cluster by a hard-coded id (e.g. kmeans_class == 0) are only meaningful when
# the labelling is reproducible, hence the fixed random_state.
# n_init is pinned explicitly because its default differs across scikit-learn
# versions.
# NOTE(review): cluster ids may differ from outputs saved by earlier unseeded
# runs; re-check which id holds the group of interest after re-running.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
col_descriptions['kmeans_class'] = kmeans.fit_predict(X.toarray())

In [173]:
# How many descriptions were assigned to each k-means cluster.
col_descriptions['kmeans_class'].value_counts()

4    48
0    11
1    10
3     8
9     6
8     5
5     5
6     3
7     2
2     2
Name: kmeans_class, dtype: int64

In [174]:
# One array of description texts per k-means cluster, in order of each
# cluster id's first appearance in the frame.
ls = [
    col_descriptions.loc[
        col_descriptions['kmeans_class'] == cluster_id, 'col_descriptions'
    ].to_numpy()
    for cluster_id in col_descriptions['kmeans_class'].unique()
]

In [211]:
# Descriptions assigned to k-means cluster 0.
in_cluster_zero = col_descriptions['kmeans_class'] == 0
col_descriptions.loc[in_cluster_zero, 'col_descriptions'].to_numpy()

array(['Mean number of monthly minutes of use',
       'Mean overage minutes of use',
       'Mean rounded minutes of use of customer care calls',
       'Mean unrounded minutes of use of customer care (see CUSTCARE_MEAN) calls',
       'Mean unrounded minutes of use of completed voice calls',
       'Mean unrounded minutes of use of completed data calls',
       'Mean unrounded minutes of use of received voice calls ',
       'Mean unrounded minutes of use of peak voice calls',
       'Mean unrounded minutes of use of peak data calls',
       'Mean unrounded minutes of use of off-peak voice calls',
       'Mean unrounded minutes of use of off-peak data calls'],
      dtype=object)