In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import tqdm 
import matplotlib.pyplot as plt
import math
import scipy
import re

In [2]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder,OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [4]:
from sklearn.metrics import roc_auc_score, roc_curve,confusion_matrix

# Loading Raw Data

In [5]:
# Read the raw churn table straight from the zipped CSV archive.
raw_data = pd.read_csv(filepath_or_buffer='telecomChurn.zip')

In [6]:
# Peek at the first five rows (head() defaults to n=5).
raw_data.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,23.9975,219.25,22.5,0.2475,0.0,0.0,0.0,0.0,0.0,-157.25,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,57.4925,482.75,37.425,0.2475,22.75,9.1,9.1,0.0,0.0,532.25,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,16.99,10.25,16.99,0.0,0.0,0.0,0.0,0.0,0.0,-4.25,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38.0,7.5,38.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.5,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,55.23,570.5,71.98,0.0,0.0,0.0,0.0,0.0,0.0,38.5,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


## Handling column descriptions

Since there are literally a hundred columns in this dataset, let's take a general look at what kind of data is available.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import OPTICS, KMeans
from sklearn.feature_extraction.text import CountVectorizer
import gensim

In [73]:
# Load the column-name -> description mapping, keep column 0 of the parsed
# JSON, and expose it as a one-column frame named `col_descriptions`.
col_descriptions = (
    pd.read_json('columnDescriptions.json', orient='index')[0]
    .rename('col_descriptions')
    .to_frame()
)

In [74]:
# Show the first five descriptions (head() defaults to n=5).
col_descriptions.head()

Unnamed: 0,col_descriptions
rev_Mean,Mean monthly revenue (charge amount)
mou_Mean,Mean number of monthly minutes of use
totmrc_Mean,Mean total monthly recurring charge
da_Mean,Mean number of directory assisted calls
ovrmou_Mean,Mean overage minutes of use


In [75]:
# Drop stopwords (e.g. "of", "in") from every description so that only the
# content-bearing words remain for the text features built below.
strip_stopwords = gensim.parsing.preprocessing.remove_stopwords
col_descriptions['clean_desc'] = col_descriptions['col_descriptions'].apply(strip_stopwords)

In [76]:
# Display the description table with the new `clean_desc` column
# (pandas truncates the middle rows in the rendered output).
col_descriptions

Unnamed: 0,col_descriptions,clean_desc
rev_Mean,Mean monthly revenue (charge amount),Mean monthly revenue (charge amount)
mou_Mean,Mean number of monthly minutes of use,Mean number monthly minutes use
totmrc_Mean,Mean total monthly recurring charge,Mean total monthly recurring charge
da_Mean,Mean number of directory assisted calls,Mean number directory assisted calls
ovrmou_Mean,Mean overage minutes of use,Mean overage minutes use
...,...,...
kid11_15,Child 11 - 15 years of age in household,Child 11 - 15 years age household
kid16_17,Child 16 - 17 years of age in household,Child 16 - 17 years age household
creditcd,Credit card indicator,Credit card indicator
eqpdays,Number of days (age) of current equipment,Number days (age) current equipment


### Bag of words

In [12]:
# Whitespace-tokenise each cleaned description, build the gensim vocabulary
# over those tokens, and encode every description as a bag-of-words vector.
tokenizedDesc = [description.split() for description in col_descriptions['clean_desc']]
dictionary = gensim.corpora.Dictionary(tokenizedDesc)
corpus = list(map(dictionary.doc2bow, tokenizedDesc))

### TF-IDF

In [13]:
# Fit a default TF-IDF vectoriser on the cleaned descriptions and keep the
# resulting sparse document-term matrix for the clustering steps below.
tfidf = TfidfVectorizer()
tfidfMatrix = tfidf.fit_transform(raw_documents=col_descriptions['clean_desc'])

In [14]:
# List the vocabulary terms learned by the fitted TF-IDF vectoriser.
tfidf.get_feature_names_out()

array(['10', '11', '15', '16', '17', '31', '60', 'account', 'active',
       'adjusted', 'adults', 'age', 'amount', 'area', 'assisted',
       'attempted', 'average', 'billing', 'blocked', 'calls',
       'capability', 'card', 'care', 'cell', 'change', 'charge', 'child',
       'churn', 'class', 'code', 'completed', 'credit', 'current',
       'custcare_mean', 'customer', 'data', 'date', 'days', 'directory',
       'dropped', 'dualband', 'dummy', 'dwelling', 'equipment',
       'estimated', 'ethnicity', 'failed', 'foreign', 'forwarding',
       'geogrpahic', 'group', 'handset', 'handsets', 'home', 'household',
       'inbound', 'income', 'indicator', 'infobase', 'instance', 'issued',
       'known', 'length', 'letter', 'life', 'limit', 'marital', 'match',
       'mean', 'minute', 'minutes', 'models', 'month', 'monthly',
       'months', 'new', 'number', 'observation', 'off', 'outbound',
       'overage', 'owner', 'peak', 'percentage', 'phone', 'placed',
       'premier', 'previous', 'p

### Optics grouping

In [65]:
# Cluster the dense TF-IDF vectors with OPTICS and store each description's
# cluster label next to it.
optics = OPTICS(min_samples=4)
optics.fit(tfidfMatrix.toarray())
col_descriptions['optics_class'] = optics.labels_

In [66]:
# Number of column descriptions that fell into each OPTICS label.
col_descriptions['optics_class'].value_counts()

-1    60
 3    14
 0     8
 1     8
 4     5
 2     5
Name: optics_class, dtype: int64

In [None]:
# Fit one LDA topic model per OPTICS label so each group of column
# descriptions can be summarised by its dominant terms.
#
# The previous version ignored the loop variable (it always filtered on
# label 0) and threw every fitted model away; the models are now filtered
# by `groupId` and kept in `lda_by_group`, keyed by label.
# Label -1 is still included, exactly as in the original iteration over
# np.unique(optics.labels_).
optics_labels = col_descriptions['optics_class'].to_numpy()
lda_by_group = {}
for groupId in np.unique(optics.labels_):
    # `corpus` was built row-by-row from col_descriptions, so it lines up
    # positionally with the label array.
    group_corpus = [bow for bow, label in zip(corpus, optics_labels) if label == groupId]
    lda_by_group[int(groupId)] = gensim.models.ldamodel.LdaModel(
        group_corpus, num_topics=3, id2word=dictionary, passes=12
    )

In [70]:
# Single-topic LDA over the bag-of-words documents whose OPTICS label is 0.
group0_mask = (col_descriptions['optics_class'] == 0).to_numpy()
group0_corpus = [bow for bow, in_group in zip(corpus, group0_mask) if in_group]
lda = gensim.models.ldamodel.LdaModel(
    group0_corpus,
    num_topics=1,
    id2word=dictionary,
    passes=12,
)

In [71]:
# Pull the double-quoted terms out of the first topic's printed description.
topic_terms = lda.print_topics()[0][1]
re.findall('(".+?")', topic_terms)

['"custom"',
 '"life"',
 '"total"',
 '"number"',
 '"call"',
 '"bill"',
 '"monthli"',
 '"adjust"',
 '"us"',
 '"averag"']

In [72]:
# Rows whose description was assigned OPTICS label 0.
# NOTE(review): the saved output of this cell includes a `clean_desc_stem`
# column that no visible cell creates - confirm where it is defined.
col_descriptions.loc[col_descriptions['optics_class'] == 0]

Unnamed: 0,col_descriptions,clean_desc,optics_class,clean_desc_stem
totcalls,Total number of calls over the life of the cus...,Total number calls life customer,0,total number call life custom
totmou,Total minutes of use over the life of the cust...,Total minutes use life customer,0,total minut us life custom
adjrev,Billing adjusted total revenue over the life o...,Billing adjusted total revenue life customer,0,bill adjust total revenu life custom
adjmou,Billing adjusted total minutes of use over the...,Billing adjusted total minutes use life customer,0,bill adjust total minut us life custom
adjqty,Billing adjusted total number of calls over th...,Billing adjusted total number calls life customer,0,bill adjust total number call life custom
avgrev,Average monthly revenue over the life of the c...,Average monthly revenue life customer,0,averag monthli revenu life custom
avgmou,Average monthly minutes of use over the life o...,Average monthly minutes use life customer,0,averag monthli minut us life custom
avgqty,Average monthly number of calls over the life ...,Average monthly number calls life customer,0,averag monthli number call life custom


### kmeans grouping

In [172]:
# Partition the TF-IDF vectors into 10 k-means clusters and store each
# description's cluster id.
#
# NOTE(review): this cell originally read `X`, which no visible cell
# defines, so it failed on a fresh kernel. `tfidfMatrix` (the matrix the
# OPTICS cell clusters) is assumed to be the intended input - confirm.
# random_state is pinned because a later cell inspects cluster id 0 by
# literal value, and k-means ids are otherwise not stable across runs.
kmeans = KMeans(n_clusters=10, random_state=0)
col_descriptions['kmeans_class'] = kmeans.fit_predict(tfidfMatrix.toarray())

In [173]:
# Number of column descriptions assigned to each k-means cluster.
col_descriptions['kmeans_class'].value_counts()

4    48
0    11
1    10
3     8
9     6
8     5
5     5
6     3
7     2
2     2
Name: kmeans_class, dtype: int64

In [174]:
# One array of original descriptions per k-means cluster, in order of each
# cluster id's first appearance in the frame.
ls = [
    col_descriptions.loc[col_descriptions['kmeans_class'] == group, 'col_descriptions'].to_numpy()
    for group in col_descriptions['kmeans_class'].unique()
]

In [211]:
# Original descriptions of the columns placed in k-means cluster 0.
col_descriptions.loc[col_descriptions['kmeans_class'] == 0, 'col_descriptions'].to_numpy()

array(['Mean number of monthly minutes of use',
       'Mean overage minutes of use',
       'Mean rounded minutes of use of customer care calls',
       'Mean unrounded minutes of use of customer care (see CUSTCARE_MEAN) calls',
       'Mean unrounded minutes of use of completed voice calls',
       'Mean unrounded minutes of use of completed data calls',
       'Mean unrounded minutes of use of received voice calls ',
       'Mean unrounded minutes of use of peak voice calls',
       'Mean unrounded minutes of use of peak data calls',
       'Mean unrounded minutes of use of off-peak voice calls',
       'Mean unrounded minutes of use of off-peak data calls'],
      dtype=object)