In [38]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib as plt
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import silhouette_score

In [39]:
df = pd.read_csv("../Resources/telecom_customer_churn.csv")
df.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [40]:
df = df[df['Customer Status'] != 'Joined']


In [41]:
df = df.drop(['Customer ID','Churn Category','Churn Reason','Zip Code','Latitude','Longitude','Paperless Billing','City'],axis=1)
df.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,Female,37,Yes,0,2,9,,Yes,42.39,No,...,Yes,One Year,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed
1,Male,46,No,0,0,9,,Yes,10.69,Yes,...,No,Month-to-Month,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed
2,Male,50,No,0,0,4,Offer E,Yes,33.65,No,...,Yes,Month-to-Month,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned
3,Male,78,Yes,0,1,13,Offer D,Yes,27.82,No,...,Yes,Month-to-Month,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned
4,Female,75,Yes,0,3,3,,Yes,7.38,No,...,Yes,Month-to-Month,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned


In [42]:
mylist = list(df.select_dtypes(include=['object']).columns)
print(mylist)

for column in mylist:
    df[column] = df[column].astype('category')

['Gender', 'Married', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Payment Method', 'Customer Status']


In [43]:
categorical_cols = list(df.select_dtypes(include=['category']).columns)

from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()

# apply le on categorical feature columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(df[categorical_cols])

#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=df.index)

#Extract only the columns that didnt need to be encoded
data_other_cols = df.drop(columns=categorical_cols)

#Concatenate the two dataframes : 
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [44]:
df.dtypes

Gender                                 int32
Age                                    int64
Married                                int32
Number of Dependents                   int64
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                  int32
Phone Service                          int32
Avg Monthly Long Distance Charges    float64
Multiple Lines                         int32
Internet Service                       int32
Internet Type                          int32
Avg Monthly GB Download              float64
Online Security                        int32
Online Backup                          int32
Device Protection Plan                 int32
Premium Tech Support                   int32
Streaming TV                           int32
Streaming Movies                       int32
Streaming Music                        int32
Unlimited Data                         int32
Contract                               int32
Payment Me

In [45]:
df = df.reset_index(drop=True)

In [46]:
df.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,0,37,1,0,2,9,0,1,42.39,0,...,1,1,1,65.6,593.3,0.0,0,381.51,974.81,1
1,1,46,0,0,0,9,0,1,10.69,1,...,0,0,1,-4.0,542.4,38.33,10,96.21,610.28,1
2,1,50,0,0,0,4,5,1,33.65,0,...,1,0,0,73.9,280.85,0.0,0,134.6,415.45,0
3,1,78,1,0,1,13,4,1,27.82,0,...,1,0,0,98.0,1237.85,0.0,0,361.66,1599.51,0
4,0,75,1,0,3,3,0,1,7.38,0,...,1,0,1,83.9,267.4,0.0,0,22.14,289.54,0


In [47]:
raw_data = df.drop(['Customer Status'],axis=1) #selector column
print(raw_data.head(5))

   Gender  Age  Married  Number of Dependents  Number of Referrals  \
0       0   37        1                     0                    2   
1       1   46        0                     0                    0   
2       1   50        0                     0                    0   
3       1   78        1                     0                    1   
4       0   75        1                     0                    3   

   Tenure in Months  Offer  Phone Service  Avg Monthly Long Distance Charges  \
0                 9      0              1                              42.39   
1                 9      0              1                              10.69   
2                 4      5              1                              33.65   
3                13      4              1                              27.82   
4                 3      0              1                               7.38   

   Multiple Lines  ...  Streaming Music  Unlimited Data  Contract  \
0               0  ...       

In [51]:
raw_data = raw_data.dropna( axis=0, how='any', thresh=None, subset=None, inplace=False)

In [52]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4601 entries, 0 to 6587
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             4601 non-null   int32  
 1   Age                                4601 non-null   int64  
 2   Married                            4601 non-null   int32  
 3   Number of Dependents               4601 non-null   int64  
 4   Number of Referrals                4601 non-null   int64  
 5   Tenure in Months                   4601 non-null   int64  
 6   Offer                              4601 non-null   int32  
 7   Phone Service                      4601 non-null   int32  
 8   Avg Monthly Long Distance Charges  4601 non-null   float64
 9   Multiple Lines                     4601 non-null   int32  
 10  Internet Service                   4601 non-null   int32  
 11  Internet Type                      4601 non-null   int32

In [53]:
norm_data = normalize(raw_data, norm="l1", axis=0)
norm_data = pd.DataFrame(columns=raw_data.columns, data=norm_data)

scaler = MinMaxScaler()
m_norm_data = normalize(raw_data, norm="max", axis=0)
m_norm_data = pd.DataFrame(columns=raw_data.columns, data=m_norm_data)  # convert back to a dataframe
#norm_data["drinks"] = m_norm_data["drinks"]

In [54]:
norm_data = pd.DataFrame(columns=raw_data.columns, data=norm_data)

print(norm_data.head(5))
print(norm_data.info())

     Gender       Age   Married  Number of Dependents  Number of Referrals  \
0  0.000000  0.000168  0.000433                   0.0             0.000223   
1  0.000435  0.000209  0.000000                   0.0             0.000000   
2  0.000435  0.000227  0.000000                   0.0             0.000000   
3  0.000435  0.000354  0.000433                   0.0             0.000112   
4  0.000000  0.000340  0.000433                   0.0             0.000335   

   Tenure in Months     Offer  Phone Service  \
0          0.000056  0.000000       0.000217   
1          0.000056  0.000000       0.000217   
2          0.000025  0.000804       0.000217   
3          0.000082  0.000643       0.000217   
4          0.000019  0.000000       0.000217   

   Avg Monthly Long Distance Charges  Multiple Lines  ...  Streaming Music  \
0                           0.000360        0.000000  ...         0.000000   
1                           0.000091        0.000386  ...         0.000463   
2       

In [55]:
corr_matrix = norm_data.corr()
print(corr_matrix)

                                     Gender       Age   Married  \
Gender                             1.000000  0.008184  0.035431   
Age                                0.008184  1.000000 -0.002949   
Married                            0.035431 -0.002949  1.000000   
Number of Dependents               0.003985 -0.123737  0.295306   
Number of Referrals                0.014248 -0.027765  0.655283   
Tenure in Months                   0.013006 -0.021279  0.380866   
Offer                              0.003454 -0.030862 -0.128195   
Phone Service                           NaN       NaN       NaN   
Avg Monthly Long Distance Charges  0.016399 -0.020600 -0.008848   
Multiple Lines                     0.001179  0.081622  0.133245   
Internet Service                        NaN       NaN       NaN   
Internet Type                      0.000170  0.138604 -0.031885   
Avg Monthly GB Download           -0.017274 -0.570145  0.070406   
Online Security                   -0.020685 -0.092420  0.17060

In [56]:
RAND_STATE=50  # for reproducibility and consistency
folds=3
k_fold = KFold(n_splits=folds, shuffle=True, random_state=RAND_STATE)  # setting generator for k-fold splitting

In [57]:
# Dictionary of hyperparameters to iterate through
# GridSearchCV will try every combination of these hyperparameters and
# return the model with the best score via KFold validation
hyperparams = {
    "n_clusters": [2, 3],
    "n_init": [10, 15, 20],
    "max_iter": [100, 200, 300, 400, 500],
    "tol": [.0000001, .000001, .00001, .0001],
}

k_means = KMeans()  # sets jobs equal to number of cores

ensemble = GridSearchCV(
    estimator=k_means,
    param_grid=hyperparams,
    cv=k_fold,
    n_jobs=-1
)

In [58]:
ensemble.fit(norm_data)

GridSearchCV(cv=KFold(n_splits=3, random_state=50, shuffle=True),
             estimator=KMeans(), n_jobs=-1,
             param_grid={'max_iter': [100, 200, 300, 400, 500],
                         'n_clusters': [2, 3], 'n_init': [10, 15, 20],
                         'tol': [1e-07, 1e-06, 1e-05, 0.0001]})

In [66]:
GridSearchCV(cv=KFold(n_splits=3, random_state=50, shuffle=True),
       error_score='raise',
       estimator=KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, 
    random_state=None, tol=0.0001, verbose=0),
       n_jobs=-1,
       param_grid={'max_iter': [100, 200, 300, 400, 500], 'n_init': [10, 15, 20], 'tol': [1e-07, 1e-06, 1e-05, 0.0001], 'n_clusters': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

GridSearchCV(cv=KFold(n_splits=3, random_state=50, shuffle=True),
             error_score='raise', estimator=KMeans(), n_jobs=-1,
             param_grid={'max_iter': [100, 200, 300, 400, 500],
                         'n_clusters': [2, 3], 'n_init': [10, 15, 20],
                         'tol': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [67]:
# Generate labels for data with model with raw data, compute score
labels = ensemble.predict(norm_data)
score = silhouette_score(norm_data, labels)
print(score)
print(ensemble.best_params_)

0.45378809510274337
{'max_iter': 200, 'n_clusters': 3, 'n_init': 10, 'tol': 1e-06}


In [68]:
# hyperparameters to try out for binary classification with KNN
bin_params = {
    "n_init": [10, 15, 20],
    "max_iter": [100, 200, 300, 400, 500],
    "tol": [.0000001, .000001, .00001, .0001],
}

bin_k_means = KMeans(n_clusters=2)  # set 2 clusters

binary_ensemble = GridSearchCV(
    estimator=bin_k_means,
    param_grid=bin_params,
    cv=k_fold,
    n_jobs=-1
)

binary_ensemble.fit(norm_data)  # fit model to data and return best model

# Generate labels for data with model with raw data, compute score
bin_labels = binary_ensemble.predict(norm_data)
bin_score = silhouette_score(norm_data, bin_labels)

# Output score to user
print(bin_score)
print(binary_ensemble.best_params_)

0.5229315817525568
{'max_iter': 100, 'n_init': 10, 'tol': 1e-07}
