In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv('customer_churn.csv')

In [3]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data.shape

(7043, 21)

In [5]:
data=data[['tenure','SeniorCitizen','MonthlyCharges', 'Churn']]

In [6]:
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.3,No
4,2,0,70.7,Yes


In [7]:
data = data.drop_duplicates()

In [8]:
data.columns = [val.lower().replace(' ','_') for val in data.columns]
data.columns

Index(['tenure', 'seniorcitizen', 'monthlycharges', 'churn'], dtype='object')

In [9]:
Y = data['churn']
data = data.drop(['churn'], axis=1)

In [10]:
cat=data.select_dtypes(include=np.object)

In [11]:
num=data.select_dtypes(include = np.number)

In [12]:
from sklearn.preprocessing import StandardScaler #normalizer is an alternative to Standardscaler. Norm distributes between 0 and 1, Sscaler distributes between -1 an +1
fitted = StandardScaler().fit(num)
x_normalized = fitted.transform(num)
print(x_normalized.shape)
x_normalized=pd.DataFrame(x_normalized)

(6566, 3)


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_normalized, Y, test_size=0.4, random_state=10)

In [14]:
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [15]:
classification.score(X_test, y_test)

0.7898743814236772

In [16]:
predictions = classification.predict(X_test)
predictions

array(['Yes', 'No', 'No', ..., 'Yes', 'No', 'Yes'], dtype=object)

In [17]:
pd.Series(predictions).value_counts()

No     2112
Yes     515
dtype: int64

In [18]:
y_test.value_counts()

No     1926
Yes     701
Name: churn, dtype: int64

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[1743,  183],
       [ 369,  332]], dtype=int64)

In [20]:
classification.predict_proba(X_test)

array([[0.37046121, 0.62953879],
       [0.75668007, 0.24331993],
       [0.8631001 , 0.1368999 ],
       ...,
       [0.39804211, 0.60195789],
       [0.96255492, 0.03744508],
       [0.25635252, 0.74364748]])

In [21]:
classification.predict_proba(X_test)[::,0]

array([0.37046121, 0.75668007, 0.8631001 , ..., 0.39804211, 0.96255492,
       0.25635252])

In [22]:
from sklearn.preprocessing import label_binarize
y = label_binarize(Y, classes=[0,1,2])
n_classes = 4
y

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [23]:
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data, iris.target

y = label_binarize(y, classes=[0,1,2])
n_classes = 3
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

checking if oversampling improve my model score

In [25]:
from imblearn.over_sampling import SMOTE #oversampling
smote = SMOTE()
X_sm, y_sm = smote.fit_sample(x_normalized, Y)
y_sm.value_counts()

No     4799
Yes    4799
Name: churn, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.4, random_state=10)

In [30]:
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [31]:
classification.score(X_test, y_test)

0.7247395833333333

combining both oversample and tomeklink expected to be better because we would have
both balanced data and well defined and separated groups: but its just better than smote and worse than tomeklinks

In [38]:
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(X_sm, y_sm)
y_tl.value_counts()

Yes    4799
No     4289
Name: churn, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.4, random_state=10)

In [40]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [41]:
classification.score(X_test, y_test)

0.7422992299229924

checking if undersampling improve my model score. 
Tomeklinks more than undersampling, better defines groups by eliminating values at the forefront (ambiguous value?) 

In [34]:
from imblearn.under_sampling import TomekLinks #undersamping
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(x_normalized, Y)
y_tl.value_counts()

No     4293
Yes    1767
Name: churn, dtype: int64

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.4, random_state=10)

In [36]:
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [37]:
classification.score(X_test, y_test)

0.7937293729372937