In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
churnData = pd.read_csv("DATA_Customer-Churn.csv")

In [3]:
churn_data_df = pd.DataFrame(churnData, columns = ['gender', 'SeniorCitizen', 'Partner',
                                                  'Dependents', 'tenure', 'PhoneService',
                                                   'OnlineSecurity', 'OnlineBackup',
                                                   'DeviceProtection', 'TechSupport',
                                                   'StreamingTV', 'StreamingMovies',
                                                   'Contract', 'MonthlyCharges', 'TotalCharges', 'Churn']
                                                   )

In [4]:
churn_data_df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [5]:
churnData.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [10]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
### errors = 'ignore' >>> skip respective rows
### errors = 'coerce' >>> fill errors with Null Values; then handle the Null Values

In [11]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [12]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [21]:
churn_data_df = churnData.dropna(axis=0, how='any', inplace=True)

### alternatively fill the Null values with mean:
### churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))

In [22]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [23]:
churnData['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

### Scaling the Features

In [24]:
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']]
y = pd.DataFrame(data=churnData, columns=['Churn'])
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)

### Logistic Regression Model

In [25]:
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.30)
classification = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred_test = classification.predict(X_test)
y_pred_test

  return f(*args, **kwargs)


array(['No', 'No', 'Yes', ..., 'Yes', 'No', 'No'], dtype=object)

Checking the accuracy of the Test Set

In [None]:
### REVIEW THE CHECK FOR ACCURACY ###

In [30]:
classification.score(X_test, y_test) ### R2

0.7981042654028436

In [27]:
accuracy_score =  ### Accuracy = TP+TN/TP+FP+FN+TN
precision_score =  ### Precision = TP/TP+FP
recall_score =    ### Recall = TP/TP+FN

def performance_log(y_test, y_pred_test):
    return pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Test': [accuracy_score(y_test, y_pred_test),
                                        precision_score(y_test, y_pred_test,pos_label="Yes"),
                                        recall_score(y_test, y_pred_test,pos_label="Yes")]})

performance_log(y_test, y_pred_test)

In [34]:
def plot_conf_matrix(y_test, y_pred_test):
    print("Confusion matrix for the test set")
    print(confusion_matrix(y_test, y_pred_test))
    plot_confusion_matrix(classification,X_test,y_test, values_format = 'd')
    plt.show()

plot_conf_matrix(y_test, y_pred_test)

### Handling Imbalance with Upscampling

In [35]:
counts = churnData['Churn'].value_counts()
counts

No     5163
Yes    1869
Name: Churn, dtype: int64

In [36]:
yes = churnData[churnData['Churn']=='Yes'].sample(counts[0], replace=True)
no = churnData[churnData['Churn']=='No']
data = pd.concat([yes,no], axis=0)
data = data.sample(frac=1)

In [37]:
data['Churn'].value_counts()

No     5163
Yes    5163
Name: Churn, dtype: int64

In [38]:
X = data[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']]
y = pd.DataFrame(data['Churn'])
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.33)
classification = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred_test = classification.predict(X_test)

  return f(*args, **kwargs)


### Handling Imbalance with Downsampling

In [39]:
yes = churnData[churnData['Churn']=='Yes']
no = churnData[churnData['Churn']=='No']
no = no.sample(len(yes))
data = pd.concat([yes,no], axis=0)
data = data.sample(frac=1)
data['Churn'].value_counts()

No     1869
Yes    1869
Name: Churn, dtype: int64

In [40]:
X = data[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']]
y = pd.DataFrame(data['Churn'])
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.33)
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred_test = classification.predict(X_test)

  return f(*args, **kwargs)
