In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [21]:
# read csv file for data set

df = pd.read_csv("../Resources/telecom_customer_churn.csv")
df.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [22]:
# removed joined from customer category
df = df[df['Customer Status'] != 'Joined']

In [23]:
# Fill in default values from nan values

df['Churn Category']                    = df['Churn Category'].replace(np.nan,'satisfied')
df['Internet Type']                     = df['Internet Type'].replace(np.nan,'No service')
df['Multiple Lines']                    = df['Multiple Lines'].replace(np.nan,'NO phone Service')
df['Avg Monthly Long Distance Charges'] = df['Avg Monthly Long Distance Charges'].replace(np.nan,0)
df['Avg Monthly GB Download']           = df['Avg Monthly GB Download'].replace(np.nan,0)
df['Multiple Lines']                    = df['Multiple Lines'].replace(np.nan,'No service')
df['Internet Service']                  = df['Internet Service'].replace(np.nan,'No service')
df['Online Security']                   = df['Online Security'].replace(np.nan,'No service')
df['Online Backup']                     = df['Online Backup'].replace(np.nan,'No service')
df['Device Protection Plan']            = df['Device Protection Plan'].replace(np.nan,'No service')
df['Premium Tech Support']              = df['Premium Tech Support'].replace(np.nan,'No service')
df['Streaming TV']                      = df['Streaming TV'].replace(np.nan,'No service')
df['Streaming Music']                   = df['Streaming Music'].replace(np.nan,'No service')
df['Unlimited Data']                    = df['Unlimited Data'].replace(np.nan,'No service')
df['Streaming Movies']                  = df['Streaming Movies'].replace(np.nan,'No service')

In [24]:
print(df.columns)
df.info()

Index(['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents',
       'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals',
       'Tenure in Months', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6589 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                           

In [25]:
# drop the columns for to desired data set based on discovery and group discussion
df = df.drop(['Customer ID','Churn Reason','Total Refunds','Zip Code','Latitude','Longitude',
            'Paperless Billing','Total Revenue','Total Charges','City','Number of Referrals',
            'Tenure in Months',],axis=1)
print(df.columns)

Index(['Gender', 'Age', 'Married', 'Number of Dependents', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Payment Method',
       'Monthly Charge', 'Total Extra Data Charges',
       'Total Long Distance Charges', 'Customer Status', 'Churn Category'],
      dtype='object')


In [26]:
# drop additional columns for logistic regression model
# lr_df = df.drop(['Avg Monthly Long Distance Charges','Multiple Lines','Device Protection Plan',
#                 'Premium Tech Support','Online Security','Monthly Charge','Total Charges',
#                 'Total Long Distance Charges','Churn Category','Internet Type'],axis=1)

# lr_df.head()


In [27]:
# Select columns for X_train and dummies for yes no catagorical data
lr_df = df[['Avg Monthly Long Distance Charges', 'Offer','Phone Service', 'Monthly Charge',
                'Customer Status','Total Extra Data Charges']]



In [28]:
from sklearn.model_selection import train_test_split

X = pd.get_dummies(data=lr_df,columns=['Offer','Phone Service','Customer Status'
                                            ], prefix="new",drop_first = True)

y = df["Customer Status"]
print(X.shape, y.shape)
y.dtype

print(X)



(6589, 10) (6589,)
      Avg Monthly Long Distance Charges  Monthly Charge  \
0                                 42.39           65.60   
1                                 10.69           -4.00   
2                                 33.65           73.90   
3                                 27.82           98.00   
4                                  7.38           83.90   
...                                 ...             ...   
7037                               2.01           20.95   
7038                              46.68           55.15   
7039                              16.20           85.10   
7041                               2.12           67.85   
7042                               0.00           59.00   

      Total Extra Data Charges  new_Offer A  new_Offer B  new_Offer C  \
0                            0            0            0            0   
1                           10            0            0            0   
2                            0            0          

https://www.nickmccullum.com/python-machine-learning/logistic-regression-python/#adding-dummy-variables-to-the-pandas-dataframe

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print (X_train)

      Avg Monthly Long Distance Charges  Monthly Charge  \
4541                              21.03           55.30   
4976                              22.83           85.45   
3547                              41.16           44.75   
6646                              32.08           44.70   
516                               31.31           91.25   
...                                 ...             ...   
4037                              21.64           99.65   
5552                              41.37           75.55   
5589                              30.31           19.55   
5760                               5.60           63.95   
909                               41.74           98.40   

      Total Extra Data Charges  new_Offer A  new_Offer B  new_Offer C  \
4541                         0            0            0            0   
4976                         0            1            0            0   
3547                        10            0            0            0   

In [30]:
from sklearn.preprocessing import StandardScaler,LabelEncoder, MinMaxScaler

# zcale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [31]:
# Train the model

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [32]:
predictions = model.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")


First 10 Predictions:   ['Churned' 'Stayed' 'Stayed' 'Stayed' 'Stayed' 'Stayed' 'Churned' 'Stayed'
 'Stayed' 'Stayed']
First 10 Actual labels: ['Churned', 'Stayed', 'Stayed', 'Stayed', 'Stayed', 'Stayed', 'Churned', 'Stayed', 'Stayed', 'Stayed']


In [33]:
# create dataframe from predictions
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,Churned,Churned
1,Stayed,Stayed
2,Stayed,Stayed
3,Stayed,Stayed
4,Stayed,Stayed
...,...,...
1643,Stayed,Stayed
1644,Stayed,Stayed
1645,Stayed,Stayed
1646,Churned,Churned


https://towardsdatascience.com/tuning-the-hyperparameters-of-your-machine-learning-model-using-gridsearchcv-7fc2bb76ff27

In [34]:
# # Hyperparameter Tuning by Creating the GridSearchCV model - here for reference as score is 1
from sklearn.model_selection import GridSearchCV

param_grid = {'C': np.logspace(-3,3,7),
              'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
            'max_iter':[1000],}

grid = GridSearchCV(model, param_grid, verbose=3)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV 1/5] END C=0.001, max_iter=1000, solver=newton-cg;, score=0.998 total time=   0.0s
[CV 2/5] END C=0.001, max_iter=1000, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.001, max_iter=1000, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 4/5] END C=0.001, max_iter=1000, solver=newton-cg;, score=0.999 total time=   0.0s
[CV 5/5] END C=0.001, max_iter=1000, solver=newton-cg;, score=1.000 total time=   0.0s
[CV 1/5] END C=0.001, max_iter=1000, solver=lbfgs;, score=0.998 total time=   0.0s
[CV 2/5] END C=0.001, max_iter=1000, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 3/5] END C=0.001, max_iter=1000, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 4/5] END C=0.001, max_iter=1000, solver=lbfgs;, score=0.999 total time=   0.0s
[CV 5/5] END C=0.001, max_iter=1000, solver=lbfgs;, score=1.000 total time=   0.0s
[CV 1/5] END C=0.001, max_iter=1000, solver=liblinear;, score=1.000 total time=   0.0s
[

In [35]:
# create classifcation report
# from sklearn.metrics import classification_report
# classification_report(y_test, predictions)

In [36]:
# save model

import joblib
filename = 'LogisticRegression.sav'
joblib.dump(model, filename)

['LogisticRegression.sav']