In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the telecom customer-churn data set from CSV and preview the first rows.
csv_path = "../Resources/telecom_customer_churn.csv"
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [4]:
# Drop the rows whose 'Customer Status' is 'Joined'; all other rows are kept.
# .copy() makes the filtered result an independent frame, so the column
# assignments performed in later cells write to `df` itself instead of to a
# slice of the originally loaded frame (the SettingWithCopyWarning pattern,
# which the global warnings filter would otherwise hide).
df = df[df['Customer Status'] != 'Joined'].copy()

In [5]:
# Replace missing values with a per-column default.
# Each column is listed exactly once: text columns default to 'No' (or
# 'satisfied' for 'Churn Category') and the two average-usage columns to 0.
NAN_DEFAULTS = {
    'Churn Category': 'satisfied',
    'Internet Type': 'No',
    'Multiple Lines': 'No',
    'Avg Monthly Long Distance Charges': 0,
    'Avg Monthly GB Download': 0,
    'Internet Service': 'No',
    'Online Security': 'No',
    'Online Backup': 'No',
    'Device Protection Plan': 'No',
    'Premium Tech Support': 'No',
    'Streaming TV': 'No',
    'Streaming Music': 'No',
    'Unlimited Data': 'No',
    'Streaming Movies': 'No',
}

# DataFrame.fillna silently skips dict keys that are not columns, so check
# explicitly to keep failing loudly (KeyError) when an expected column is absent.
missing_cols = [col for col in NAN_DEFAULTS if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns not found in data set: {missing_cols}")

# fillna with a mapping returns a new frame with every listed column filled in
# one pass; rebinding `df` avoids assigning into a filtered slice column by column.
df = df.fillna(value=NAN_DEFAULTS)

In [6]:
# Print the column labels, then the frame summary produced by DataFrame.info().
column_labels = df.columns
print(column_labels)

df.info()

Index(['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents',
       'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals',
       'Tenure in Months', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6589 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                           

In [7]:
# Names of every object-dtype column, printed for reference.
mylist = df.select_dtypes(include=['object']).columns.tolist()
print(mylist)

# Cast all of those columns to the 'category' dtype in one astype call.
df = df.astype({name: 'category' for name in mylist})

['Customer ID', 'Gender', 'Married', 'City', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Customer Status', 'Churn Category', 'Churn Reason']


In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

categorical_cols = list(df.select_dtypes(include=['category']).columns)

# Label-encode every categorical column of `df` in place.
# A separate fitted encoder is kept per column in `label_encoders`, so the
# integer codes of any column can be mapped back to their labels through
# label_encoders[col].classes_. Refitting one shared encoder for all columns
# would retain only the mapping of the last column processed.
label_encoders = {}
le = LabelEncoder()  # remains unfitted only when there are no categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# `le` still ends up fitted on the last categorical column, as it did before.

# One-hot-encode the (now integer-coded) categorical columns.
# NOTE(review): every categorical column is encoded here, including
# high-cardinality ones such as 'Customer ID' and 'City' - confirm that is
# intended before using `data_out` for modelling.
ohe = OneHotEncoder()
array_hot_encoded = ohe.fit_transform(df[categorical_cols])

# OneHotEncoder returns a SciPy sparse matrix by default. Passing that matrix
# to the plain pd.DataFrame constructor does not produce one column per
# category, so build a sparse-backed frame explicitly, aligned on df's index.
data_hot_encoded = pd.DataFrame.sparse.from_spmatrix(array_hot_encoded, index=df.index)

# Columns that did not need to be encoded.
data_other_cols = df.drop(columns=categorical_cols)

# Side-by-side concatenation of the one-hot block and the remaining columns.
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

df.dtypes

Customer ID                            int32
Gender                                 int32
Age                                    int64
Married                                int32
Number of Dependents                   int64
City                                   int32
Zip Code                               int64
Latitude                             float64
Longitude                            float64
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                  int32
Phone Service                          int32
Avg Monthly Long Distance Charges    float64
Multiple Lines                         int32
Internet Service                       int32
Internet Type                          int32
Avg Monthly GB Download              float64
Online Security                        int32
Online Backup                          int32
Device Protection Plan                 int32
Premium Tech Support                   int32
Streaming 

In [9]:
# Feature columns chosen (based on discussion and analysis) for the model.
feature_cols = [
    'Offer',
    'Online Security',
    'Premium Tech Support',
    'Tenure in Months',
    'Number of Referrals',
    'Contract',
    'Monthly Charge',
    'Number of Dependents',
    'Age',
    'Paperless Billing',
    'Unlimited Data',
    'Married',
]
lr_df = df[feature_cols]

# Per-column maximum, used to check the value ranges for the HTML form.
print(lr_df.max())


Offer                     5.00
Online Security           1.00
Premium Tech Support      1.00
Tenure in Months         72.00
Number of Referrals      11.00
Contract                  2.00
Monthly Charge          118.75
Number of Dependents      9.00
Age                      80.00
Paperless Billing         1.00
Unlimited Data            1.00
Married                   1.00
dtype: float64


In [10]:
from sklearn.model_selection import train_test_split

# Features are the selected columns; the target is the 'Customer Status' column.
X, y = lr_df, df["Customer Status"]

# Sanity check: both must have the same number of rows.
print(X.shape, y.shape)
y.dtype

(6589, 12) (6589,)


dtype('int32')

https://www.nickmccullum.com/python-machine-learning/logistic-regression-python/#adding-dummy-variables-to-the-pandas-dataframe

In [11]:
# Split features and target into train and test partitions; the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

print(X_train)

      Offer  Online Security  Premium Tech Support  Tenure in Months  \
4541      0                0                     0                 4   
4976      1                1                     1                71   
3547      0                0                     0                 6   
6646      0                0                     0                 1   
516       2                1                     0                52   
...     ...              ...                   ...               ...   
4037      2                0                     1                41   
5552      0                0                     0                 2   
5589      2                0                     0                53   
5760      0                1                     0                43   
909       0                0                     0                53   

      Number of Referrals  Contract  Monthly Charge  Number of Dependents  \
4541                    0         0           55.30       

In [12]:
from sklearn.preprocessing import StandardScaler,LabelEncoder, MinMaxScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test features.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
# NOTE(review): the model-training cell fits and scores on X_train / X_test,
# not on these scaled arrays - confirm which inputs the saved model expects.



In [13]:
# Fit a logistic-regression classifier and report accuracy on both splits.
from sklearn.linear_model import LogisticRegression

# NOTE(review): the model is fitted on the unscaled X_train even though a
# StandardScaler is fitted (and later saved) in this notebook - confirm that
# consumers of the saved model do not scale their inputs first.
model = LogisticRegression()
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.844161100991702
Testing Data Score: 0.8440533980582524


In [14]:
# Predict on the test features and compare the first ten results with the
# corresponding true labels.
predictions = model.predict(X_test)

first_predicted = predictions[:10]
first_actual = y_test.head(10).tolist()
print(f"First 10 Predictions:   {first_predicted}")
print(f"First 10 Actual labels: {first_actual}")

First 10 Predictions:   [0 1 1 1 1 1 0 1 1 1]
First 10 Actual labels: [0, 1, 1, 1, 1, 1, 0, 1, 1, 1]


In [17]:
from sklearn.metrics import classification_report

# Text summary of precision, recall, F1-score and support for each class.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.72      0.73       479
           1       0.89      0.89      0.89      1169

    accuracy                           0.84      1648
   macro avg       0.81      0.81      0.81      1648
weighted avg       0.84      0.84      0.84      1648



In [None]:
# Put each test-set prediction next to its true label in one frame, then
# replace the original row labels with a fresh 0..n-1 index for display.
predictions = model.predict(X_test)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison.reset_index(drop=True)

https://towardsdatascience.com/tuning-the-hyperparameters-of-your-machine-learning-model-using-gridsearchcv-7fc2bb76ff27

In [None]:
# Hyperparameter tuning with GridSearchCV - kept here for reference
# (original note: "score is 1").
from sklearn.model_selection import GridSearchCV

# Candidate values: seven C values log-spaced from 1e-3 to 1e3, three solvers,
# and a single max_iter setting.
param_grid = {
    'C': np.logspace(-3, 3, 7),
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'max_iter': [1000],
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)
grid.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Persist the fitted model to disk with joblib.
import joblib

filename = 'LogisticRegression.sav'
joblib.dump(value=model, filename=filename)


In [None]:
# Persist the fitted scaler to disk with joblib.
import joblib

filename = 'X_scaler.sav'
joblib.dump(value=X_scaler, filename=filename)
