In [1]:
# Import Modules
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

## Loading Telco Churn CSV raw data 

In [2]:
#Loading data obtained from Kaggle
customer_churn_df = pd.read_csv(Path("Resources/WA_Fn-UseC_-Telco-Customer-Churn.csv"))

#printing the first 5 rows
customer_churn_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Split target column from dataset
y = customer_churn_df['Churn']
X = customer_churn_df.drop(columns='Churn')

# Set Index
X = X.set_index('customerID')

In [4]:
# Print first 5 entries for target
y[:5]

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [5]:
customer_churn_df = customer_churn_df.replace({'Partner': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'Dependents': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'PhoneService': {'Yes': 1, 'No': 0}})
customer_churn_df = customer_churn_df.replace({'MultipleLines': {'Yes': 1, 'No': 0, "No phone service" : 0}})
customer_churn_df = customer_churn_df.replace({'OnlineSecurity': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'OnlineBackup': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'DeviceProtection': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'TechSupport': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'StreamingTV': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'StreamingMovies': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'PaperlessBilling': {'Yes': 1, 'No': 0, "No internet service" : 0}})
customer_churn_df = customer_churn_df.replace({'Churn': {'Yes': 1, 'No': 0, "No internet service" : 0}})


customer_churn_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,0,DSL,0,...,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,0,DSL,1,...,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,0,DSL,1,...,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,0,DSL,1,...,1,1,0,0,One year,0,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,0,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,1,1,24,1,1,DSL,1,...,1,1,1,1,One year,1,Mailed check,84.80,1990.5,0
7039,2234-XADUH,Female,0,1,1,72,1,1,Fiber optic,0,...,1,0,1,1,One year,1,Credit card (automatic),103.20,7362.9,0
7040,4801-JZAZL,Female,0,1,1,11,0,0,DSL,1,...,0,0,0,0,Month-to-month,1,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,1,0,4,1,1,Fiber optic,0,...,0,0,0,0,Month-to-month,1,Mailed check,74.40,306.6,1


In [6]:
customer_churn_df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

In [7]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

X.head()

Unnamed: 0_level_0,SeniorCitizen,tenure,MonthlyCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7590-VHVEG,0,1,29.85,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
5575-GNVDE,0,34,56.95,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3668-QPYBK,0,2,53.85,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7795-CFOCW,0,45,42.3,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9237-HQITU,0,2,70.7,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)



In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

In [10]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [11]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [12]:
tree_predictions = model.predict(X_test_scaled)

In [13]:
print(classification_report(y_test, tree_predictions))

              precision    recall  f1-score   support

          No       0.85      0.87      0.86      1327
         Yes       0.56      0.53      0.54       434

    accuracy                           0.78      1761
   macro avg       0.71      0.70      0.70      1761
weighted avg       0.78      0.78      0.78      1761



In [14]:
# Create DOT data
dot_data = tree.export_graphviz(
    model, out_file=None, feature_names=X.columns, class_names=["0", "1"], filled=True)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())

InvocationException: GraphViz's executables not found

## DF adjustments from HG

In [None]:
#narrow down the amount of columns (Do we want to keep this?)
new_df=customer_churn_df[["customerID","MonthlyCharges", "TotalCharges", "Churn"]]
new_df

In [None]:
# use get dummies to transform categorical data to binary
new_df=pd.get_dummies(new_df,columns=["Churn"],dtype=float).copy()
new_df.head()

In [None]:
dummies_df_1 = pd.get_dummies(customer_churn_df["Dependents"])
dummies_df_1.head()

In [None]:
customer_churn_df_dummies = pd.get_dummies(customer_churn_df["PhoneService"])
customer_churn_df_dummies.head()

In [None]:
customer_churn_df_dummies

In [None]:
#standardize data for numerical values
from sklearn.preprocessing import StandardScaler

In [None]:
X=new_df[["MonthlyCharges", "TotalCharges"]]
X.head()

In [None]:
y=new_df["Churn"]

In [None]:
x_scaled = StandardScaler().fit_transform(new_df[["MonthlyCharges", "TotalCharges"]])

In [None]:
#run model, fit to model and backtest

In [None]:
customer_churn_df.dtypes

In [None]:
X=customer_churn_df_scaled[["MonthlyCharges", "TotalCharges"]]
X.head()

In [None]:
customer_churn_prediction = pd.concat([new_df, ], axis=1)