In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from scipy import stats
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the customer-churn dataset; the path is relative to the kernel's working directory.
data = pd.read_csv('Customer-Churn.csv')
# Preview the first two rows to check the columns that were read.
data.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No


In [3]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [4]:
# Fill missing TotalCharges values with the mean of the column.
total_charges_mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(total_charges_mean)

In [5]:
# Encode Churn as an integer target so the models can use it:
# 1 where the value is 'Yes', 0 for anything else.
# NOTE: re-running this cell on the already-encoded column yields all zeros,
# because no value equals 'Yes' any more.
data['Churn'] = data['Churn'].eq('Yes').astype(int)

In [6]:
# Show how many rows fall into each Churn class (class balance check).
data['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [7]:
# Predictor columns used for modelling.
feature_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
X = data[feature_columns]
# Target column.
y = data['Churn']

In [8]:
# Oversample the minority class with SMOTE.
# A fixed random_state makes the generated synthetic samples reproducible
# across kernel restarts; without it every run produces different data.
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)
# Class counts after resampling.
y_sm.value_counts()

0    5174
1    5174
Name: Churn, dtype: int64

In [9]:
# Hold out 33% of the SMOTE-resampled data for evaluation.
# A fixed seed makes the split (and the fitted tree) reproducible across runs.
# NOTE(review): the data was resampled before this split, so synthetic points
# interpolated from rows that land in the test set can end up in the training
# set. Consider splitting first and resampling only the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm, test_size=0.33, random_state=42
)
model_sm_re = DecisionTreeRegressor(random_state=42)
model_sm_re.fit(X_train, y_train)
# For a regressor, .score() returns the R^2 coefficient of determination
# (it can be negative); it is not an accuracy.
model_sm_re.score(X_test, y_test)

-0.04914318931354189

In [10]:
# Hold out 33% of the SMOTE-resampled data for evaluation.
# A fixed seed makes the split (and the fitted tree) reproducible across runs.
# NOTE(review): the data was resampled before this split, so synthetic points
# interpolated from rows that land in the test set can end up in the training
# set. Consider splitting first and resampling only the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm, test_size=0.33, random_state=42
)
model_sm_cl = DecisionTreeClassifier(random_state=42)
model_sm_cl.fit(X_train, y_train)
# For a classifier, .score() returns the mean accuracy on the given data.
model_sm_cl.score(X_test, y_test)

0.7209370424597364

In [11]:
# Undersample with imblearn's TomekLinks, using its default settings.
tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X, y)
# Class counts after resampling.
y_tl.value_counts()

0    4620
1    1869
Name: Churn, dtype: int64

In [12]:
# Hold out 33% of the Tomek-links-resampled data for evaluation.
# A fixed seed makes the split (and the fitted tree) reproducible across runs.
# NOTE(review): the data was resampled before this split, so the test set is
# drawn from the resampled distribution rather than the original one.
X_train, X_test, y_train, y_test = train_test_split(
    X_tl, y_tl, test_size=0.33, random_state=42
)
model_tl_re = DecisionTreeRegressor(random_state=42)
model_tl_re.fit(X_train, y_train)
# For a regressor, .score() returns the R^2 coefficient of determination
# (it can be negative); it is not an accuracy.
model_tl_re.score(X_test, y_test)

-0.20902282418112983

In [13]:
# Hold out 33% of the Tomek-links-resampled data for evaluation.
# A fixed seed makes the split (and the fitted tree) reproducible across runs.
# NOTE(review): the data was resampled before this split, so the test set is
# drawn from the resampled distribution rather than the original one.
X_train, X_test, y_train, y_test = train_test_split(
    X_tl, y_tl, test_size=0.33, random_state=42
)
model_tl_cl = DecisionTreeClassifier(random_state=42)
model_tl_cl.fit(X_train, y_train)
# For a classifier, .score() returns the mean accuracy on the given data.
model_tl_cl.score(X_test, y_test)

0.7436974789915967

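The classifier scores higher than the regressor each time, but the two scores are not directly comparable: `DecisionTreeRegressor.score` reports R², whereas `DecisionTreeClassifier.score` reports accuracy. Applying Tomek links or SMOTE does not significantly change the scores.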