In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('churn.xlsx')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
df4 = df.copy()
df5 = df.copy()

In [5]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [6]:
df['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [7]:
sns.countplot(df.Churn)

ValueError: could not convert string to float: 'No'

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

In [None]:
df.corr()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(),annot=True)

In [None]:
sns.countplot(x='gender',hue='Churn',data=df)

In [None]:
df['PhoneService'].value_counts()

# Preprocess the dataset

In [None]:
df.drop('customerID',axis=1,inplace=True)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df.columns

In [None]:
from pandas.core.dtypes.common import is_numeric_dtype

In [None]:
for col in df.columns:
    if is_numeric_dtype(df[col]):
        continue
    else:
        df[col] = le.fit_transform(df[col])

In [None]:
df.head()

In [None]:
x = df.drop('Churn',axis=1)

In [None]:
y= df[['Churn']]

In [None]:
x.head()

In [None]:
x.shape

In [None]:
y.head()

In [None]:
df.Churn.value_counts()

In [None]:
not_churned = (5163/(5163+1869))*100
churned = (1869/(5163+1869))*100

In [None]:
print('Not Churned {} percent of total people'.format(not_churned))
print('Churned {} percent of total people'.format(churned))

# Synthetic Minority Oversampling Technique(SMOTE)

In [None]:
!pip install imblearn

In [None]:
from imblearn.combine import SMOTETomek
sm = SMOTETomek(random_state=44)


In [None]:
y.value_counts() #before sampling

In [None]:
X,Y = sm.fit_resample(x,y)

In [None]:
Y.value_counts()

# Near Miss for under sampling

In [None]:
from imblearn.under_sampling import NearMiss

In [None]:
nm = NearMiss()
new_x, new_y  = nm.fit_resample(x,y)

In [None]:
new_y.value_counts()

# Over Sampling

In [None]:
df2.head()

In [None]:
df2.Churn.value_counts()

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
over = RandomOverSampler(random_state=44)

In [None]:
x_new, y_new = over.fit_resample(x,y)

In [None]:
y_new.value_counts()

# Hold out cross validation

In [None]:
from sklearn.model_selection import train_test_split as tts

In [None]:
xtrain,xtest,ytrain,ytest = tts(x_new,y_new,random_state=42)

In [None]:
xtrain.to_csv('mominul.csv')

In [None]:
train,test = tts(df,train_size=.8,random_state=50)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [None]:
dt.fit(xtrain,ytrain)

In [None]:
dt.score(xtest,ytest)

# Without Sampling

In [None]:
x.shape

In [None]:
xtrain1,xtest1,ytrain1,ytest1 = tts(x,y,test_size=.3,random_state=42)

In [None]:
dt.fit(xtrain1,ytrain1)

In [None]:
dt.score(xtest1,ytest1)

# K Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate

In [None]:
kfold = KFold(n_splits=4)

In [None]:
result = cross_val_score(dt,x_new,y_new,cv=kfold)

In [None]:
result

In [None]:
result.mean()

In [None]:
result.max()

In [None]:
result.min()

# Stratified K Fold Cross Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
st = StratifiedKFold(n_splits=4)

In [None]:
score2 = cross_val_score(dt,x_new,y_new,cv=st)

In [None]:
score2

In [None]:
score2.mean()

# Leave one out cross validation

In [None]:
from sklearn.model_selection import LeaveOneOut

In [None]:
le = LeaveOneOut()

In [None]:
result3 = cross_val_score(dt,x_new,y_new,cv=le)

In [None]:
result.mean()