# ---------------------------------------------------------------------------------------------------------------

# Mengimport Library : Pandas, Numpy dan Matplotlib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Mengimport CSV File

In [2]:
df = pd.read_excel("Customer Churn 1.xlsx")

# Menampilkan Baris Data Teratas 

Disini kelompok kami menampilkan parameter n (yang diisi dengan 1), maka akan seperti dibawah ini :

In [3]:
df.head(1)

Unnamed: 0,id,JenisKelamin,Umur,Hipertensi,PenyakitJantung,Menikah,Tipe_Pekerjaan,TempatTinggal,GulaDarah,bmi,Merokok,Stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1


# Menampilkan Kerangka Data 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               5110 non-null   int64  
 1   JenisKelamin     5110 non-null   object 
 2   Umur             5110 non-null   float64
 3   Hipertensi       5110 non-null   int64  
 4   PenyakitJantung  5110 non-null   int64  
 5   Menikah          5110 non-null   object 
 6   Tipe_Pekerjaan   5110 non-null   object 
 7   TempatTinggal    5110 non-null   object 
 8   GulaDarah        5110 non-null   float64
 9   bmi              4909 non-null   float64
 10  Merokok          5110 non-null   object 
 11  Stroke           5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


# Menampilkan Bagian Kolom Pada Data 

In [5]:
df.columns

Index(['id', 'JenisKelamin', 'Umur', 'Hipertensi', 'PenyakitJantung',
       'Menikah', 'Tipe_Pekerjaan', 'TempatTinggal', 'GulaDarah', 'bmi',
       'Merokok', 'Stroke'],
      dtype='object')

# Menghapus Kolom customerID 

In [6]:
df_train = df.drop(['id'],axis = 1)
df_train

Unnamed: 0,JenisKelamin,Umur,Hipertensi,PenyakitJantung,Menikah,Tipe_Pekerjaan,TempatTinggal,GulaDarah,bmi,Merokok,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
df_train['bmi'].fillna((df_train['bmi'].mean()), inplace=True)
df_train

Unnamed: 0,JenisKelamin,Umur,Hipertensi,PenyakitJantung,Menikah,Tipe_Pekerjaan,TempatTinggal,GulaDarah,bmi,Merokok,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


# Implementasi Library Numpy

In [8]:
# Produce metrics.
passenger_metrics = df.describe()

# Retain meaningful metrics by excluding categorical values.
numeric_data = ['Umur', 'Hipertensi', 'PenyakitJantung','GulaDarah','bmi','Stroke']
passenger_metrics = passenger_metrics[numeric_data]

# Revise standard deviation value in metrics to one without Bessel's Correction.
passenger_metrics.loc['std'] = df_train[numeric_data].std(ddof=0)
passenger_metrics

Unnamed: 0,Umur,Hipertensi,PenyakitJantung,GulaDarah,bmi,Stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.610434,0.296578,0.226041,45.279129,7.697265,0.215299
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [9]:
def split_by_Stroke(df, col_name):
    grpby_Stroke = df_train.groupby('Stroke')
    Stroke = grpby_Stroke[col_name].get_group(1).dropna()  # dropna() to remove NaN entries.
    nonChurn = grpby_Stroke[col_name].get_group(0).dropna()
    return Stroke, nonStroke

# Melakukan Split Data Berdasarkan Kolom PaymentMethod 

In [10]:
struk = df_train['Stroke']==1
notstruk = df_train['Stroke']==0

In [11]:
df_train[struk].groupby('PenyakitJantung')['Stroke'].count().reset_index()

Unnamed: 0,PenyakitJantung,Stroke
0,0,202
1,1,47


In [12]:
df_train[notstruk].groupby('PenyakitJantung')['Stroke'].count().reset_index()

Unnamed: 0,PenyakitJantung,Stroke
0,0,4632
1,1,229


In [13]:
df_train[struk].groupby('Hipertensi')['Stroke'].count().reset_index()

Unnamed: 0,Hipertensi,Stroke
0,0,183
1,1,66


In [14]:
df_train[notstruk].groupby('Hipertensi')['Stroke'].count().reset_index()

Unnamed: 0,Hipertensi,Stroke
0,0,4429
1,1,432


In [15]:
df_train.groupby('Stroke')['Hipertensi'].count().reset_index()

Unnamed: 0,Stroke,Hipertensi
0,0,4861
1,1,249


# Menampilkan Kolom Atribut

In [16]:
df_train.columns

Index(['JenisKelamin', 'Umur', 'Hipertensi', 'PenyakitJantung', 'Menikah',
       'Tipe_Pekerjaan', 'TempatTinggal', 'GulaDarah', 'bmi', 'Merokok',
       'Stroke'],
      dtype='object')

# Mengkategorikan DataFrame

In [17]:
df_train['JenisKelamin'] = df_train.JenisKelamin.astype('category').cat.codes
df_train['Umur'] = df_train.Umur.astype('category').cat.codes
df_train['Hipertensi'] = df_train.Hipertensi.astype('category').cat.codes
df_train['PenyakitJantung'] = df_train.PenyakitJantung.astype('category').cat.codes
df_train['Menikah'] = df_train.Menikah.astype('category').cat.codes
df_train['Tipe_Pekerjaan'] = df_train.Tipe_Pekerjaan.astype('category').cat.codes
df_train['TempatTinggal'] = df_train.TempatTinggal.astype('category').cat.codes
df_train['GulaDarah'] = df_train.GulaDarah.astype('category').cat.codes
df_train['bmi'] = df_train.bmi.astype('category').cat.codes
df_train['Merokok'] = df_train.Merokok.astype('category').cat.codes

In [18]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   JenisKelamin     5110 non-null   int8 
 1   Umur             5110 non-null   int8 
 2   Hipertensi       5110 non-null   int8 
 3   PenyakitJantung  5110 non-null   int8 
 4   Menikah          5110 non-null   int8 
 5   Tipe_Pekerjaan   5110 non-null   int8 
 6   TempatTinggal    5110 non-null   int8 
 7   GulaDarah        5110 non-null   int16
 8   bmi              5110 non-null   int16
 9   Merokok          5110 non-null   int8 
 10  Stroke           5110 non-null   int64
dtypes: int16(2), int64(1), int8(8)
memory usage: 99.9 KB


In [19]:
y = df_train.Stroke
X = df_train.drop('Stroke',axis =1)

In [20]:
y


0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: Stroke, Length: 5110, dtype: int64

In [21]:
X

Unnamed: 0,JenisKelamin,Umur,Hipertensi,PenyakitJantung,Menikah,Tipe_Pekerjaan,TempatTinggal,GulaDarah,bmi,Merokok
0,1,88,0,1,1,2,1,3850,240,1
1,0,82,0,0,1,3,0,3588,162,2
2,1,101,0,1,1,2,0,2483,199,2
3,0,70,0,0,1,2,1,3385,218,3
4,0,100,1,0,1,3,0,3394,113,2
...,...,...,...,...,...,...,...,...,...,...
5105,0,101,1,0,1,2,1,1360,162,2
5106,0,102,0,0,1,3,1,3030,274,2
5107,0,56,0,0,1,3,0,1314,180,2
5108,1,72,0,0,1,2,0,3363,129,1


# Implementasi Algoritme SVM dan Naive Bayes 

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dt = DecisionTreeClassifier()
svm = SVC()

In [27]:
print('Naive Bayes')
y_pred = cross_val_predict(nb, X, y, cv=10)
conf_mat = confusion_matrix(y_pred, y)
print(conf_mat)
print(classification_report(y, y_pred, digits = 4))


print('DT')
y_pred = cross_val_predict(dt, X, y, cv=10)
conf_mat = confusion_matrix(y_pred, y)
print(conf_mat)
print(classification_report(y, y_pred, digits = 4))

print('KNN')
y_pred = cross_val_predict(knn, X, y, cv=10)
conf_mat = confusion_matrix(y_pred, y)
print(conf_mat)
print(classification_report(y, y_pred, digits = 4))

Naive Bayes
[[4375  160]
 [ 486   89]]
              precision    recall  f1-score   support

           0     0.9647    0.9000    0.9312      4861
           1     0.1548    0.3574    0.2160       249

    accuracy                         0.8736      5110
   macro avg     0.5598    0.6287    0.5736      5110
weighted avg     0.9253    0.8736    0.8964      5110

DT
[[4620  205]
 [ 241   44]]
              precision    recall  f1-score   support

           0     0.9575    0.9504    0.9540      4861
           1     0.1544    0.1767    0.1648       249

    accuracy                         0.9127      5110
   macro avg     0.5559    0.5636    0.5594      5110
weighted avg     0.9184    0.9127    0.9155      5110

KNN
[[4838  248]
 [  23    1]]
              precision    recall  f1-score   support

           0     0.9512    0.9953    0.9728      4861
           1     0.0417    0.0040    0.0073       249

    accuracy                         0.9470      5110
   macro avg     0.4965    0

Dari hasil tersebut kita bisa melihat untuk bagian recall maka Algoritma Naive Bayes lebih baik
begitu pula dalam hal Akurasinya

# Contoh Untuk Prediksi

In [26]:
nb.predict(np.array([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]).reshape(1, -1))

NotFittedError: This GaussianNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Contoh Hasil Data : 

In [None]:
D = df.groupby('PaymentMethod')['Churn'].count().reset_index()
D

In [None]:
A = df[df['Churn']=='Yes'].groupby('PaymentMethod')['Churn'].count().reset_index()
A

In [None]:
B = df[df['Churn']=='No'].groupby('PaymentMethod')['Churn'].count().reset_index()
B

In [None]:
C = A['Churn']/D['Churn']
C

dari data diatas persentase untuk pelanggan yang menggunakan electronic check lebih banyak kemungkinan pergi