In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [53]:
dataset = pd.read_csv('Breast_Cancer.csv')

In [54]:
dataset.head(5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3,1,50,Alive


In [55]:
dataset.dtypes

Age                         int64
Race                       object
Marital Status             object
T Stage                    object
N Stage                    object
6th Stage                  object
differentiate              object
Grade                      object
A Stage                    object
Tumor Size                float64
Estrogen Status            object
Progesterone Status        object
Regional Node Examined      int64
Reginol Node Positive       int64
Survival Months             int64
Status                     object
dtype: object

### Bagi dataset menjadi training set dan testing set dengan proporsi 70:30

In [56]:
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [58]:
print("Dimensi X_train :", X_train.shape)
print("Dimensi X_test :", X_test.shape)
print("Dimensi y_train :", y_train.shape)
print("Dimensi y_test :", y_test.shape)

Dimensi X_train : (2816, 15)
Dimensi X_test : (1208, 15)
Dimensi y_train : (2816,)
Dimensi y_test : (1208,)


### Lakukan normalisasi data pada salah satu attribute menggunakan Min Max scaler 

In [59]:
from sklearn.preprocessing import MinMaxScaler

In [60]:
min_max_scaler = MinMaxScaler()

In [61]:
x_scaled = min_max_scaler.fit_transform(dataset[['Age','Reginol Node Positive','Survival Months']])

In [62]:
data_normalization = pd.DataFrame(x_scaled)

In [63]:
data_normalization.head(5)

Unnamed: 0,0,1,2
0,0.974359,0.0,0.556604
1,0.512821,0.088889,0.575472
2,0.717949,0.133333,0.698113
3,0.717949,0.0,0.783019
4,0.435897,0.0,0.462264


### Lakukan standarisasi pada dataset

In [64]:
print("Nilai Sebelum Standarisasi")
print("Standard Deviasi")
np.std(dataset)

Nilai Sebelum Standarisasi
Standard Deviasi


Age                        8.961394
Tumor Size                21.126081
Regional Node Examined     8.132179
Reginol Node Positive      5.110808
Survival Months           22.918483
dtype: float64

In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
ss = StandardScaler()

In [67]:
x_skala = ss.fit_transform(dataset[['Age','Reginol Node Positive','Survival Months']])

In [68]:
print("Nilai Setelah Standarisasi")
print("Standard Deviasi")
np.std(x_skala)

Nilai Setelah Standarisasi
Standard Deviasi


1.0

In [69]:
data_standarisasi = pd.DataFrame(x_skala)

In [70]:
data_standarisasi.head(5)

Unnamed: 0,0,1,2
0,1.565446,-0.618548,-0.493288
1,-0.44317,0.164107,-0.406022
2,0.449548,0.555434,0.161205
3,0.449548,-0.618548,0.553902
4,-0.777939,-0.618548,-0.929617


### Lakukan Data cleaning pada data dengan nilai null

### 1. Mengecek Data Null

In [71]:
dataset.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                2
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

### Ganti nilai null sesuai ketentuan

In [72]:
from sklearn.impute import SimpleImputer

In [73]:
imputer = SimpleImputer(strategy = "mean")

In [74]:
dataset["Tumor Size"] = imputer.fit_transform(dataset[["Tumor Size"]])

In [75]:
dataset.fillna(dataset.mean())

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3,1,50,Alive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4019,62,Other,Married,T1,N1,IIA,Moderately differentiated,2,Regional,9.0,Positive,Positive,1,1,49,Alive
4020,56,White,Divorced,T2,N2,IIIA,Moderately differentiated,2,Regional,46.0,Positive,Positive,14,8,69,Alive
4021,68,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,22.0,Positive,Negative,11,3,69,Alive
4022,58,Black,Divorced,T2,N1,IIB,Moderately differentiated,2,Regional,44.0,Positive,Positive,11,1,72,Alive


In [76]:
dataset.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

### Lakukan Data cleaning pada data dengan nilai duplikat

### 1. Mengecek Data Duplikat

In [77]:
dataset.duplicated().sum()

1

In [78]:
dataset[dataset.duplicated()]

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
436,60,White,Widowed,T4,N3,IIIC,Poorly differentiated,3,Regional,61.0,Positive,Positive,25,14,86,Alive


In [79]:
dataset.drop_duplicates(inplace = True)

In [80]:
dataset.duplicated().sum()

0

### Ganti tipe data salah satu attribute angka

In [81]:
dataset["Age"]= dataset["Age"].astype(float)

In [82]:
dataset.dtypes

Age                       float64
Race                       object
Marital Status             object
T Stage                    object
N Stage                    object
6th Stage                  object
differentiate              object
Grade                      object
A Stage                    object
Tumor Size                float64
Estrogen Status            object
Progesterone Status        object
Regional Node Examined      int64
Reginol Node Positive       int64
Survival Months             int64
Status                     object
dtype: object

### Lakukan one hot encoding pada dataset 

In [83]:
from sklearn.preprocessing import OneHotEncoder

In [84]:
encoder = OneHotEncoder(sparse=False)

In [85]:
StatusEnc = encoder.fit_transform(dataset[["Status"]])

In [86]:
Status = pd.DataFrame(StatusEnc)

In [87]:
dataset = dataset.join(Status)

In [88]:
dataset.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status,0,1
0,68.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24,1,60,Alive,1.0,0.0
1,50.0,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14,5,62,Alive,1.0,0.0
2,58.0,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14,7,75,Alive,1.0,0.0
3,58.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2,1,84,Alive,1.0,0.0
4,47.0,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3,1,50,Alive,1.0,0.0
