## Import Library dan Dataset

In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data = pd.read_csv('new_train.csv')

### membagi dataset menjadi training set dan testing set dengan proporsi 70:30

In [5]:
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

print ("______ TRAIN TEST SPLIT ______")
print ("______________________________")
print ("| Dimensi x_train |", x_train.shape, "|")
print ("| Dimensi x_test  |", x_test.shape, "|")
print ("| Dimensi y_train |", y_train.shape, "  |")
print ("| Dimensi y_test  |", y_test.shape, "  |")
print ("______________________________")

______ TRAIN TEST SPLIT ______
______________________________
| Dimensi x_train | (23065, 15) |
| Dimensi x_test  | (9885, 15) |
| Dimensi y_train | (23065,)   |
| Dimensi y_test  | (9885,)   |
______________________________


### Melakukan normalisasi data pada salah satu attribute menggunakan Min Max scaler 

In [6]:
from sklearn.preprocessing import MinMaxScaler

data_copy_min_max_scaler = data.copy()
attribute_to_normalize = 'duration'
scaler = MinMaxScaler()
data_copy_min_max_scaler[[attribute_to_normalize]] = scaler.fit_transform(data_copy_min_max_scaler[[attribute_to_normalize]])
print(data_copy_min_max_scaler)

       age           job   marital            education  default housing loan  \
0       49   blue-collar   married             basic.9y  unknown      no   no   
1       37  entrepreneur   married    university.degree       no      no   no   
2       78       retired   married             basic.4y       no      no   no   
3       36        admin.   married    university.degree       no     yes   no   
4       59       retired  divorced    university.degree       no      no   no   
...    ...           ...       ...                  ...      ...     ...  ...   
32945   28      services    single          high.school       no     yes   no   
32946   52    technician   married  professional.course       no     yes   no   
32947   54        admin.   married             basic.9y       no      no  yes   
32948   29        admin.   married    university.degree       no      no   no   
32949   35        admin.   married    university.degree       no      no  yes   

         contact month day_

### Melakukan standarisasi pada dataset

In [7]:
from sklearn.preprocessing import StandardScaler
data_copy_standard = data.copy()
standard_scaler = StandardScaler()
attribute_to_standardize = 'age'
data_copy_standard[[attribute_to_standardize]] = standard_scaler.fit_transform(data_copy_standard[[attribute_to_standardize]])
print(data_copy_standard)

            age           job   marital            education  default housing  \
0      0.863739   blue-collar   married             basic.9y  unknown      no   
1     -0.289722  entrepreneur   married    university.degree       no      no   
2      3.651268       retired   married             basic.4y       no      no   
3     -0.385843        admin.   married    university.degree       no     yes   
4      1.824956       retired  divorced    university.degree       no      no   
...         ...           ...       ...                  ...      ...     ...   
32945 -1.154817      services    single          high.school       no     yes   
32946  1.152104    technician   married  professional.course       no     yes   
32947  1.344347        admin.   married             basic.9y       no      no   
32948 -1.058695        admin.   married    university.degree       no      no   
32949 -0.481965        admin.   married    university.degree       no      no   

      loan    contact month

### Lakukan Data cleaning pada data dengan nilai null

In [36]:
# cek data null
data.isna().sum()

age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64

In [37]:
# disini saya akan membuat nilai null/nan pada kolom campaign
print("Nilai dalam kolom 'campaign':")
print(data['campaign'].unique())

Nilai dalam kolom 'campaign':
[ 4  2  1  5  9  3  7  6 13  8 12 10 19 11 31 17 16 29 43 20 14 21 35 15
 33 28 22 25 18 23 27 26 24 34 32 37 30 42 40 56]


In [38]:
#mengubah nilai null/nan pada kolom campaign
data.loc[data['campaign'] < 5, 'campaign'] = float('nan')
print("Nilai dalam kolom 'campaign' setelah saya ubah:")
print(data['campaign'].unique())

Nilai dalam kolom 'campaign' setelah saya ubah:
[nan  5.  9.  7.  6. 13.  8. 12. 10. 19. 11. 31. 17. 16. 29. 43. 20. 14.
 21. 35. 15. 33. 28. 22. 25. 18. 23. 27. 26. 24. 34. 32. 37. 30. 42. 40.
 56.]


In [39]:
# cek nilai null/nan pada data 
data.isna().sum()

age                0
job                0
marital            0
education          0
default            0
housing            0
loan               0
contact            0
month              0
day_of_week        0
duration           0
campaign       29006
pdays              0
previous           0
poutcome           0
y                  0
dtype: int64

In [41]:
# lakukan proses cleaning pada kolom yang ada nilai null
strategies = {
    'campaign': data['campaign'].mean()
}

data_cleaned = data.fillna(value=strategies)
print("\nDataset setelah data cleaning")
print("Jumlah Record Yang memiliki nilai null: ")
print(data_cleaned.isnull().sum())


Dataset setelah data cleaning
Jumlah Record Yang memiliki nilai null: 
age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64


### Lakukan Data cleaning pada data dengan nilai duplikat.

In [44]:
#Cek data duplikat
print("Cek data yang memiliki nilai duplicated")
data.duplicated().sum()

Cek data yang memiliki nilai duplicated


12

In [45]:
# lakukan data cleaning pada nilai duplicated
data_cleaned.drop_duplicates(inplace=True)
data_cleaned.duplicated().sum()
print("\n Cek Data Setelah menggunakan fungsi drop_duplicates()")
print("Jumlah Nilai Duplikat:", data_cleaned.duplicated().sum())


 Cek Data Setelah menggunakan fungsi drop_duplicates()
Jumlah Nilai Duplikat: 0


### Ganti tipe data salah satu attribute angka

In [96]:
print ("Tipe Data Pada Kolom age Sebelum di Ganti : ")
data['age'].dtypes

Tipe Data Pada Kolom age Sebelum di Ganti : 


dtype('int64')

In [97]:
data['age'] = data['age'].astype(float)
print ("Tipe Data Pada Kolom age Sebelum di Ganti : ")
data['age'].dtypes

Tipe Data Pada Kolom age Sebelum di Ganti : 


dtype('float64')

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          32950 non-null  float64
 1   job          32950 non-null  object 
 2   marital      32950 non-null  object 
 3   education    32950 non-null  object 
 4   default      32950 non-null  object 
 5   housing      32950 non-null  object 
 6   loan         32950 non-null  object 
 7   contact      32950 non-null  object 
 8   month        32950 non-null  object 
 9   day_of_week  32950 non-null  object 
 10  duration     32950 non-null  int64  
 11  campaign     3944 non-null   float64
 12  pdays        32950 non-null  int64  
 13  previous     32950 non-null  int64  
 14  poutcome     32950 non-null  object 
 15  y            32950 non-null  object 
dtypes: float64(2), int64(3), object(11)
memory usage: 4.0+ MB


### Lakukan one hot encoding pada dataset

In [105]:
data[['marital','job','housing']]

Unnamed: 0,marital,job,housing
0,married,blue-collar,no
1,married,entrepreneur,no
2,married,retired,no
3,married,admin.,yes
4,divorced,retired,no
...,...,...,...
32945,single,services,yes
32946,married,technician,yes
32947,married,admin.,no
32948,married,admin.,no


In [101]:
# import fungsi One Hot Encoder dari modul sklearn
from sklearn.preprocessing import OneHotEncoder

In [114]:
ohe = OneHotEncoder(sparse_output=False)
oh_encoded = ohe.fit_transform(data[['marital', 'housing']])

In [115]:
encoded = pd.DataFrame(oh_encoded,
                      columns=['divorced','married','single','unknown','no','yes','unknown'])
encoded

Unnamed: 0,divorced,married,single,unknown,no,yes,unknown.1
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
32945,0.0,0.0,1.0,0.0,0.0,0.0,1.0
32946,0.0,1.0,0.0,0.0,0.0,0.0,1.0
32947,0.0,1.0,0.0,0.0,1.0,0.0,0.0
32948,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [119]:
#jika dilihat di head maka kolom akan bertambah
data = pd.concat([data, encoded], axis=1)
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,no,yes,unknown,divorced,married,single,unknown.1,no.1,yes.1,unknown.2
0,49.0,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,37.0,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,78.0,retired,married,basic.4y,no,no,no,cellular,jul,mon,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,36.0,admin.,married,university.degree,no,yes,no,telephone,may,mon,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,59.0,retired,divorced,university.degree,no,no,no,cellular,jun,tue,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
