# Sintesis Data Menggunakan CTGAN

## Install library yang dibutuhkan

In [12]:
# The ctgan library is used to synthesize data.
# NOTE(review): the version is not pinned, so a fresh install may pull a different
# release than the one that produced the outputs below — consider pinning it.
%pip install ctgan



In [13]:
import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder

## Memeriksa data awal

In [14]:
# Load the heart-disease CSV into a DataFrame, then preview its first 5 rows.
data = pd.read_csv('/content/Heart_Disease_Prediction.csv')
data.head(n=5)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,52,1,1,125,212,0,1,168,0,1.0,2,2,3,Absence
1,53,1,1,140,203,1,0,155,1,3.1,0,0,3,Absence
2,70,1,1,145,174,0,1,125,1,2.6,0,0,3,Absence
3,61,1,1,148,203,0,1,161,0,0.0,2,1,3,Absence
4,62,0,1,138,294,1,1,106,0,1.9,1,3,2,Absence


In [15]:
# Print a summary of the loaded data: row count, columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      1025 non-null   int64  
 1   Sex                      1025 non-null   int64  
 2   Chest pain type          1025 non-null   int64  
 3   BP                       1025 non-null   int64  
 4   Cholesterol              1025 non-null   int64  
 5   FBS over 120             1025 non-null   int64  
 6   EKG results              1025 non-null   int64  
 7   Max HR                   1025 non-null   int64  
 8   Exercise angina          1025 non-null   int64  
 9   ST depression            1025 non-null   float64
 10  Slope of ST              1025 non-null   int64  
 11  Number of vessels fluro  1025 non-null   int64  
 12  Thallium                 1025 non-null   int64  
 13  Heart Disease            1025 non-null   object 
dtypes: float64(1), int64(12)

## Melakukan encode pada kolom `Heart Disease` karena bertipe String

In [16]:
# Encode the string-valued 'Heart Disease' column as integer codes.
# LabelEncoder is imported in the notebook's import cell so that all
# dependencies are declared in one place.
encoder = LabelEncoder()

# Build the encoded frame as a new object; the original `data` frame is left
# untouched because it is reused later when combining with the synthetic rows.
data_copy = data.assign(
    **{'Heart Disease': encoder.fit_transform(data['Heart Disease'])}
)

# Preview the first 5 rows of the encoded data.
data_copy.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,52,1,1,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,1,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,1,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,1,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,1,138,294,1,1,106,0,1.9,1,3,2,0


In [17]:
# Print a summary of the encoded data to confirm 'Heart Disease' is now numeric.
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      1025 non-null   int64  
 1   Sex                      1025 non-null   int64  
 2   Chest pain type          1025 non-null   int64  
 3   BP                       1025 non-null   int64  
 4   Cholesterol              1025 non-null   int64  
 5   FBS over 120             1025 non-null   int64  
 6   EKG results              1025 non-null   int64  
 7   Max HR                   1025 non-null   int64  
 8   Exercise angina          1025 non-null   int64  
 9   ST depression            1025 non-null   float64
 10  Slope of ST              1025 non-null   int64  
 11  Number of vessels fluro  1025 non-null   int64  
 12  Thallium                 1025 non-null   int64  
 13  Heart Disease            1025 non-null   int64  
dtypes: float64(1), int64(13)

## Menyiapkan dan menjalankan model

In [18]:
# Columns that hold category codes rather than measurements. CTGAN models every
# column as continuous unless it is listed in `discrete_columns`, which would let
# the generator treat codes such as 'Sex' or the target as real-valued quantities.
# NOTE(review): this list is inferred from the column names and the small integer
# values visible in the preview — confirm against the dataset's data dictionary.
discrete_columns = [
    'Sex',
    'Chest pain type',
    'FBS over 120',
    'EKG results',
    'Exercise angina',
    'Slope of ST',
    'Number of vessels fluro',
    'Thallium',
    'Heart Disease',
]

# Build a CTGAN model with default hyperparameters and train it on the encoded data.
# NOTE(review): no random seed is set, so training and sampling differ between runs.
model = CTGAN()
model.fit(data_copy, discrete_columns)

In [19]:
# Draw 2000 synthetic rows from the trained model.
synthetic_data = model.sample(n=2000)

# Preview the first 5 generated rows.
synthetic_data.head(n=5)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,49,1,3,117,154,0,0,138,1,1.790572,1,1,2,1
1,40,1,1,120,230,0,1,106,1,2.55487,1,1,3,0
2,39,0,3,127,233,0,0,161,1,3.707401,2,1,3,0
3,48,1,1,130,315,0,0,191,1,-0.362492,1,0,2,0
4,60,0,2,126,165,0,0,126,1,0.498273,1,0,2,1


## Memproses data agar sesuai dengan data original

In [20]:
# Integer code -> original class label for the 'Heart Disease' column.
label_names = {1: 'Presence', 0: 'Absence'}

# Post-process the generated rows so they match the format of the original data.
# The steps are chained on a new frame (via `.assign`) instead of assigning columns
# onto a boolean-filtered slice, which avoids pandas' SettingWithCopyWarning.
synthetic_data = (
    synthetic_data
    # Drop generated rows whose 'ST depression' is negative.
    .loc[lambda df: df['ST depression'] >= 0]
    .assign(**{
        # Keep one decimal place, as in the original column.
        'ST depression': lambda df: df['ST depression'].round(1),
        # Decode the target; values that are not a known code (e.g. labels already
        # decoded by a previous run of this cell) are kept as-is instead of
        # silently becoming NaN.
        'Heart Disease': lambda df: df['Heart Disease'].map(
            lambda value: label_names.get(value, value)
        ),
    })
    # Renumber rows from 0 and discard the old index left over from filtering.
    .reset_index(drop=True)
)

# Preview the first 5 rows of the processed data.
synthetic_data.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,49,1,3,117,154,0,0,138,1,1.8,1,1,2,Presence
1,40,1,1,120,230,0,1,106,1,2.6,1,1,3,Absence
2,39,0,3,127,233,0,0,161,1,3.7,2,1,3,Absence
3,60,0,2,126,165,0,0,126,1,0.5,1,0,2,Presence
4,54,1,1,129,303,1,1,169,1,0.6,1,0,2,Absence


In [21]:
# Print a summary of the processed synthetic data: row count, columns and dtypes.
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1550 entries, 0 to 1549
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      1550 non-null   int64  
 1   Sex                      1550 non-null   int64  
 2   Chest pain type          1550 non-null   int64  
 3   BP                       1550 non-null   int64  
 4   Cholesterol              1550 non-null   int64  
 5   FBS over 120             1550 non-null   int64  
 6   EKG results              1550 non-null   int64  
 7   Max HR                   1550 non-null   int64  
 8   Exercise angina          1550 non-null   int64  
 9   ST depression            1550 non-null   float64
 10  Slope of ST              1550 non-null   int64  
 11  Number of vessels fluro  1550 non-null   int64  
 12  Thallium                 1550 non-null   int64  
 13  Heart Disease            1550 non-null   object 
dtypes: float64(1), int64(12)

## Menggabungkan dan menyimpan data sintetis dengan data asli

In [22]:
# Stack the original rows and the synthetic rows into one frame with a fresh index.
data_combined = pd.concat([data, synthetic_data], ignore_index=True)

# Write the synthetic-only and the combined datasets to CSV (no index column).
synthetic_data.to_csv('Heart_Disease_Prediction_Synthetic.csv', index=False)
data_combined.to_csv('Heart_Disease_Prediction_Combined.csv', index=False)

# Row counts of the original, synthetic and combined datasets, for comparison.
data_length, data_synthetic_length, data_combined_length = (
    len(frame) for frame in (data, synthetic_data, data_combined)
)
print("Total baris data asli:", data_length)
print("Total baris data sintetis:", data_synthetic_length)
print("Total baris gabungan data:", data_combined_length)

Total baris data asli: 1025
Total baris data sintetis: 1550
Total baris gabungan data: 2575
