# Regresi Menggunakan ANN dan SVR

## 1. Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Import Data

In [2]:
# Import data from CSV file
dataset = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vSsYQKtYLv-r7FFfqneDqIjeBpws2ftD1F8eEXKY-p-tOGdI_vVyn2Q-wnMlbXdYerRX1Uhao555u_g/pub?gid=349111382&single=true&output=csv')

### >> **Menampilkan 5 data teratas**

In [3]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


#### >> **Menampilkan informasi dataset**

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


### >> **Membuat Dataframe Baru**
Pada dataframe baru hanya terdapat kolom yang akan digunakan untuk pemodelan dimana dalam hal ini yang akan digunakan yaitu kolom age, avg_glucose_level, dan bmi

In [5]:
df = dataset.drop(columns=['id','gender','hypertension','heart_disease','ever_married', 'work_type', 'Residence_type', 'smoking_status','stroke'])

## 3. Data Preprocessing

### >> **Missing Value**

*Jumlah Missing Value*

In [6]:
df.isnull().sum()

age                    0
avg_glucose_level      0
bmi                  201
dtype: int64

Terdapat 201 baris yang kosong pada kolom bmi

*Handling Missing Value*

Baris yang kosong pada kolom bmi tersebut akan diisi dengan nilai mean dari kolom bmi.

In [7]:
# Handling missing value
df['bmi'].fillna(dataset['bmi'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(dataset['bmi'].mean(), inplace=True)


*Missing Value Setelah Handling*

In [8]:
df.isnull().sum()

age                  0
avg_glucose_level    0
bmi                  0
dtype: int64

Setelah melakukan handling sudah tidak terdapat missing value pada kolom bmi

### >> **Data Duplicated**

*Menampilkan Jumlah Data Duplicated*

In [9]:
# Melihat jumlah data duplicated
print(df.duplicated().sum())

0


Pada dataset tersebut tidak terdapat duplicated data

### >> **Menampilkan Statistical Summary dari Data**

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5110.0,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,5110.0,28.893237,7.698018,10.3,23.8,28.4,32.8,97.6


Berdasarkan tabel diatas dapat dilihat bahwa data pada dataset tersebut belum berada pada range yang sama. Hal ini perlu diatasi agar model yang dihasilkan memberikan nilai evaluasi terbaik.

### >> **Feature Scaling**
Feature Scaling akan dilakukan menggunakan metode min max scaler dimana nilai min max data akan berada dalam range yang sama.

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fitting scaler pada data dan kemudian melakukan transformasi
df= pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [12]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,5110.0,0.526692,0.276033,0.0,0.304199,0.54834,0.743652,1.0
avg_glucose_level,5110.0,0.235563,0.209046,0.0,0.102137,0.169721,0.272228,1.0
bmi,5110.0,0.212981,0.088179,0.0,0.154639,0.207331,0.257732,1.0


### >> **Memilih Variabel X dan Y**
Variabel independen X akan diisi dengan kolom atribut/fitur yaitu kolom age dan avg_glucose_level sedangkan variabel dependen y akan diisi dengan kolom bmi

In [13]:
# Select Attribute X and Y
X = df.drop(columns=['bmi'])
y = df['bmi']

### >> **Splitting Data**

In [14]:
from sklearn.model_selection import train_test_split
# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 4. Membangun Model ANN

*Inisialisasi Model ANN yang akan dibangun*

In [15]:
# Mengimpor library Keras dan turunannya
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Inisialisasi ANN
model = Sequential()
# Menambah input layer dan hidden layer pertama
model.add(Dense(units = 2, kernel_initializer = 'uniform', activation = 'relu', input_dim = 2))
# Menambah hidden layer kedua
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'relu'))
# Menambah output layer
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

*Training Model ANN*

In [16]:
# Meenjalankan ANN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['mean_squared_error'])
# Fitting ANN ke training set
model.fit(X_train, y_train, batch_size = 10, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x25981bd99c0>

*Testing Model ANN*

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Langkah 5: Evaluasi kinerja model
y_pred = model.predict(X_test)
# Menghitung Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

# Menghitung Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

# Menghitung Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 0.06683794691470422
Mean Squared Error: 0.007782708362124884
Root Mean Squared Error: 0.08821965972573735


## 5. Model SVR

*Training Model SVR*

In [18]:
# Fitting Regression modelto the dataset
from sklearn.svm import SVR
regressor_linear = SVR(kernel='linear') # add this parametre kernel='rbf'
regressor_non_linear = SVR(kernel='rbf') # add this parametre kernel='rbf'
regressor_linear.fit(X_train, y_train)
regressor_non_linear.fit(X_train, y_train)


*Hasil Metric Evaluation Model SVR*

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Melakukan prediksi langsung pada nilai fitur yang diberikan
y_pred_linear = regressor_linear.predict(X_test)
y_pred_non_linear = regressor_non_linear.predict(X_test)
# Menghitung Mean Absolute Error (MAE)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
print('Mean Absolute Error Linear:', mae_linear)

# Menghitung Mean Squared Error (MSE)
mse_linear = mean_squared_error(y_test, y_pred_linear)
print('Mean Squared Error Linear:', mse_linear)

# Menghitung Root Mean Squared Error (RMSE)
rmse_linear = np.sqrt(mse_linear)
print('Root Mean Squared Error Linear:', rmse_linear)

# Menghitung Mean Absolute Error (MAE)
mae_non_linear = mean_absolute_error(y_test, y_pred_non_linear)
print('Mean Absolute Error Non Linear:', mae_non_linear)

# Menghitung Mean Squared Error (MSE)
mse_non_linear = mean_squared_error(y_test, y_pred_non_linear)
print('Mean Squared Error Non Linear:', mse_non_linear)

# Menghitung Root Mean Squared Error (RMSE)
rmse_non_linear = np.sqrt(mse_non_linear)
print('Root Mean Squared Error Non Linear:', rmse_non_linear)

Mean Absolute Error Linear: 0.06539853569519931
Mean Squared Error Linear: 0.0069007297115937696
Root Mean Squared Error Linear: 0.0830706308606945
Mean Absolute Error Non Linear: 0.05902709217058484
Mean Squared Error Non Linear: 0.005730613337438222
Root Mean Squared Error Non Linear: 0.07570081464184003
