1. Tentukan Library yang akan digunakan

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

2. Load Dataset

In [13]:
stunting_dataset = pd.read_csv('Stunting_Dataset.csv')

In [14]:
stunting_dataset.head()

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,Male,17,3.0,49,10.0,72.2,No,No
1,Female,11,2.9,49,2.9,65.0,No,Yes
2,Male,16,2.9,49,8.5,72.2,No,Yes
3,Male,31,2.8,49,6.4,63.0,No,Yes
4,Male,15,3.1,49,10.5,49.0,No,Yes


In [15]:
from IPython.display import display
display(stunting_dataset)

Unnamed: 0,Gender,Age,Birth Weight,Birth Length,Body Weight,Body Length,Breastfeeding,Stunting
0,Male,17,3.0,49,10.0,72.2,No,No
1,Female,11,2.9,49,2.9,65.0,No,Yes
2,Male,16,2.9,49,8.5,72.2,No,Yes
3,Male,31,2.8,49,6.4,63.0,No,Yes
4,Male,15,3.1,49,10.5,49.0,No,Yes
...,...,...,...,...,...,...,...,...
9995,Male,15,3.0,49,9.0,63.0,No,Yes
9996,Female,12,2.8,48,7.7,63.0,No,No
9997,Male,16,2.8,49,7.7,49.0,No,No
9998,Male,14,2.8,49,10.0,69.0,No,Yes


In [16]:
stunting_dataset.shape 

(10000, 8)

In [17]:
stunting_dataset.isnull().sum()

Gender           0
Age              0
Birth Weight     0
Birth Length     0
Body Weight      0
Body Length      0
Breastfeeding    0
Stunting         0
dtype: int64

In [18]:
stunting_dataset['Gender'] = stunting_dataset['Gender'].replace({'Female':1,'Male':0})
stunting_dataset['Breastfeeding'] = stunting_dataset['Breastfeeding'].replace({'No':0,'Yes':1})
stunting_dataset['Stunting'] = stunting_dataset['Stunting'].replace({'No':0,'Yes':1})

In [19]:
# Step 2: Tentukan fitur yang menjadi dasar pencocokan
feature_cols = ["Gender", "Age", "Birth Weight", "Birth Length", "Body Weight", "Body Length", "Breastfeeding"]

# Step 3: Kelompokkan berdasarkan fitur dan ambil mayoritas label
df_cleaned = (
    stunting_dataset.groupby(feature_cols, as_index=False)
    .agg({"Stunting": lambda x: x.value_counts().idxmax()})  # ambil label mayoritas
)

# Step 4: Bandingkan sebelum dan sesudah
print("Jumlah data sebelum pembersihan:", len(stunting_dataset))
print("Jumlah data sesudah pembersihan:", len(df_cleaned))
print("Baris yang dibuang karena konflik label:", len(stunting_dataset) - len(df_cleaned))

Jumlah data sebelum pembersihan: 10000
Jumlah data sesudah pembersihan: 7205
Baris yang dibuang karena konflik label: 2795


In [20]:
df_cleaned['Stunting'].value_counts() 

Stunting
1    5976
0    1229
Name: count, dtype: int64

In [21]:
# memisahkan data dan label
X = df_cleaned.drop (columns='Stunting', axis=1)
Y = df_cleaned['Stunting'] 

In [22]:
print(X) 

      Gender  Age  Birth Weight  Birth Length  Body Weight  Body Length  \
0          0    6           2.0            49          7.1         71.0   
1          0    6           2.0            49          7.7         73.5   
2          0    6           2.0            49          8.4         65.0   
3          0    6           2.0            49          8.5         63.0   
4          0    6           2.0            49         10.5         92.7   
...      ...  ...           ...           ...          ...          ...   
7200       1   48           3.0            49         10.5         76.0   
7201       1   48           3.0            49         10.5         80.0   
7202       1   48           3.1            49          6.2         73.5   
7203       1   48           3.1            49          7.1         65.0   
7204       1   48           3.1            49          8.0         80.0   

      Breastfeeding  
0                 0  
1                 0  
2                 0  
3          

In [23]:
print(Y)

0       0
1       0
2       1
3       1
4       1
       ..
7200    1
7201    1
7202    1
7203    1
7204    1
Name: Stunting, Length: 7205, dtype: int64


3. Standarisasi Data

In [24]:
scaler = StandardScaler()

In [25]:
scaler.fit(X)

In [26]:
standarized_data = scaler.transform(X) 

In [27]:
print(standarized_data)

[[-0.8084929  -1.03710667 -2.53785501 ... -0.29481492  0.18917538
   0.        ]
 [-0.8084929  -1.03710667 -2.53785501 ...  0.0420563   0.45167202
   0.        ]
 [-0.8084929  -1.03710667 -2.53785501 ...  0.43507274 -0.44081657
   0.        ]
 ...
 [ 1.23686924  3.7780712   1.12385442 ... -0.80012177  0.45167202
   0.        ]
 [ 1.23686924  3.7780712   1.12385442 ... -0.29481492 -0.44081657
   0.        ]
 [ 1.23686924  3.7780712   1.12385442 ...  0.21049192  1.1341633
   0.        ]]


In [28]:
X = standarized_data
Y = df_cleaned['Stunting'] 

In [29]:
print(X)
print(Y)

[[-0.8084929  -1.03710667 -2.53785501 ... -0.29481492  0.18917538
   0.        ]
 [-0.8084929  -1.03710667 -2.53785501 ...  0.0420563   0.45167202
   0.        ]
 [-0.8084929  -1.03710667 -2.53785501 ...  0.43507274 -0.44081657
   0.        ]
 ...
 [ 1.23686924  3.7780712   1.12385442 ... -0.80012177  0.45167202
   0.        ]
 [ 1.23686924  3.7780712   1.12385442 ... -0.29481492 -0.44081657
   0.        ]
 [ 1.23686924  3.7780712   1.12385442 ...  0.21049192  1.1341633
   0.        ]]
0       0
1       0
2       1
3       1
4       1
       ..
7200    1
7201    1
7202    1
7203    1
7204    1
Name: Stunting, Length: 7205, dtype: int64


4. Memisahkan Data Training dan Data Testing

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.3, stratify=Y, random_state=2)

In [31]:
print(X.shape, X_train.shape, X_test.shape)

(7205, 7) (5043, 7) (2162, 7)


5. Membuat data latih menggunakan algoritma Random Forest


In [32]:
RF_classifier = RandomForestClassifier(n_estimators=100,random_state=42)



In [33]:
RF_classifier.fit(X_train,Y_train)

6. Membuat model evaluasi untuk mengukur tingkat akurasi

In [34]:
rf_predict = RF_classifier.predict(X_train)
training_data_accuracy = accuracy_score(rf_predict,Y_train)


In [35]:
print('Akurasi data training adalah =', training_data_accuracy)

Akurasi data training adalah = 1.0


In [36]:
rf_predict = RF_classifier.predict(X_test)
test_data_accuracy = accuracy_score(rf_predict,Y_test)

In [37]:
print('Akurasi data testing adalah =', test_data_accuracy)

Akurasi data testing adalah = 0.8464384828862165


7. Membuat model prediksi

In [38]:
input_data = (0,6,2.0,49,7.1,71.0,0)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = RF_classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Anak tidak berisiko stunting')
else :
    print('Anak berisiko stunting') 

[[-0.8084929  -1.03710667 -2.53785501 -0.18229332 -0.29481492  0.18917538
   0.        ]]
[0]
Anak tidak berisiko stunting




8. Simpan model

In [153]:
import pickle

In [154]:

pickle.dump(RF_classifier, open('stunting.sav', 'wb'))
pickle.dump(scaler, open('scaler.sav', 'wb'))