# Tugas

## Tugas 1 

Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [2]:
import pandas as pd

df = pd.read_csv('data/mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
X = df.drop(['class'],axis=1)
y = df['class']

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)
X = X.apply(le.fit_transform)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=42)


### Decision Tree

In [7]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test
y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


### Decision Tree - Hyperparameter Tuning

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(estimator=dt,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)

grid_dt.fit(X_train, y_train)

best_hyperparams = grid_dt.best_params_
print('Best hyperparameters:\n', best_hyperparams)

best_CV_score = grid_dt.best_score_
print('Best CV accuracy'.format(best_CV_score))

best_model = grid_dt.best_estimator_

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

print('Test set accuracy of best model: {:.3f}'.format(test_acc))




Best hyperparameters:
 {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Best CV accuracy
Test set accuracy of best model: 1.000


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Sesuaikan rf ke set training
rf.fit(X_train, y_train)

# Memprediksi label set test
y_pred_rf = rf.predict(X_test)

#  menghitung set accuracy

acc_rf = accuracy_score(y_test, y_pred_rf)
print("Test set accuracy: {:.2f}".format(acc_rf))
print(f"Test set accuracy: {acc_rf}")


Test set accuracy: 1.00
Test set accuracy: 1.0


### Random Forest - Hyperparameter Tuning

In [10]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

from sklearn.model_selection import GridSearchCV

grid_rf = GridSearchCV(estimator=rf,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)

grid_rf.fit(X_train, y_train)

best_hyperparams = grid_rf.best_params_

print('Best hyperparameters:\n', best_hyperparams)

best_CV_score = grid_rf.best_score_

print('Best CV accuracy'.format(best_CV_score))

best_model = grid_rf.best_estimator_

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

print('Test set accuracy of best model: {:.3f}'.format(test_acc))


Best hyperparameters:
 {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV accuracy
Test set accuracy of best model: 1.000


## Tugas 2

### Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

# Sesuaikan dt ke set training
dt.fit(X_train, y_train)

# Memprediksi label set test

y_pred_dt = dt.predict(X_test)

#  menghitung set accuracy

acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


### Decision Tree - Hyperparameter Tuning

In [12]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(estimator=dt,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)

grid_dt.fit(X_train, y_train)

best_hyperparams = grid_dt.best_params_

print('Best hyperparameters:\n', best_hyperparams)

best_CV_score = grid_dt.best_score_

print('Best CV accuracy'.format(best_CV_score))

best_model = grid_dt.best_estimator_

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

print('Test set accuracy of best model: {:.3f}'.format(test_acc))


Best hyperparameters:
 {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Best CV accuracy
Test set accuracy of best model: 1.000


### AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()

# Sesuaikan ada ke set training

ada.fit(X_train, y_train)

# Memprediksi label set test

y_pred_ada = ada.predict(X_test)

#  menghitung set accuracy

acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")


Test set accuracy: 1.00
Test set accuracy: 1.0


### AdaBoost - Hyperparameter Tuning

In [14]:

from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()

param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
    'algorithm': ['SAMME', 'SAMME.R']
}

from sklearn.model_selection import GridSearchCV

grid_ada = GridSearchCV(estimator=ada,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)

grid_ada.fit(X_train, y_train)

best_hyperparams = grid_ada.best_params_

print('Best hyperparameters:\n', best_hyperparams)

best_CV_score = grid_ada.best_score_

print('Best CV accuracy'.format(best_CV_score))

best_model = grid_ada.best_estimator_

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

print('Test set accuracy of best model: {:.3f}'.format(test_acc))


Best hyperparameters:
 {'algorithm': 'SAMME.R', 'learning_rate': 1, 'n_estimators': 50}
Best CV accuracy
Test set accuracy of best model: 1.000


## Tugas 3

In [15]:
dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [17]:
# Pada kasus ini, agak tidak masuk akal jika beberapa parameter bernilai 0
# sebagai contoh adalah nilai 'Glucose', 'BloodPlessure' ataupun 'Insulin'.
# Sekecil apapun nilainya, setiap manusia yang hidup pasti miliki nilai-nilai tersebut

# Kita akan manipulasi nilai yang 0 dengan melakukan 'imputasi' atau mengganti nilainya dengan nilai sintetis
# Pada kasus ini, kita akan menggunakan nilai mean

# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [18]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [19]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Karena asumsi Gaussian NB adalah data terdistribusi secara normal,
# maka kita perlu melakukan standarisasi

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

logistic_regression = LogisticRegression(max_iter=10000, random_state=42)
svm_poly = SVC(kernel='poly', degree=3, random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)

# Buat ensemble voting
ensemble_classifier = VotingClassifier(estimators=[
    ('lr', logistic_regression),
    ('svm', svm_poly),
    ('dt', decision_tree)
], voting='hard')  # 'hard' untuk voting mayoritas

# Latih model ensemble pada data latih
ensemble_classifier.fit(X_train, y_train)

# Evaluasi model ensemble pada data uji
accuracy = ensemble_classifier.score(X_test, y_test)
print("Akurasi model ensemble:", accuracy)

Akurasi model ensemble: 0.7316017316017316
