<a href="https://colab.research.google.com/github/Mr94t3z/pembelajaran-mesin/blob/master/meeting_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree

https://towardsdatascience.com/everything-you-ever-wanted-to-know-about-decision-trees-in-python-17e8edb6b37b

In [None]:
import pandas as pd
import numpy as np
import graphviz
import pydotplus
import matplotlib.image as mpimg
import io
import random

from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing, tree, datasets
!pip install dtreeviz
#from dtreeviz.trees import dtreeviz

# Truncate long DataFrame reprs to 10 rows so displayed frames stay compact.
pd.set_option('display.max_rows', 10)
# Seeds only Python's built-in `random` module. The sampling, splitting and
# tree/forest estimators further down pass their own explicit `random_state`.
random.seed(24)

In [None]:
# Load the UCI red and white wine-quality tables (semicolon-separated CSVs).
df_red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
df_white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

# Binary target used by the classifiers below: 1 = red wine, 0 = white wine.
df_red_wine['label'] = 1
df_white_wine['label'] = 0

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it the
# row labels of the two source frames overlap, which makes label-based lookups
# and index alignment on the merged data ambiguous.
df_merged_wine = pd.concat([df_red_wine, df_white_wine], ignore_index=True)
df_merged_wine

In [None]:
# Preview the first rows of the white-wine table.
df_white_wine.head()

In [None]:
# Relative frequency of each label value in the merged data.
df_merged_wine['label'].value_counts(normalize=True)

# labels 0 and 1 are not balanced

In [None]:
# Down-sampling: a common technique for balancing a dataset.
# Keep every red wine and draw an equally sized random sample of white wines.

red_wines = df_merged_wine.loc[df_merged_wine['label'].eq(1)]
all_white_wines = df_merged_wine.loc[df_merged_wine['label'].eq(0)]

# Fixed random_state so the same white wines are drawn on every run.
white_wines = all_white_wines.sample(n=len(red_wines), random_state=24)

df_wine_balanced = pd.concat([red_wines, white_wines])

df_wine_balanced

In [None]:
# Relative frequency of each label value after down-sampling.
df_wine_balanced['label'].value_counts(normalize=True)

# the dataset is now balanced

In [None]:
# Split the balanced data into train and test sets

# Target first, then every remaining column as a feature.
y = df_wine_balanced['label']
X = df_wine_balanced.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

In [None]:
# Tuning the Model

def dtree_grid_search(X, y, nfolds):
    """Grid-search decision-tree hyperparameters with cross-validation.

    Parameters
    ----------
    X : features passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    nfolds : forwarded as the ``cv`` argument of ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per hyperparameter (as the index) and a single ``Value``
        column holding the best setting found by the search.
    """
    # 'auto' is deliberately absent from max_features: for classifiers it was
    # only an alias of 'sqrt', and scikit-learn 1.3 removed it, so every
    # candidate using it would fail to fit on current releases.
    param_grid = {'criterion': ['gini', 'entropy'],
                  'class_weight': ['balanced', None],
                  'splitter': ['best', 'random'],
                  'max_features': ['sqrt', 'log2', None],
                  'max_depth': np.arange(3, 15)}

    dtree_model = DecisionTreeClassifier(random_state=24)
    dtree_gscv = GridSearchCV(dtree_model, param_grid, cv=nfolds)
    dtree_gscv.fit(X, y)

    # best_params_ is a flat dict; transpose so each parameter is a row.
    return pd.DataFrame(dtree_gscv.best_params_, index=['Value']).T

dtree_grid_search(X_train, y_train, 10)

In [None]:
# Creating the Decision Tree Model
# The hyperparameters are written out by hand here; they are not read from the
# grid-search result.

classifier = tree.DecisionTreeClassifier(class_weight=None,
                                         criterion='entropy',
                                         max_depth=3,
                                         # 'sqrt' is what the former 'auto' alias meant for
                                         # classifiers; 'auto' itself is rejected by
                                         # scikit-learn 1.3 and later.
                                         max_features='sqrt',
                                         splitter='best',
                                         random_state=24)
model = classifier.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Feature Importance
# Pair each training column with the importance the fitted tree assigned to it
# and list the ten largest.

features_dict = {
    'feature_importances': classifier.feature_importances_,
    'feature_names': X_train.columns,
}

importance_table = pd.DataFrame(features_dict)
importance_table.sort_values('feature_importances', ascending=False).iloc[:10]

In [None]:
# Visualising the Decision Tree
!pip install dtreeviz 
from dtreeviz.trees import dtreeviz
viz = dtreeviz(classifier, 
               X_train, 
               y_train,
               target_name='label',
               feature_names=X.columns.to_list(), 
               class_names=["red", "white"],
               scale=1.4)

viz.save("dtreeviz.svg")

viz

In [None]:
# Render the fitted tree with Graphviz: export DOT text, convert it to a PNG
# on disk, then show that image with matplotlib.
dot_data = io.StringIO()
tree.export_graphviz(classifier, out_file=dot_data,
                     feature_names=X.columns.to_list(),
                     filled=True,
                     # class_names must follow ascending class order:
                     # label 0 = white wine, label 1 = red wine.
                     class_names=['white', 'red'],
                     rounded=True)

filename = "graphviz.png"
pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png(filename)

plt.figure(figsize=(25,10))
plt.box(False)

img = mpimg.imread(filename)
fig = plt.imshow(img)
# Hide both axes: the picture is a diagram, not a plot.
fig.axes.get_xaxis().set_visible(False)
fig.axes.get_yaxis().set_visible(False)

plt.show()

In [None]:
# Using the Model to Make Predictions

def predict_single_label(fixed_acidity,
                         volatile_acidity,
                         citric_acid,
                         residual_sugar,
                         chlorides,
                         free_sulfur_dioxide,
                         total_sulfur_dioxide,
                         density,
                         pH,
                         suplhates,
                         alcohol,
                         quality):
    """Classify one wine sample as "red" or "white".

    The twelve arguments are passed, in this order, as a single row to the
    module-level ``classifier``. ``suplhates`` (sic) keeps its original
    spelling so existing keyword callers continue to work.

    Returns
    -------
    str
        "red" when the classifier predicts 1, otherwise "white".
    """
    sample = [[fixed_acidity, volatile_acidity, citric_acid, residual_sugar,
               chlorides, free_sulfur_dioxide, total_sulfur_dioxide, density,
               pH, suplhates, alcohol, quality]]

    # The classifier is fitted on a DataFrame, so predicting on a bare list
    # makes scikit-learn warn about missing feature names. Re-attach the
    # fitted column names when the estimator exposes them; older releases
    # without `feature_names_in_` simply get the plain list as before.
    feature_names = getattr(classifier, 'feature_names_in_', None)
    if feature_names is not None:
        sample = pd.DataFrame(sample, columns=feature_names)

    y_predict = classifier.predict(sample)[0]
    return "red" if y_predict == 1 else "white"

In [None]:
# Show the first and the last row together. A notebook cell only renders its
# final expression, so a bare head(1) followed by tail(1) would display the
# last row alone.
pd.concat([df_wine_balanced.head(1), df_wine_balanced.tail(1)])

In [None]:
# Spot-check the helper on two hand-entered samples.
# NOTE(review): the values are presumably copied from rows of
# df_wine_balanced — confirm against the data.
test1 = predict_single_label(7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4, 5)
test2 = predict_single_label(6.9, 0.32, 0.17, 7.6, 0.042, 69.0, 219.0, 0.9959, 3.13, 0.4, 8.9, 5)
test1, test2

In [None]:
# another prediction
# Put the true and the predicted label next to the test features.

df_predictions = X_test.assign(
    label=y_test,
    predicted_label=classifier.predict(X_test),
)

df_predictions

In [None]:
# Recompute accuracy by hand (share of rows whose prediction matches the true
# label) and show it next to the estimator's own score for comparison.
filter_correct = df_predictions['label'].eq(df_predictions['predicted_label'])
df_predictions_correct = df_predictions[filter_correct]

n_correct = len(df_predictions_correct)
calc_score = n_correct / len(df_predictions)
model_score = model.score(X_test, y_test)

calc_score, model_score

In [None]:
# deploy model to production
# This cell only prints the fitted tree's decision rules as plain text.

text_representation = tree.export_text(classifier, feature_names=X.columns.to_list())
print(text_representation)

# Random Forest Classification

https://www.analyticsvidhya.com/blog/2021/06/understanding-random-forest/

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [None]:
# Read the heart CSV from the course repository into `df`

url = "https://raw.githubusercontent.com/Mr94t3z/pembelajaran-mesin/master/datasets/heart_v2.csv"
df = pd.read_csv(url)

# Preview the first rows.
df.head()

In [None]:
# Response variable: the 'heart disease' column.
y = df['heart disease']
# Feature variables: every other column.
# Note: this rebinds the X / y names used by the wine cells above.
X = df.drop(columns='heart disease')

In [None]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test (70% of the rows go to training)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
# Show the resulting (rows, columns) of both feature sets.
X_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest: 100 trees of depth <= 5, using all CPU cores
# (n_jobs=-1). oob_score=True makes the fitted model expose `oob_score_`.
classifier_rf = RandomForestClassifier(
    random_state=42, 
    n_jobs=-1, 
    max_depth=5,
    n_estimators=100, 
    oob_score=True
    )

classifier_rf.fit(X_train, y_train)

In [None]:
# Out-of-bag score of the baseline forest (available because oob_score=True).
classifier_rf.oob_score_

In [None]:
# hyperparameter tuning for Random Forest using GridSearchCV and fit the data

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Search space: 5 depths x 6 leaf sizes x 6 forest sizes = 180 candidates.
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

from sklearn.model_selection import GridSearchCV

# Instantiate the grid search model
# (4-fold cross-validation, candidates ranked by accuracy)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv = 4,
    n_jobs=-1,
    verbose=1, 
    scoring="accuracy"
    )

grid_search.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the grid search (scoring="accuracy").
grid_search.best_score_

In [None]:
# Keep the best estimator found by the grid search and display it.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
from sklearn.tree import plot_tree
# Draw one member tree (index 5) of the tuned forest.
plt.figure(figsize=(80,40))
plot_tree(rf_best.estimators_[5],
          # Pass a plain list: some scikit-learn releases reject a pandas Index here.
          feature_names=list(X.columns),
          # class_names must follow ascending class order (0 first, then 1).
          # NOTE(review): assumes 1 in 'heart disease' means disease present — confirm.
          class_names=['No Disease', 'Disease'],
          filled=True);

In [None]:
# Feature importances of the tuned forest, most important first.
importance_columns = {
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_,
}
imp_df = pd.DataFrame(importance_columns)

imp_df.sort_values("Imp", ascending=False)

# K-Nearest Neighbor

https://towardsdatascience.com/multiclass-classification-using-k-nearest-neighbours-ca5281a9ef76

https://medium.com/geekculture/k-nearest-neighbors-a-to-z-with-implementation-in-python-74630ffb79a2#:~:text=K%2DNearest%20Neighbors%20(kNN)%20is%20a%20Machine%20Learning%20algorithm,and%20simple%20Machine%20Learning%20algorithms.

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Read the iris CSV from the course repository into `df`
# (this rebinds the `df` name used for the heart data above)

url = "https://raw.githubusercontent.com/Mr94t3z/pembelajaran-mesin/master/datasets/iris.csv"
df = pd.read_csv(url)

# Preview the first rows.
df.head()

In [None]:
# Distinct values of the target column.
df['variety'].unique()

In [None]:
# True if any cell in the frame is null.
df.isnull().values.any()

In [None]:
# Encode the species names as integer class codes.
variety_codes = {'Setosa': 0, 'Versicolor': 1, 'Virginica': 2}

# Only map while the column still holds names. Re-running the cell on an
# already-encoded column would otherwise map every value to NaN and make
# astype(int) raise.
if not pd.api.types.is_integer_dtype(df['variety']):
    df['variety'] = df['variety'].map(variety_codes).astype(int)
df.head()

In [None]:
# Pairwise scatter plots of all columns, coloured by variety.
plt.close();
sns.set_style('whitegrid');
sns.pairplot(df, hue='variety', height=3);
plt.show()

In [None]:
# Sepal length vs. sepal width, one colour per variety.
sns.set_style('whitegrid');
# `height` is the current name of the facet-size argument (the same keyword
# the pairplot cell uses); the old `size` spelling is rejected by current seaborn.
(
    sns.FacetGrid(df, hue='variety', height=5)
    .map(plt.scatter, 'sepal.length', 'sepal.width')
    .add_legend()
);
plt.show()

After the EDA and before training our model on the dataset, the last thing left to do is normalisation. Normalisation brings the values of all features onto the same scale. Because different features have different scales, normalising helps both us and the model optimise its parameters more efficiently. We rescale every input feature to the range 0 to 1. Here, X is our input (the features, with the species column dropped) and Y is our output (3 classes).

In [None]:
# Separate features from the target and rescale every feature to [0, 1].
x_data = df.drop(['variety'],axis=1)
y_data = df['variety']
# `MinMaxScaler` here is a scaler *instance*; the variable name is kept as-is.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling — consider fitting on the training split only.
MinMaxScaler = preprocessing.MinMaxScaler()
X_data_minmax = MinMaxScaler.fit_transform(x_data)
# Reuse the input's own column labels and index instead of a hand-typed list,
# so the scaled frame stays aligned with x_data / y_data.
data = pd.DataFrame(X_data_minmax, columns=x_data.columns, index=x_data.index)
# Show the scaled features (the result of this cell), not the unscaled frame.
data.head()

In [None]:
# Hold out 20% of the scaled iris data, fit a k-NN classifier with its default
# settings on the rest, and predict the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    data, y_data, test_size=0.2, random_state=1
)

knn_clf = KNeighborsClassifier().fit(X_train, y_train)

ypred = knn_clf.predict(X_test)
ypred

In [None]:
# Predict one hand-entered sample. The model was fitted on min-max scaled
# features, so the four values must already be on that 0-1 scale.
print(knn_clf.predict([[0.416667, 0.833333, 0.033898, 0.041667]]))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Compute the three evaluation artefacts first, then print them in order.
result = confusion_matrix(y_test, ypred)
result1 = classification_report(y_test, ypred)
result2 = accuracy_score(y_test, ypred)

print('Confusion Matrix:')
print(result)
print('Classification Report:')
print(result1)
print('Accuracy:', result2)

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score

# Score every odd k from 1 to 29 by 5-fold cross-validated accuracy on the
# training split. Scoring on the very samples the model was fitted on always
# favours k=1 (each point is its own nearest neighbour), so it cannot be used
# to choose k.
eval_records = []
for n_neighbors in range(1, 30, 2):
    clf = KNeighborsClassifier(n_neighbors)
    score = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
    eval_records.append({'k': n_neighbors, 'accuracy': score})

evals = pd.DataFrame(eval_records)
# idxmax returns the first maximum, so ties resolve to the smallest k.
best_k = evals.loc[evals['accuracy'].idxmax()]

plt.figure(figsize=(16, 8))
plt.plot(evals['k'], evals['accuracy'], lw=3, c='#087E8B')
plt.scatter(best_k['k'], best_k['accuracy'], s=200, c='#087E8B')
plt.title(f"K Parameter Optimization, Optimal k = {int(best_k['k'])}", size=20)
plt.xlabel('K', size=14)
plt.ylabel('Accuracy', size=14)
plt.show()

https://towardsdatascience.com/confusion-matrix-for-your-multi-class-machine-learning-model-ff9aa3bf7826

# Naive Bayes Classifier

In [None]:
# Reuse the balanced wine data built in the Decision Tree section.
df_wine_balanced

In [None]:
# Independent variables: every column except the label
x = df_wine_balanced.drop(columns=["label"])
x.head()
# Dependent variable: the label column
y = df_wine_balanced["label"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Instantiate the Gaussian Naive Bayes classifier
modelnb = GaussianNB()
# Fit the classifier on the training data
nbtrain = modelnb.fit(x_train, y_train)

In [None]:
# Predict the labels of x_test
y_pred = nbtrain.predict(x_test)
y_pred

In [None]:
# True test labels as a plain array, for comparison with the predictions.
np.array(y_test)

In [None]:
# Predicted class probabilities for x_test
nbtrain.predict_proba(x_test)

In [None]:
# confusion matrix of the Naive Bayes predictions on the test split

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# classification report of the Naive Bayes predictions on the test split

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


# Confusion Matrix for Multi-Class Classification

https://towardsdatascience.com/confusion-matrix-for-your-multi-class-machine-learning-model-ff9aa3bf7826

In [None]:
# Multi-class evaluation on sklearn's 3-class wine toy dataset
from sklearn.datasets import load_wine
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

dataset = load_wine()
X, y = dataset.data, dataset.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(kernel='rbf', C=1)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

# Overall accuracy
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each averaging scheme, with a blank line
# between consecutive groups.
for position, averaging in enumerate(('micro', 'macro', 'weighted')):
    if position:
        print()
    prefix = averaging.capitalize()
    print('{} Precision: {:.2f}'.format(prefix, precision_score(y_test, y_pred, average=averaging)))
    print('{} Recall: {:.2f}'.format(prefix, recall_score(y_test, y_pred, average=averaging)))
    print('{} F1-score: {:.2f}'.format(prefix, f1_score(y_test, y_pred, average=averaging)))

# Per-class report
print('\nClassification Report\n')
print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2', 'Class 3']))