In [1]:
# Remove warning
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# read data
df = pd.read_csv("cleaned3.csv")

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["Cleaned"])
y = df["Preferensi"]

# Splitting data
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=28, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=40, test_size=0.2)

# Import Random Forest
from sklearn.ensemble import RandomForestClassifier

# Train the X
model = RandomForestClassifier()
model.fit(X_train, y_train)

# evaluation
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                                                                         precision    recall  f1-score   support

                              ['Chocobanana (es coklat dengan campuran perisa pisang)']       0.67      0.67      0.67         3
                             ['Chocomint (es coklat dengan campuran perisa daun mint)']       0.67      0.67      0.67         3
                               ['Chocoorange (es coklat dengan campuran perisa jeruk)']       1.00      0.33      0.50         3
                                                                      ['Hot Chocolate']       0.67      0.67      0.67         3
                                          ['Hot Thai Tea (seduhan teh panas thai tea)']       0.50      1.00      0.67         1
                                                                      ['Ice Chocolate']       1.00      0.50      0.67         2
                                         ['Ice Lychee Tea (teh leci dengan buah leci)']       1.

In [27]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", verbose=3, n_jobs=1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.622 total time=   0.2s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.730 total time=   0.2s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.778 total time=   0.1s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.833 total time=   0.2s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.722 total time=   0.1s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.541 total time=   0.4s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.730 total time=   0.4s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.8

In [29]:
print(grid_search.best_params_)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [28]:
# Train the X
model_tuned = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300)
model.fit(X_train, y_train)

# evaluation
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                                                                         precision    recall  f1-score   support

                              ['Chocobanana (es coklat dengan campuran perisa pisang)']       0.60      1.00      0.75         3
                             ['Chocomint (es coklat dengan campuran perisa daun mint)']       1.00      0.67      0.80         3
                               ['Chocoorange (es coklat dengan campuran perisa jeruk)']       1.00      0.33      0.50         3
                                                                      ['Hot Chocolate']       1.00      0.67      0.80         3
                                          ['Hot Thai Tea (seduhan teh panas thai tea)']       0.50      1.00      0.67         1
                                                                      ['Ice Chocolate']       1.00      1.00      1.00         2
                                         ['Ice Lychee Tea (teh leci dengan buah leci)']       1.