In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Text Preprocessing using CountVectorizer
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Numeric features
X_numeric = pd.to_numeric(df['Nominal'], errors='coerce').fillna(0).astype(int).reset_index(drop=True)

# Merge text and numeric features
X = pd.concat([pd.DataFrame(X_text.toarray()), X_numeric], axis=1)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Verifikasi'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print y_test labels after encoding
print('\nEncoded y_test labels:')
print(y_test)

# XGBoost Model
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Prediction
y_pred = xgb_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))



Encoded y_test labels:
[4 4 3 0 0 3 4 0]

Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         3

    accuracy                           0.88         8
   macro avg       0.92      0.83      0.84         8
weighted avg       0.91      0.88      0.86         8

