In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one random data point per category for testing
for category in df['Verifikasi'].unique():
    # Get all data points for the current category
    category_data = df[df['Verifikasi'] == category]
    
    # Split the data for testing and training
    category_train, category_test = train_test_split(category_data, test_size=0.25, random_state=92)
    
    # Add to training set
    df_train = pd.concat([df_train, category_train])
    
    # Add to testing set
    df_test = pd.concat([df_test, category_test])

# Display the test data
print("\nTest Data:")
print(df_test)

# Display the training data
#print("\nTraining Data:")
#print(df_train)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str).reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str).reset_index(drop=True)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, df_train['Verifikasi'])

# Prediction
y_pred = rf_model.predict(X_test)
print("=========")
print("Prediksi")
print("====================")
for x in y_pred:
    print(x)

# Evaluation
accuracy = accuracy_score(df_test['Verifikasi'], y_pred)
print(f'\nAccuracy on Test Data: {accuracy:.2f}')

print('\nClassification Report on Test Data:')
print(classification_report(df_test['Verifikasi'], y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      210        SHF
2  Pembyaran S                                   ...      200        SHF
3                                          abcabcabc      200        SHF
4                        settlement cabang jabotabek      250    BUF/BUP

Test Data:
                      Deskripsi  Nominal Verifikasi
0                Pembayaran SHF      200        SHF
4   settlement cabang jabotabek      250    BUF/BUP
8                       fiducia  2000000    fidusia
12                      pinalti  1000000    pinalty
16                        umk 3  1000000       umk3
Prediksi
SHF
SHF
fidusia
BUF/BUP
umk3

Accuracy on Test Data: 0.60

Classification Report on Test Data:
              precision    recall  f1-score   support

     BUF/BUP       0.00      0.00      0.00         1
         SHF       0.50  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
print(y_pred)

['SHF' 'SHF' 'fidusia' 'BUF/BUP' 'umk3']
