In [2]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Better dummy data
data_a = {
    'Amount': [100000, 50000, 70000, 120000, 80000],
    'Description': [
        'pembayaran denda angsuran mobil',
        'pembayaran stnk mobil',
        'pembayaran asuransi',
        'pembayaran angsuran mobil',
        'pembayaran pajak mobil'
    ]
}

data_b = {
    'Customer': ['a', 'b', 'c', 'd'],
    'PaymentStatus': [
        'telah membayar denda angsuran mobil',
        'telah membayara pengurusan stnk mobil',
        'telah membayar asuransi bulanan',
        'telah membayar angsuran mobil'
    ]
}

df_a = pd.DataFrame(data_a)
df_b = pd.DataFrame(data_b)

# Labeling data in df_a based on df_b
df_a['Verified'] = df_a['Description'].apply(lambda x: any(customer in x for customer in df_b['Customer']))

# Encode the labels
label_encoder = LabelEncoder()
df_a['EncodedLabel'] = label_encoder.fit_transform(df_a['Verified'])

# Text Preprocessing
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_a['Description'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df_a['EncodedLabel'], test_size=0.2, random_state=42)

# SVM Model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Prediction
y_pred = svm_model.predict(X_test)

# Inverse transform the predicted labels to original classes
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluation
accuracy = accuracy_score(df_a['EncodedLabel'].loc[X_test.index], y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(df_a['EncodedLabel'].loc[X_test.index], y_pred))


ValueError: The number of classes has to be greater than one; got 1 class

In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Better dummy data
data_a = {
    'Amount': [100000, 50000, 70000, 120000, 80000],
    'Description': [
        'pembayaran denda angsuran mobil',
        'pembayaran stnk mobil',
        'pembayaran asuransi',
        'pembayaran angsuran mobil',
        'pembayaran pajak mobil'
    ]
}

data_b = {
    'Customer': ['a', 'b', 'c', 'd'],
    'PaymentStatus': [
        'telah membayar denda angsuran mobil',
        'telah membayara pengurusan stnk mobil',
        'telah membayar asuransi bulanan',
        'telah membayar angsuran mobil'
    ]
}

df_a = pd.DataFrame(data_a)
df_b = pd.DataFrame(data_b)

# Labeling data in df_a based on df_b
df_a['Verified'] = df_a['Description'].apply(lambda x: any(customer in x for customer in df_b['Customer']))

# Ensure both classes are present in the labels
df_a['Verified'] = df_a['Verified'].astype('category')

# Text Preprocessing
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_a['Description'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df_a['Verified'], test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Your data
data = {
    'Deskripsi': ["Pembayaran SHF", "Pembayaran S    H F", "Pembyaran S                                    H                        F",
                  "abcabcabc", "settlement cabang jabotabek", "sett      tle ment", "setlement", "setelment",
                  "fiducia", "fiducia", "abcabacaca", "abcabacaca", "pinalti", "pinalty", "pi nal ty", "pi nal ty",
                  "umk 3", "umk 3", "u     !@#     m    !@#       k  3", "abc umk 3 abca"],
    'Nominal': [200, 200, 210, 250, 200, 300, 500, 1000000, 2000000, 2001000, 2000000, 2100000,
                1000000, 2000000, 3000000, 3000000, 1000000, 2000000, 3000000, 3000000, 3000000],
    'Verifikasi': ["SHF", "SHF", "SHF", "SHF", "BUF/BUP", "BUF/BUP", "BUF/BUP", "BUF/BUP",
                   "fidusia", "fidusia", "fidusia", "fidusia", "pinalty", "pinalty", "pinalty", "pinalty",
                   "umk3", "umk3", "umk3", "umk3"]
}

df = pd.read_csv('dummy.csv')

# Text Preprocessing
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_combined = pd.concat([pd.DataFrame(X.toarray()), df['Nominal']], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_combined, df['Verifikasi'], test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.