In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]], ignore_index=True)

# Display the data points used for testing
print("\nData points used for testing:")
print(df_test)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(df_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Split the data
y_train = df_train['Verifikasi']
y_test = df_test['Verifikasi']

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

Data points used for testing:
             Deskripsi  Nominal Verifikasi
0  Pembayaran S    H F      200        SHF
1   sett      tle ment      300    BUF/BUP
2              fiducia  2001000    fidusia
3              pinalty  2000000    pinalty
4                umk 3  2000000       umk3

y_test original indices:
RangeIndex(start=0, stop=5, step=1)



Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       1.00      1.00      1.00         1
         SHF       1.00      1.00      1.00         1
     f

In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]], ignore_index=True)

# Randomize the test data when splitting
X_train, X_test, y_train, y_test = train_test_split(df_test[['Deskripsi', 'Nominal']], df_test['Verifikasi'], test_size=0.25, shuffle=True, random_state=42)

# Concatenate y_train and y_test after train_test_split
y_train = pd.concat([y_train, y_test], ignore_index=True)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(y_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(pd.concat([df_train['Deskripsi'], X_train['Deskripsi']], ignore_index=True))
X_text_test = vectorizer.transform(X_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), pd.concat([df_train[['Nominal']], X_train[['Nominal']]], ignore_index=True).astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), X_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([1, 4], dtype='int64')




ValueError: Found input variables with inconsistent numbers of samples: [83, 5]

In [None]:
df