In [3]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         SHF       1.00      1.00      1.00         2
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [4]:
print(y_pred)

['SHF' 'umk3' 'pinalty' 'SHF']


In [6]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Display y_test indices and values
print("\ny_test indices and values:")
print(y_test.reset_index(drop=True))
print('\n')

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test indices and values:
0        SHF
1       umk3
2    pinalty
3        SHF
Name: Verifikasi, dtype: object


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         SHF       1.00      1.00      1.00         2
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.2, random_state=42)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(y_test.index)
print('\n')

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([0, 17, 15, 1], dtype='int64')


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         SHF       1.00      1.00      1.00         2
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [8]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.25, random_state=42)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(y_test.index)
print('\n')

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([0, 17, 15, 1, 8], dtype='int64')


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         SHF       1.00      1.00      1.00         2
     fidusia       1.00      1.00      1.00         1
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [9]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.25, random_state=14)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(y_test.index)
print('\n')

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([0, 17, 15, 1, 8, 5], dtype='int64')


Accuracy: 0.83

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       0.00      0.00      0.00         1
         SHF       0.67      1.00      0.80         2
     fidusia       1.00      1.00      1.00         1
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           0.83         6
   macro avg       0.73      0.80      0.76         6
weighted avg 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.25, random_state=14)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(y_test.index)
print('\n')

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([19, 5, 2, 1, 4], dtype='int64')


Accuracy: 0.60

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       0.00      0.00      0.00         2
         SHF       0.50      1.00      0.67         2
        umk3       1.00      1.00      1.00         1

    accuracy                           0.60         5
   macro avg       0.50      0.67      0.56         5
weighted avg       0.40      0.60      0.47         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]], ignore_index=True)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(df_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Split the data
y_train = df_train['Verifikasi']
y_test = df_test['Verifikasi']

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
RangeIndex(start=0, stop=5, step=1)


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       1.00      1.00      1.00         1
         SHF       1.00      1.00      1.00         1
     fidusia       1.00      1.00      1.00         1
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00  

In [14]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]], ignore_index=True)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(df_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Split the data
y_train = df_train['Verifikasi']
y_test = df_test['Verifikasi']

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
RangeIndex(start=0, stop=5, step=1)


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       1.00      1.00      1.00         1
         SHF       1.00      1.00      1.00         1
     fidusia       1.00      1.00      1.00         1
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00  

In [17]:
print(X_test)

   0  1  2  3  4  5  6  7  8  9  ...  13  14  15  16  17  18  19  20  21  \
0  0  0  0  0  0  0  0  0  0  1  ...   0   0   0   0   0   0   0   0   0   
1  0  0  0  0  0  0  0  1  0  0  ...   0   0   0   1   0   0   1   0   0   
2  0  0  0  0  0  1  0  0  0  0  ...   0   0   0   0   0   0   0   0   0   
3  0  0  0  0  0  0  0  0  0  0  ...   1   0   0   0   0   0   0   0   0   
4  0  0  0  0  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   1   

   Nominal  
0      200  
1      300  
2  2001000  
3  2000000  
4  2000000  

[5 rows x 23 columns]


In [18]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]], ignore_index=True)

# Display the data points used for testing
print("\nData points used for testing:")
print(df_test)

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(df_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Split the data
y_train = df_train['Verifikasi']
y_test = df_test['Verifikasi']

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

Data points used for testing:
             Deskripsi  Nominal Verifikasi
0  Pembayaran S    H F      200        SHF
1   sett      tle ment      300    BUF/BUP
2              fiducia  2001000    fidusia
3              pinalty  2000000    pinalty
4                umk 3  2000000       umk3

y_test original indices:
RangeIndex(start=0, stop=5, step=1)



Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       1.00      1.00      1.00         1
         SHF       1.00      1.00      1.00         1
     f

In [19]:
print(y_pred)

['SHF' 'BUF/BUP' 'fidusia' 'pinalty' 'umk3']
