In [2]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.2, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         SHF       1.00      1.00      1.00         2
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [3]:
print(y_test)

0         SHF
17       umk3
15    pinalty
1         SHF
Name: Verifikasi, dtype: object


In [4]:
print(y_pred)

['SHF' 'umk3' 'pinalty' 'SHF']


In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.5, random_state=42)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP
Accuracy: 0.40

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       0.20      1.00      0.33         1
         SHF       0.00      0.00      0.00         3
     fidusia       0.50      1.00      0.67         2
     pinalty       1.00      1.00      1.00         1
        umk3       0.00      0.00      0.00         3

    accuracy                           0.40        10
   macro avg       0.34      0.60      0.40        10
weighted avg       0.22      0.40      0.27        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.2, random_state=42)

# Show X_train and y_train
df_train = pd.concat([X_train, pd.DataFrame({'Verifikasi': y_train})], axis=1)
print("X_train and y_train:")
print(df_train.head())

# Save X_train and y_train as CSV
df_train.to_csv('X_train_y_train.csv', index=False)

# Show X_test and y_test
df_test = pd.concat([X_test, pd.DataFrame({'Verifikasi': y_test})], axis=1)
print("\nX_test and y_test:")
print(df_test.head())

# Save X_test and y_test as CSV
df_test.to_csv('X_test_y_test.csv', index=False)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


X_train and y_train:
    0  1  2  3  4  5  6  7  8  9  ...  14  15  16  17  18  19  20  21  \
8   0  0  0  0  0  1  0  0  0  0  ...   0   0   0   0   0   0   0   0   
5   0  0  0  0  0  0  0  1  0  0  ...   0   0   1   0   0   1   0   0   
11  0  0  1  0  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   
3   0  0  0  1  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   
18  0  0  0  0  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   0   

    Nominal  Verifikasi  
8   2000000     fidusia  
5       300     BUF/BUP  
11  2100000     fidusia  
3       250         SHF  
18  3000000        umk3  

[5 rows x 24 columns]

X_test and y_test:
    0  1  2  3  4  5  6  7  8  9  ...  14  15  16  17  18  19  20  21  \
0   0  0  0  0  0  0  0  0  0  1  ...   0   0   0   0   1   0   0   0   
17  0  0  0  0  0  0  0  0  0  0  ...   0   0   0   0   0   0   0   1   
15  0  0  0  0  0  0  0  0  1  0  ...   0   0   0   0   0   0   1   0   
1   0  0  0  0  0  0  0  0  0  1  ...   0   0   0

In [9]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df[['Deskripsi', 'Nominal']], df['Verifikasi'], test_size=0.2, random_state=42)

# Show X_train and y_train before preprocessing
df_train_before_preprocessing = pd.concat([X_train, pd.DataFrame({'Verifikasi': y_train})], axis=1)
print("\nX_train and y_train before preprocessing:")
print(df_train_before_preprocessing.head())

# Save X_train and y_train before preprocessing as CSV
df_train_before_preprocessing.to_csv('X_train_y_train_before_preprocessing.csv', index=False)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(X_train['Deskripsi'])
X_text_test = vectorizer.transform(X_test['Deskripsi'])

# Combine text features with numerical features
X_train_preprocessed = pd.concat([pd.DataFrame(X_text_train.toarray()), X_train[['Nominal']].astype(str)], axis=1)
X_test_preprocessed = pd.concat([pd.DataFrame(X_text_test.toarray()), X_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train_preprocessed.columns = X_train_preprocessed.columns.astype(str)
X_test_preprocessed.columns = X_test_preprocessed.columns.astype(str)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Prediction
y_pred = rf_model.predict(X_test_preprocessed)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Show X_train and y_train after preprocessing
df_train_after_preprocessing = pd.concat([X_train_preprocessed, pd.DataFrame({'Verifikasi': y_train})], axis=1)
print("\nX_train and y_train after preprocessing:")
print(df_train_after_preprocessing.head())

# Save X_train and y_train after preprocessing as CSV
df_train_after_preprocessing.to_csv('X_train_y_train_after_preprocessing.csv', index=False)


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

X_train and y_train before preprocessing:
                            Deskripsi  Nominal Verifikasi
8                             fiducia  2000000    fidusia
5                  sett      tle ment      300    BUF/BUP
11                         abcabacaca  2100000    fidusia
3                           abcabcabc      250        SHF
18  u     !@#     m    !@#       k  3  3000000       umk3


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [10]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df[['Deskripsi', 'Nominal']], df['Verifikasi'], test_size=0.2, random_state=42)

# Show X_train and y_train before preprocessing
df_train_before_preprocessing = pd.concat([X_train, pd.DataFrame({'Verifikasi': y_train})], axis=1)
print("\nX_train and y_train before preprocessing:")
print(df_train_before_preprocessing.head())

# Save X_train and y_train before preprocessing as CSV
df_train_before_preprocessing.to_csv('X_train_y_train_before_preprocessing.csv', index=False)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(X_train['Deskripsi'])
X_text_test = vectorizer.transform(X_test['Deskripsi'])

# Combine text features with numerical features
X_train_preprocessed = pd.concat([pd.DataFrame(X_text_train.toarray()), X_train[['Nominal']].astype(str)], axis=1)
X_test_preprocessed = pd.concat([pd.DataFrame(X_text_test.toarray()), X_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train_preprocessed.columns = X_train_preprocessed.columns.astype(str)
X_test_preprocessed.columns = X_test_preprocessed.columns.astype(str)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_preprocessed, y_train)

# Prediction
y_pred = rf_model.predict(X_test_preprocessed)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Show X_train and y_train after preprocessing
df_train_after_preprocessing = pd.concat([X_train_preprocessed, pd.DataFrame({'Verifikasi': y_train})], axis=1)
print("\nX_train and y_train after preprocessing:")
print(df_train_after_preprocessing.head())

# Save X_train and y_train after preprocessing as CSV
df_train_after_preprocessing.to_csv('X_train_y_train_after_preprocessing.csv', index=False)


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

X_train and y_train before preprocessing:
                            Deskripsi  Nominal Verifikasi
8                             fiducia  2000000    fidusia
5                  sett      tle ment      300    BUF/BUP
11                         abcabacaca  2100000    fidusia
3                           abcabcabc      250        SHF
18  u     !@#     m    !@#       k  3  3000000       umk3


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values