In [20]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one data point per category for testing
for category in unique_categories:
    # Get one data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]])

# Randomize the test data when splitting
df_train, df_test = train_test_split(df, test_size=0.25, shuffle=True, random_state=42, stratify=df['Verifikasi'])

# Display original indices of y_test before splitting
print("\ny_test original indices:")
print(df_test.index)
print('\n')

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, df_train['Verifikasi'])

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(df_test['Verifikasi'], y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(df_test['Verifikasi'], y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

y_test original indices:
Int64Index([10, 6, 19, 2, 14], dtype='int64')




ValueError: Found input variables with inconsistent numbers of samples: [19, 15]

In [4]:
print(X_train)

      0    1    2    3    4    5    6    7    8    9   10   11   12   13   14  \
0   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0   
1   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2   0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0   
3   0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0   
5   1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
6   0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0   
7   0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
8   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
9   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0   
10  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
11  0.0  0.0  0.0  0.0  0.0 

In [5]:
print(df_train)

                            Deskripsi  Nominal Verifikasi
12                            pinalti  1000000    pinalty
18  u     !@#     m    !@#       k  3  3000000       umk3
5                  sett      tle ment      300    BUF/BUP
9                             fiducia  2001000    fidusia
13                            pinalty  2000000    pinalty
11                         abcabacaca  2100000    fidusia
15                          pi nal ty  3000000    pinalty
1                 Pembayaran S    H F      200        SHF
17                              umk 3  2000000       umk3
7                           setelment  1000000    BUF/BUP
3                           abcabcabc      250        SHF
0                      Pembayaran SHF      200        SHF
4         settlement cabang jabotabek      200    BUF/BUP
16                              umk 3  1000000       umk3
8                             fiducia  2000000    fidusia


In [6]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Text Preprocessing
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(df['Deskripsi'])

# Combine text features with numerical features
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), df[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_numerical.columns = X_numerical.columns.astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_numerical, df['Verifikasi'], test_size=0.25, random_state=42, stratify=df['Verifikasi'])

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP
Accuracy: 0.80

Classification Report:
              precision    recall  f1-score   support

     BUF/BUP       0.00      0.00      0.00         1
         SHF       0.50      1.00      0.67         1
     fidusia       1.00      1.00      1.00         1
     pinalty       1.00      1.00      1.00         1
        umk3       1.00      1.00      1.00         1

    accuracy                           0.80         5
   macro avg       0.70      0.80      0.73         5
weighted avg       0.70      0.80      0.73         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one random data point per category for testing
for category in df['Verifikasi'].unique():
    # Get one random data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=42)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]])

# Display the test data
print("\nTest Data:")
print(df_test)

# Display the training data
print("\nTraining Data:")
print(df_train)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, df_train['Verifikasi'])

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(df_test['Verifikasi'], y_pred)
print(f'\nAccuracy on Test Data: {accuracy:.2f}')

print('\nClassification Report on Test Data:')
print(classification_report(df_test['Verifikasi'], y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

Test Data:
             Deskripsi  Nominal Verifikasi
0  Pembayaran S    H F      200        SHF
1   sett      tle ment      300    BUF/BUP
2              fiducia  2001000    fidusia
3              pinalty  2000000    pinalty
4                umk 3  2000000       umk3

Training Data:
                      Deskripsi  Nominal Verifikasi
4   settlement cabang jabotabek      200    BUF/BUP
5            sett      tle ment      300    BUF/BUP
6                     setlement      500    BUF/BUP
7                     setelment  1000000    BUF/BUP
8               

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one random data point per category for testing
for category in df['Verifikasi'].unique():
    # Get one random data point for testing
    one_data_point = df[df['Verifikasi'] == category].sample(1, random_state=8)
    
    # Add to testing set
    df_test = pd.concat([df_test, one_data_point], ignore_index=True)
    
    # Add the rest to the training set
    df_train = pd.concat([df_train, df[df['Verifikasi'] != category]])

# Display the test data
print("\nTest Data:")
print(df_test)

# Display the training data
print("\nTraining Data:")
print(df_train)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str).reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str).reset_index(drop=True)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, df_train['Verifikasi'])

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(df_test['Verifikasi'], y_pred)
print(f'\nAccuracy on Test Data: {accuracy:.2f}')

print('\nClassification Report on Test Data:')
print(classification_report(df_test['Verifikasi'], y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

Test Data:
                                           Deskripsi  Nominal Verifikasi
0  Pembyaran S                                   ...      210        SHF
1                                          setlement      500    BUF/BUP
2                                         abcabacaca  2000000    fidusia
3                                          pi nal ty  3000000    pinalty
4                  u     !@#     m    !@#       k  3  3000000       umk3

Training Data:
                      Deskripsi  Nominal Verifikasi
4   settlement cabang jabotabek      200    

In [14]:
print(len(df_train["Deskripsi"]))

80


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Display the first few rows of the dataframe
print(df.head())

# Initialize empty dataframes for training and testing
df_train = pd.DataFrame()
df_test = pd.DataFrame()

# Select one random data point per category for testing
for category in df['Verifikasi'].unique():
    # Get all data points for the current category
    category_data = df[df['Verifikasi'] == category]
    
    # Split the data for testing and training
    category_train, category_test = train_test_split(category_data, test_size=0.25, random_state=42)
    
    # Add to training set
    df_train = pd.concat([df_train, category_train])
    
    # Add to testing set
    df_test = pd.concat([df_test, category_test])

# Display the test data
print("\nTest Data:")
print(df_test)

# Display the training data
print("\nTraining Data:")
print(df_train)

# Text Preprocessing
vectorizer = CountVectorizer()
X_text_train = vectorizer.fit_transform(df_train['Deskripsi'])
X_text_test = vectorizer.transform(df_test['Deskripsi'])

# Combine text features with numerical features
X_train = pd.concat([pd.DataFrame(X_text_train.toarray()), df_train[['Nominal']].astype(str).reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_text_test.toarray()), df_test[['Nominal']].astype(str).reset_index(drop=True)], axis=1)

# Convert column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Handle missing values in numerical features
X_train = X_train.fillna(0)  # Replace NaN with 0 for simplicity
X_test = X_test.fillna(0)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, df_train['Verifikasi'])

# Prediction
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(df_test['Verifikasi'], y_pred)
print(f'\nAccuracy on Test Data: {accuracy:.2f}')

print('\nClassification Report on Test Data:')
print(classification_report(df_test['Verifikasi'], y_pred))


                                           Deskripsi  Nominal Verifikasi
0                                     Pembayaran SHF      200        SHF
1                                Pembayaran S    H F      200        SHF
2  Pembyaran S                                   ...      210        SHF
3                                          abcabcabc      250        SHF
4                        settlement cabang jabotabek      200    BUF/BUP

Test Data:
              Deskripsi  Nominal Verifikasi
1   Pembayaran S    H F      200        SHF
5    sett      tle ment      300    BUF/BUP
9               fiducia  2001000    fidusia
13              pinalty  2000000    pinalty
17                umk 3  2000000       umk3

Training Data:
                                            Deskripsi  Nominal Verifikasi
3                                           abcabcabc      250        SHF
0                                      Pembayaran SHF      200        SHF
2   Pembyaran S                                 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(len(df_train["Deskripsi"]))

15
