In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read data from CSV
df = pd.read_csv('dummy.csv')

# Identify unique categories
unique_categories = df['Verifikasi'].unique()

# Hold out one randomly chosen row per category for testing. Each category is
# sampled with the same seed so the split is reproducible between runs.
held_out_rows = [
    df[df['Verifikasi'] == category].sample(1, random_state=42)
    for category in unique_categories
]
# pd.concat rejects an empty list, so fall back to an empty slice of df.
df_test = pd.concat(held_out_rows) if held_out_rows else df.iloc[:0]

# Everything that was not held out becomes training data. This has to happen
# while df_test still carries the row labels it was sampled with: once the
# test frame is renumbered from 0, its index no longer identifies rows of df
# and filtering on it would drop the first rows of df instead of the sampled
# ones.
df_train = df.drop(index=df_test.index)

# Renumber the test rows only after the training set has been derived.
df_test = df_test.reset_index(drop=True)

# Text Preprocessing
# The vocabulary is learned from the training descriptions only; the same
# fitted tokenizer is then used to convert both splits to token-id sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['Deskripsi'])

X_train_text = pad_sequences(tokenizer.texts_to_sequences(df_train['Deskripsi']))

# Pad/truncate the test sequences to the training width. Without an explicit
# maxlen each call pads to the longest sequence it is given, so the two
# matrices could end up with different numbers of columns while the model's
# input width is taken from X_train_text.
X_test_text = pad_sequences(
    tokenizer.texts_to_sequences(df_test['Deskripsi']),
    maxlen=X_train_text.shape[1],
)

# Combine text features with numerical features
#
# The network takes two separate inputs: the padded token-id matrix, which
# goes through the embedding, and the numeric `Nominal` column, which joins
# the flattened embedding output ahead of the dense layers. The two cannot
# share one matrix: an embedding layer treats every column it receives as a
# token index, so a stringified `Nominal` column appended to the token ids is
# neither numeric nor a valid index, and it makes the matrix one column wider
# than the text input.
#
# These names are only needed for the two-input model built below.
from tensorflow.keras.layers import Concatenate, Input
from tensorflow.keras.models import Model

nominal_train = df_train[['Nominal']].astype('float32')
nominal_test = df_test[['Nominal']].astype('float32')

# Standardise with statistics from the training rows only, so nothing about
# the test rows influences the preprocessing. A zero spread is replaced by 1
# to avoid dividing by zero; values that are still missing afterwards are set
# to 0, i.e. the training mean on the standardised scale.
nominal_mean = nominal_train.mean()
nominal_std = nominal_train.std(ddof=0).replace(0, 1.0)
X_train_numeric = ((nominal_train - nominal_mean) / nominal_std).fillna(0.0).to_numpy()
X_test_numeric = ((nominal_test - nominal_mean) / nominal_std).fillna(0.0).to_numpy()

# A multi-input Keras model accepts a list of arrays, given in the same order
# as the model's inputs.
X_train = [X_train_text, X_train_numeric]
X_test = [X_test_text, X_test_numeric]

# Encode labels
# The encoder is fitted on every label in the dataset so that a category whose
# only row ended up in the test split still has an id, and so that the number
# of classes used for the output layer matches the encoder.
label_encoder = LabelEncoder()
label_encoder.fit(df['Verifikasi'])
y_train = label_encoder.transform(df_train['Verifikasi'])
y_test = label_encoder.transform(df_test['Verifikasi'])

# Build a simple neural network
vocabulary_size = len(tokenizer.word_index) + 1  # +1: id 0 is used for padding
text_input = Input(shape=(X_train_text.shape[1],), dtype='int32', name='text')
numeric_input = Input(shape=(X_train_numeric.shape[1],), name='nominal')

embedded_text = Embedding(input_dim=vocabulary_size, output_dim=50)(text_input)
text_features = Flatten()(embedded_text)
combined = Concatenate()([text_features, numeric_input])
hidden = Dense(64, activation='relu')(combined)
class_probabilities = Dense(len(label_encoder.classes_), activation='softmax')(hidden)

model = Model(inputs=[text_input, numeric_input], outputs=class_probabilities)

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the network; a fifth of the training rows is held out for validation.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=16,
    validation_split=0.2,
    verbose=2,
)

# Score the test rows: evaluate returns the loss followed by the accuracy
# metric requested at compile time.
test_loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'\nAccuracy on Test Data: {accuracy:.2f}')

# Predict per-class probabilities and keep the most likely class for each row.
y_pred = model.predict(X_test)
y_pred_classes = [row_probabilities.argmax() for row_probabilities in y_pred]

# Per-class precision / recall / F1 on the test rows.
report_header = '\nClassification Report on Test Data:'
report_body = classification_report(y_test, y_pred_classes)
print(report_header)
print(report_body)


ModuleNotFoundError: No module named 'tensorflow'