In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from fuzzywuzzy import process
import re
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Normalization
import matplotlib.pyplot as plt

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text.strip())

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

def closest_word(text, dataset_texts):
    highest_similarity_score = 0
    closest_match = None
    for word in text.split():
        closest_match_for_word, similarity_score = process.extractOne(word, dataset_texts)
        if similarity_score > highest_similarity_score:
            highest_similarity_score = similarity_score
            closest_match = closest_match_for_word
    return closest_match, highest_similarity_score

def preprocess(df, dataset_texts):
    df['Deskripsi'] = df['Deskripsi'].apply(remove_extra_spaces)
    df['Deskripsi'] = df['Deskripsi'].apply(normalize_text)
    
    closest_words = []
    closest_words_num = []
    closest_words_score = []
    for sentence in df['Deskripsi']:
        closest_match, similarity_score = closest_word(sentence, dataset_texts)
        if similarity_score < 20:
            closest_word_num = 0
            closest_match = "missing"
        else:
            closest_word_num = dataset_texts.index(closest_match) + 1
        closest_words.append(closest_match)
        closest_words_num.append(closest_word_num)
        closest_words_score.append(similarity_score)
    
    df['closest_words'] = closest_words
    df['closest_words_num'] = closest_words_num
    df['score'] = closest_words_score
    
    return df[['closest_words_num']].values, df['Verifikasi'].values

# Load the DataFrame
df = pd.read_csv('dummy2.csv')

# Example dataset_texts
dataset_texts = ["shf", "settlement", "fidusia", "pinalty", "umk"]

# Preprocess the data
X, y = preprocess(df, dataset_texts)

# Encode the target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Convert DataFrame to tensors
train_features = tf.convert_to_tensor(train_features, dtype=tf.float32)
test_features = tf.convert_to_tensor(test_features, dtype=tf.float32)

# Normalize the features
normalizer = Normalization(axis=-1)
normalizer.adapt(train_features)

# Build and compile the model
def build_and_compile_model(norm):
    model = tf.keras.Sequential([
        norm,
        Dense(64, activation='relu'),
        Dense(64, activation='relu'),
        Dense(len(label_encoder.classes_), activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.001),
                  metrics=['accuracy'])
    return model

# Build the model
model = build_and_compile_model(normalizer)

# Train the model
model.fit(train_features, train_labels, epochs=500, batch_size=32, validation_split=0.3)

# Predict on test data
test_predictions = np.argmax(model.predict(test_features), axis=-1)

# Decode the true labels
y_test_decoded = label_encoder.inverse_transform(test_labels)

# Decode the predicted labels
y_pred_decoded = label_encoder.inverse_transform(test_predictions)

# Print classification report
print(classification_report(y_test_decoded, y_pred_decoded))

# Visualize the predictions
plt.figure(figsize=(8, 8))
plt.scatter(y_test_decoded, y_pred_decoded)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True Values vs Predictions')
plt.show()


2024-04-12 15:01:33.691655: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 15:01:33.739876: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 15:01:33.739918: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 15:01:33.741139: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 15:01:33.748357: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 15:01:33.749569: I tensorflow/core/platform/cpu_feature_guard.cc:1

ValueError: The last dimension of the inputs to a Dense layer should be defined. Found None. Full input shape received: (None, None)