In [None]:
%pip install torch sklearn transformers pandas

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

In [None]:
# Fetch the pretrained WordPiece tokenizer and the BERT encoder weights.
# Only the encoder download is redirected to ./bert; the tokenizer files go to
# the library's default cache location.
# NOTE(review): 'bert-base-uncased' is an English checkpoint, while the example
# query further down in this notebook is not English - confirm this is intended.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)
bert = BertModel.from_pretrained(
    'bert-base-uncased',
    cache_dir='./bert',
)

In [None]:
# Load the dataset from a CSV file in the working directory. Later cells read
# its `pattern` column (text that gets embedded) and `tag` column (class label).
df = pd.read_csv('final.csv')
# Preview the first rows.
df.head()

In [None]:
# Count the distinct labels in the `tag` column and display the result.
# (Series.nunique() leaves missing values out of the count by default.)
unique_tags = df['tag'].nunique()
unique_tags

In [None]:
# Function to encode text using BERT with padding
def encode_text(texts, max_length=128):
    """Embed texts as fixed-size vectors by mean-pooling BERT's last hidden layer.

    Uses the module-level ``tokenizer`` and ``bert`` objects.

    Args:
        texts: A string or a list of strings to embed.
        max_length: Number of tokens every sequence is truncated or padded to.

    Returns:
        A tensor with one row per input text. Each row is the average of the
        final-layer hidden states over that text's real (non-padding) tokens.
    """
    inputs = tokenizer(texts, return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = bert(**inputs)
        hidden_states = outputs.last_hidden_state
        # Every sequence is padded to `max_length`, so a plain mean over the
        # token axis would blend [PAD] positions into the sentence vector.
        # Weight each position by the attention mask (1 = real token, 0 = pad).
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # clamp guards against division by zero should a row have no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts

In [None]:
# Embed every pattern in a single pass, then store one vector per row so each
# DataFrame cell in `encoded_pattern` holds that row's NumPy embedding.
encoded_patterns = encode_text(list(df['pattern'])).numpy()
df['encoded_pattern'] = [vector for vector in encoded_patterns]

In [None]:
# Preview the first rows again; the frame now carries the `encoded_pattern`
# column assigned in the preceding cell.
df.head()

In [None]:
# Turn each tag into an integer class id. The fitted encoder is kept so that
# predictions can be mapped back to tag names later.
label_encoder = LabelEncoder()
label_encoder.fit(df['tag'])
df['encoded_tag'] = label_encoder.transform(df['tag'])

In [None]:
# Show the last rows to spot-check the `encoded_tag` column added just above.
df.tail()

In [None]:
# Prepare the dataset
# Each cell of `encoded_pattern` holds one NumPy vector. Passing a Python list
# of ndarrays straight to torch.tensor() takes a very slow element-by-element
# path (and emits a UserWarning), so convert row by row and stack instead.
X = torch.stack(
    [torch.as_tensor(vector, dtype=torch.float32) for vector in df['encoded_pattern']]
)
# Integer class ids produced by the label encoder; CrossEntropyLoss expects long.
y = torch.tensor(df['encoded_tag'].to_numpy(), dtype=torch.long)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Three-layer feed-forward classifier applied to the sentence embeddings
class SimpleNN(nn.Module):
    """Fully connected network: input -> 256 -> 128 -> num_classes.

    ReLU and dropout (p=0.5) follow each of the two hidden layers. The final
    layer returns raw logits, which is what ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self, input_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of features in each input vector.
            num_classes: Number of output logits (one per class).
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=num_classes)
        # A single dropout module is reused after both hidden layers.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return unnormalised class scores for a batch of feature vectors."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        hidden = self.dropout(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
# Build the classifier together with its loss function and optimizer.
# The input width comes from the training features; the output width is the
# number of distinct tags in the dataset.
input_size = X_train.size(1)
num_classes = len(pd.unique(df['tag']))
model = SimpleNN(input_size=input_size, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)

In [None]:
# Wrap the training tensors in a dataset and serve them as shuffled
# mini-batches of 8 samples.
batch_size = 8
train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    shuffle=True,
    batch_size=batch_size,
)

In [None]:
# Number of full passes over the training set. The same value drives both the
# loop and the progress message so the two cannot drift apart (the message
# previously reported "/ 256" while the loop ran 512 epochs).
num_epochs = 512

for epoch in range(num_epochs):
    model.train()  # make sure dropout is active while training
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

In [None]:
# Measure accuracy on the held-out split with dropout disabled and gradient
# tracking switched off.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    # The predicted class is the index of the largest logit in each row.
    predicted = test_outputs.argmax(dim=1)
    n_correct = (predicted == y_test).sum().item()
    accuracy = n_correct / len(y_test)
    print(f'Accuracy: {accuracy:.4f}')

In [None]:
def predict_tag(pattern):
    """Classify one piece of text and report how confident the model is.

    Relies on the module-level ``encode_text``, ``model`` and ``label_encoder``
    objects defined earlier in the notebook.

    Args:
        pattern: The text to classify.

    Returns:
        A ``(predicted_tag, confidence)`` tuple: the original tag label of the
        highest-scoring class and its softmax probability as a Python float.
    """
    # Encoding a one-element list yields one embedding row, which is already
    # the batched input the classifier expects.
    features = encode_text([pattern])
    # Use the classifier trained above; the `improved_model` name referenced
    # here before is never defined in this notebook and raised NameError.
    model.eval()
    with torch.no_grad():
        output = model(features)
        probabilities = torch.softmax(output, dim=1)
        top_probability, predicted = probabilities.max(dim=1)
    confidence = top_probability.item()
    # Map the integer class id back to its original tag label.
    predicted_tag = label_encoder.inverse_transform(predicted.numpy())[0]
    return predicted_tag, confidence

In [None]:
# Smoke-test the whole pipeline on one hand-written query
user_pattern = "diagnosis stunting seperti apa?"
predicted_tag, confidence = predict_tag(
    pattern=user_pattern,
)
print(f'Predicted Tag: {predicted_tag}, Confidence: {confidence:.4f}')

In [None]:
# Save the model
# Persist only the learned weights (state_dict) of the classifier trained
# above. The `improved_model` name used here before is never defined in this
# notebook, so the cell raised NameError on a fresh kernel.
model_path = 'improved_model.pth'
torch.save(model.state_dict(), model_path)

In [None]:
# Display the feature dimension of the training matrix - the same value the
# classifier received as `input_size`.
X_train.shape[1]