In [9]:
#Import necessary libraries

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
import numpy as np


# Contants
vocab_size = 10000
max_length = 100
padding_type = "post"
truncating_type = "post"

In [10]:
# Load dataset



def load_data():
    # TensorFlow Datasets IMDb adathalmaz betöltése
    dataset, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
    train_data, test_data = dataset["train"], dataset["test"]

    train_texts, train_labels = [], []
    for text, label in train_data:
        train_texts.append(text.numpy().decode("utf-8"))
        train_labels.append(label.numpy())

    test_texts, test_labels = [], []
    for text, label in test_data:
        test_texts.append(text.numpy().decode("utf-8"))
        test_labels.append(label.numpy())

    return train_texts, train_labels, test_texts, test_labels


In [11]:
# Preprocess dataset


def clean_text(text):
    # Szövegtisztítás (kisbetűsítés, speciális karakterek eltávolítása, stb.)
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^a-z ]", "")
    return text.numpy().decode("utf-8")

# 2. Adatok előfeldolgozása
def preprocess_data(texts, labels):
    # Szövegtisztítás alkalmazása
    cleaned_texts = [clean_text(tf.constant(text)) for text in texts]

    # Tokenizálás és szekvenciák pad-elése
    tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
    tokenizer.fit_on_texts(cleaned_texts)
    sequences = tokenizer.texts_to_sequences(cleaned_texts)
    padded_sequences = pad_sequences(sequences, maxlen=200, padding="post", truncating="post")

    return padded_sequences, np.array(labels), tokenizer

In [12]:
# Define the sentiment model

def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=10000, output_dim=64, input_length=200),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model





In [13]:
# Load dataset

train_texts, train_labels, test_texts, test_labels = load_data()

train_sequences, train_labels, tokenizer = preprocess_data(train_texts, train_labels)
test_sequences, test_labels, _ = preprocess_data(test_texts, test_labels)

X_train, X_val, y_train, y_val = train_test_split(train_sequences, train_labels, test_size=0.2, random_state=42)

In [14]:
# Train model


model = create_model()

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,  # Nagyobb adathalmaz miatt csökkenthetjük az epoch-ok számát
    batch_size=64
)





Epoch 1/5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 36ms/step - accuracy: 0.6230 - loss: 0.6138 - val_accuracy: 0.8504 - val_loss: 0.3491
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 34ms/step - accuracy: 0.8799 - loss: 0.3240 - val_accuracy: 0.8648 - val_loss: 0.3331
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 33ms/step - accuracy: 0.9181 - loss: 0.2403 - val_accuracy: 0.8588 - val_loss: 0.3538
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.9473 - loss: 0.1577 - val_accuracy: 0.8556 - val_loss: 0.4216
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.9553 - loss: 0.1395 - val_accuracy: 0.8438 - val_loss: 0.4992


In [15]:
# Testing and validation


# 6. Tesztelés
loss, accuracy = model.evaluate(test_sequences, test_labels)
print(f"Teszt pontosság: {accuracy * 100:.2f}%")

# Tokenizer mentése későbbi használatra
def save_tokenizer(tokenizer, path="tokenizer.json"):
    with open(path, "w", encoding="utf-8") as f:
        f.write(tokenizer.to_json())

save_tokenizer(tokenizer)

# Modell mentése
model.save("sentiment_model_imdb.h5")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.5216 - loss: 1.7619
Teszt pontosság: 52.36%




In [20]:
#Custom input

import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Tokenizer betöltése
def load_tokenizer(path="tokenizer.json"):
    with open(path, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)  # JSON objektum betöltése
    tokenizer_json = json.dumps(tokenizer_data)  # Visszaalakítás stringgé
    return tokenizer_from_json(tokenizer_json)

# Egyedi szöveg tesztelése
def predict_sentiment(text, model, tokenizer):
    def preprocess_single_text(text):
        # Szöveg tisztítása
        text = tf.constant(text)
        text = tf.strings.lower(text)
        text = tf.strings.regex_replace(text, "[^a-z ]", "")
        return text.numpy().decode("utf-8")

    # Szöveg tisztítása és tokenizálása
    cleaned_text = preprocess_single_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=200, padding="post", truncating="post")

    # Előrejelzés
    prediction = model.predict(padded_sequence)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment, prediction

# Modell és tokenizer betöltése
loaded_model = tf.keras.models.load_model("sentiment_model_imdb.h5")
loaded_tokenizer = load_tokenizer()

# Szöveg tesztelése
text_input = "The movie was absolutely terrible. I had just wasted my time."
sentiment, confidence = predict_sentiment(text_input, loaded_model, loaded_tokenizer)

print(f"Input: {text_input}")
print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.2f})")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step
Input: The movie was absolutely terrible. I had just wasted my time.
Predicted Sentiment: Negative (Confidence: 0.04)
