In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
import pickle
import os

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
!pip install numpy pandas tensorflow matplotlib



ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)
ERROR: No matching distribution found for tensorflow


In [None]:
# Seed the NumPy and TensorFlow random generators for reproducibility.
np.random.seed(42)
tf.random.set_seed(42)

print("TensorFlow version:", tf.__version__)
# Use the stable tf.config API instead of the `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# 2. Download dan Load Dataset
# Kita akan menggunakan dataset dari beberapa sumber
# Untuk tutorial ini, kita akan membuat fungsi untuk download dataset

def download_poetry_dataset():
    """
    Print where real poetry datasets can be downloaded and return demo data.

    Nothing is downloaded here: the function only prints a list of Kaggle
    dataset links and then returns a small built-in list of sample poem lines.

    Returns:
        list[str]: ten short sample poem lines.
    """
    instructions = (
        "Silakan download salah satu dataset berikut:",
        "1. Kaggle: Poems Dataset (NLP) - https://www.kaggle.com/datasets/michaelarman/poemsdataset",
        "2. Kaggle: Gutenberg Poetry Dataset - https://www.kaggle.com/datasets/terminate9298/gutenberg-poetry-dataset",
        "3. Kaggle: Poetry Foundation Poems - https://www.kaggle.com/datasets/tgdivy/poetry-foundation-poems",
        "4. Kaggle: 100-poems dataset - https://www.kaggle.com/datasets/imbikramsaha/poems",
    )
    for line in instructions:
        print(line)

    # Built-in sample lines used for the demo.
    return [
        "Roses are red, violets are blue, sugar is sweet, and so are you",
        "The sun sets in the west, painting the sky with golden hues",
        "In the quiet of the night, stars whisper ancient secrets",
        "Mountains stand tall and proud, reaching for the endless sky",
        "Ocean waves dance and play, singing songs of distant lands",
        "Time flows like a river, carrying memories downstream",
        "Love is a gentle breeze, touching hearts with tender care",
        "Dreams take flight on wings of hope, soaring beyond reality",
        "Flowers bloom in springtime, bringing joy to weary souls",
        "Moonlight guides the lonely traveler through the darkest path",
    ]


In [None]:
# 3. Preprocessing Data
class TextPreprocessor:
    """Clean raw text and convert it into next-word-prediction training data.

    Attributes:
        tokenizer: Keras ``Tokenizer`` set by ``prepare_sequences`` or
            ``load_tokenizer``; ``None`` until one of them has run.
        vocab_size: Number of output classes for the model (0 until known).
        max_sequence_length: Padded length of each training sequence,
            including the target token (0 until ``prepare_sequences`` runs).
    """

    def __init__(self):
        self.tokenizer = None
        self.vocab_size = 0
        self.max_sequence_length = 0

    def clean_text(self, text):
        """Lowercase ``text`` and strip digits and unsupported characters.

        Only ASCII letters, whitespace and the punctuation ``. , ! ? ; : ' -``
        are kept.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string with single spaces and no leading/trailing
            whitespace.
        """
        text = text.lower()

        # Drop digits, then anything that is not a letter, whitespace or
        # basic punctuation.
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^a-zA-Z\s.,!?;:\'\-]', '', text)

        # Collapse whitespace LAST: the removals above can leave runs of
        # spaces behind (e.g. "a 1 b" -> "a  b").
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def _effective_vocab_size(self):
        """Return the number of distinct token ids the tokenizer can emit."""
        full_size = len(self.tokenizer.word_index) + 1  # +1 for padding id 0
        # With num_words=N the Keras Tokenizer only emits ids below N (rarer
        # words become the OOV id), so N classes are enough and the model's
        # output layer does not need one unit per word ever seen.
        num_words = getattr(self.tokenizer, 'num_words', None)
        return min(num_words, full_size) if num_words else full_size

    def prepare_sequences(self, texts, vocab_size=5000, max_length=50):
        """Fit the tokenizer on ``texts`` and build next-word training pairs.

        Every text contributes one sample per prefix of length >= 2: the
        prefix without its last token is the input, the last token the target.

        Args:
            texts: Iterable of raw strings.
            vocab_size: Passed to the tokenizer as ``num_words``.
            max_length: Length every prefix is left-padded (or cut) to.

        Returns:
            Tuple ``(X, y)``: ``X`` holds the first ``max_length - 1`` token
            ids of each padded prefix, ``y`` the one-hot encoded last token
            with ``self.vocab_size`` classes.

        Raises:
            ValueError: If no text contains at least two tokens, so no
                training pair can be formed.
        """
        cleaned_texts = [self.clean_text(text) for text in texts]

        self.tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(cleaned_texts)
        sequences = self.tokenizer.texts_to_sequences(cleaned_texts)

        # All prefixes with at least two tokens (input + target).
        input_sequences = [
            sequence[:end]
            for sequence in sequences
            for end in range(2, len(sequence) + 1)
        ]
        if not input_sequences:
            raise ValueError(
                "No training sequences could be built: every text has fewer "
                "than two tokens after cleaning."
            )

        self.max_sequence_length = max_length
        padded = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

        # Last column is the word to predict, everything before it the input.
        X = padded[:, :-1]
        y = padded[:, -1]

        self.vocab_size = self._effective_vocab_size()
        y = to_categorical(y, num_classes=self.vocab_size)

        return X, y

    def save_tokenizer(self, filepath):
        """Pickle the fitted tokenizer to ``filepath``.

        Raises:
            ValueError: If no tokenizer has been fitted or loaded yet.
        """
        if self.tokenizer is None:
            raise ValueError("No tokenizer to save; call prepare_sequences() first.")
        with open(filepath, 'wb') as f:
            pickle.dump(self.tokenizer, f)

    def load_tokenizer(self, filepath):
        """Load a pickled tokenizer from ``filepath`` and restore ``vocab_size``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        tokenizer files that this notebook itself produced.
        """
        with open(filepath, 'rb') as f:
            self.tokenizer = pickle.load(f)
        # Keep vocab_size consistent with the loaded tokenizer.
        if self.tokenizer is not None:
            self.vocab_size = self._effective_vocab_size()

In [None]:
# 4. Model LSTM
class LSTMLyricGenerator:
    """Two-layer LSTM next-word model with temperature-based text generation.

    Attributes:
        vocab_size: Size of the embedding input and of the softmax output.
        max_sequence_length: Length of a full training sequence including the
            target token; the model input is one token shorter.
        embedding_dim: Width of the embedding vectors.
        model: Compiled Keras model, or ``None`` until built/assigned.
    """

    def __init__(self, vocab_size, max_sequence_length, embedding_dim=100):
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.model = None

    def build_model(self):
        """Build and compile the Embedding -> LSTM -> LSTM -> softmax model.

        Returns:
            The compiled model (also stored on ``self.model``).
        """
        self.model = Sequential([
            Embedding(self.vocab_size, self.embedding_dim, input_length=self.max_sequence_length-1),
            LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            LSTM(128, dropout=0.2, recurrent_dropout=0.2),
            Dense(self.vocab_size, activation='softmax')
        ])

        self.model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        return self.model

    def train_model(self, X, y, epochs=50, batch_size=32, validation_split=0.2):
        """Fit the model with early stopping and best-model checkpointing.

        Args:
            X: Input token-id sequences.
            y: One-hot encoded target words.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size.
            validation_split: Fraction of the data held out for validation.

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        # Allow training without an explicit build_model() call.
        if self.model is None:
            self.build_model()

        # Without a validation split there is no 'val_loss' to monitor, so
        # fall back to the training loss.
        monitor = 'val_loss' if validation_split else 'loss'
        early_stop = EarlyStopping(monitor=monitor, patience=10, restore_best_weights=True)
        checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True, monitor=monitor)

        history = self.model.fit(
            X, y,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            callbacks=[early_stop, checkpoint],
            verbose=1
        )

        return history

    @staticmethod
    def _apply_temperature(probs, temperature):
        """Rescale a probability vector by ``temperature`` and renormalise it.

        The computation runs in float64 and subtracts the largest scaled
        log-probability before exponentiating, so a low temperature cannot
        underflow every entry to zero (which would give NaN after division),
        and the result sums to 1 tightly enough for ``np.random.choice``.
        """
        probs = np.asarray(probs, dtype=np.float64)
        scaled_logs = np.log(probs + 1e-8) / temperature
        scaled_logs -= scaled_logs.max()
        weights = np.exp(scaled_logs)
        return weights / weights.sum()

    def generate_text(self, tokenizer, seed_text, num_words=20, temperature=1.0):
        """Extend ``seed_text`` by sampling up to ``num_words`` words.

        Args:
            tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
                ``word_index``.
            seed_text: Starting text.
            num_words: Number of sampling steps; a step whose sampled id has
                no word in ``word_index`` appends nothing.
            temperature: Sampling temperature; must be > 0. Lower values make
                the output more deterministic.

        Returns:
            The seed text followed by the generated words.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be greater than 0")

        # Build the id -> word table once instead of scanning word_index for
        # every generated word.
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}

        result = seed_text

        for _ in range(num_words):
            # Encode the text generated so far as a fixed-length model input.
            token_list = tokenizer.texts_to_sequences([result])[0]
            token_list = pad_sequences([token_list], maxlen=self.max_sequence_length-1, padding='pre')

            predicted_probs = self.model.predict(token_list, verbose=0)[0]
            predicted_probs = self._apply_temperature(predicted_probs, temperature)

            predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)

            output_word = index_to_word.get(int(predicted_index), "")
            if output_word:
                result += " " + output_word

        return result

In [None]:
# 5. Fungsi Utama untuk Training
def main_training():
    """Run the full training pipeline on the sample dataset.

    Loads the texts, builds training sequences, trains the LSTM model, saves
    the model ('lstm_lyric_generator_model.h5') and the tokenizer
    ('tokenizer.pkl'), and plots the loss/accuracy curves.

    Returns:
        Tuple ``(generator, preprocessor)`` holding the trained
        ``LSTMLyricGenerator`` and the fitted ``TextPreprocessor``.
    """
    print("=== LSTM Lyric Generator Training ===")

    # Load data
    print("Loading dataset...")
    texts = download_poetry_dataset()

    # Preprocessing
    print("Preprocessing data...")
    preprocessor = TextPreprocessor()
    X, y = preprocessor.prepare_sequences(texts, vocab_size=5000, max_length=30)

    print(f"Vocabulary size: {preprocessor.vocab_size}")
    print(f"Sequence length: {preprocessor.max_sequence_length}")
    print(f"Training samples: {len(X)}")

    # Build model
    print("Building model...")
    generator = LSTMLyricGenerator(
        vocab_size=preprocessor.vocab_size,
        max_sequence_length=preprocessor.max_sequence_length,
        embedding_dim=100
    )

    model = generator.build_model()
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # Train model
    print("Training model...")
    history = generator.train_model(X, y, epochs=100, batch_size=16)

    # Save model and tokenizer so generation can run in a later session
    model.save('lstm_lyric_generator_model.h5')
    preprocessor.save_tokenizer('tokenizer.pkl')

    # Plot training history: loss on the left, accuracy on the right
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_title('Model Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
    acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    acc_ax.set_title('Model Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

    return generator, preprocessor

In [None]:
# 6. Fungsi untuk Generate Lirik
def generate_lyrics(seed_text="the sun", num_words=30, temperature=0.8):
    """Generate new lyric/poem text from the saved model and tokenizer.

    Args:
        seed_text: Text the generation starts from.
        num_words: Number of words to sample.
        temperature: Sampling temperature passed to ``generate_text``.

    Returns:
        The generated text, or ``None`` if the saved model/tokenizer is
        missing or generation fails (an error message is printed).
    """
    model_path = 'lstm_lyric_generator_model.h5'
    tokenizer_path = 'tokenizer.pkl'

    # Check for the artefacts explicitly so the "train first" hint is shown
    # only when that is really the problem.
    missing = [path for path in (model_path, tokenizer_path) if not os.path.exists(path)]
    if missing:
        print(f"Error: file not found: {', '.join(missing)}")
        print("Pastikan model sudah dilatih terlebih dahulu dengan menjalankan main_training()")
        return None

    try:
        # Load model and tokenizer
        model = tf.keras.models.load_model(model_path)

        preprocessor = TextPreprocessor()
        preprocessor.load_tokenizer(tokenizer_path)

        # Derive the sequence length from the loaded model instead of
        # hard-coding it: the model input is one token shorter than a full
        # training sequence. Fall back to the training default of 30 when the
        # model does not expose a fixed input length.
        input_shape = getattr(model, 'input_shape', None)
        timesteps = (
            input_shape[1]
            if isinstance(input_shape, tuple) and len(input_shape) > 1
            else None
        )
        max_sequence_length = timesteps + 1 if timesteps else 30

        # Create generator instance around the loaded model
        generator = LSTMLyricGenerator(
            vocab_size=len(preprocessor.tokenizer.word_index) + 1,
            max_sequence_length=max_sequence_length
        )
        generator.model = model

        return generator.generate_text(
            preprocessor.tokenizer,
            seed_text,
            num_words=num_words,
            temperature=temperature
        )

    except Exception as e:
        # Best-effort API: report the failure and signal it with None.
        print(f"Error: {e}")
        return None


In [None]:
# 7. Fungsi untuk Evaluasi Model
def evaluate_model(seed_texts=None, temperatures=None, num_words=25):
    """Print generated text for several seed texts and temperatures.

    NOTE(review): this cell defines ``evaluate_model`` twice; the later
    definition overrides this one. Remove one of the copies.

    Args:
        seed_texts: Seed strings to start from; defaults to five built-in
            seeds.
        temperatures: Sampling temperatures to try for each seed; defaults to
            ``[0.5, 0.8, 1.0, 1.2]``.
        num_words: Number of words to generate per sample.
    """
    if seed_texts is None:
        seed_texts = [
            "love is",
            "the moon",
            "in the night",
            "flowers bloom",
            "time flows"
        ]
    if temperatures is None:
        temperatures = [0.5, 0.8, 1.0, 1.2]

    print("=== Generated Lyrics/Poetry ===")

    for seed in seed_texts:
        print(f"\nSeed: '{seed}'")
        print("-" * 50)

        for temp in temperatures:
            generated = generate_lyrics(seed, num_words=num_words, temperature=temp)
            # generate_lyrics returns None on failure; skip those samples.
            if generated:
                print(f"Temperature {temp}: {generated}")
        print()

# 8. Fungsi Helper untuk Analisis
def analyze_dataset(texts):
    """Print basic statistics for ``texts`` and return word frequencies.

    NOTE(review): this cell defines ``analyze_dataset`` twice; the later
    definition overrides this one. Remove one of the copies.

    Args:
        texts: List of strings.

    Returns:
        ``collections.Counter`` mapping each lowercased, whitespace-separated
        word to its number of occurrences.
    """
    print("=== Dataset Analysis ===")

    # Basic statistics
    total_texts = len(texts)
    total_words = sum(len(text.split()) for text in texts)
    # Guard against an empty dataset, which would otherwise divide by zero.
    avg_words = total_words / total_texts if total_texts else 0.0

    print(f"Total texts: {total_texts}")
    print(f"Total words: {total_words}")
    print(f"Average words per text: {avg_words:.2f}")

    # Word frequency
    all_words = " ".join(texts).lower().split()
    word_freq = Counter(all_words)

    print(f"Unique words: {len(word_freq)}")
    print(f"Most common words: {word_freq.most_common(10)}")

    return word_freq

# 7. Fungsi untuk Evaluasi Model
def evaluate_model(seed_texts=None, temperatures=None, num_words=25):
    """Print generated text for several seed texts and temperatures.

    NOTE(review): ``evaluate_model`` is also defined earlier in this cell;
    this later definition is the one in effect. Remove one of the copies.

    Args:
        seed_texts: Seed strings to start from; defaults to five built-in
            seeds.
        temperatures: Sampling temperatures to try for each seed; defaults to
            ``[0.5, 0.8, 1.0, 1.2]``.
        num_words: Number of words to generate per sample.
    """
    if seed_texts is None:
        seed_texts = [
            "love is",
            "the moon",
            "in the night",
            "flowers bloom",
            "time flows"
        ]
    if temperatures is None:
        temperatures = [0.5, 0.8, 1.0, 1.2]

    print("=== Generated Lyrics/Poetry ===")

    for seed in seed_texts:
        print(f"\nSeed: '{seed}'")
        print("-" * 50)

        for temp in temperatures:
            generated = generate_lyrics(seed, num_words=num_words, temperature=temp)
            # generate_lyrics returns None on failure; skip those samples.
            if generated:
                print(f"Temperature {temp}: {generated}")
        print()

# 8. Fungsi Helper untuk Analisis
def analyze_dataset(texts):
    """Print basic statistics for ``texts`` and return word frequencies.

    NOTE(review): ``analyze_dataset`` is also defined earlier in this cell;
    this later definition is the one in effect. Remove one of the copies.

    Args:
        texts: List of strings.

    Returns:
        ``collections.Counter`` mapping each lowercased, whitespace-separated
        word to its number of occurrences.
    """
    print("=== Dataset Analysis ===")

    # Basic statistics
    total_texts = len(texts)
    total_words = sum(len(text.split()) for text in texts)
    # Guard against an empty dataset, which would otherwise divide by zero.
    avg_words = total_words / total_texts if total_texts else 0.0

    print(f"Total texts: {total_texts}")
    print(f"Total words: {total_words}")
    print(f"Average words per text: {avg_words:.2f}")

    # Word frequency
    all_words = " ".join(texts).lower().split()
    word_freq = Counter(all_words)

    print(f"Unique words: {len(word_freq)}")
    print(f"Most common words: {word_freq.most_common(10)}")

    return word_freq

In [None]:
# 9. Contoh Penggunaan
def _prompt_number(prompt, cast, default):
    """Ask for a number; return ``default`` on empty or unparseable input."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        return cast(raw)
    except ValueError:
        # A typo should not abort the whole menu with a traceback.
        print(f"Invalid value {raw!r}; using default {default}.")
        return default


if __name__ == "__main__":
    print("LSTM Lyric/Poetry Generator")
    print("=" * 50)

    # Menu options
    print("1. Train new model")
    print("2. Generate lyrics (requires trained model)")
    print("3. Evaluate model")
    print("4. Analyze dataset")

    # Strip so that " 1" or "1 " still selects an option.
    choice = input("Choose option (1-4): ").strip()

    if choice == "1":
        generator, preprocessor = main_training()
        print("Training completed!")

    elif choice == "2":
        seed = input("Enter seed text (or press Enter for default): ") or "the sun"
        num_words = _prompt_number("Number of words to generate (default 30): ", int, 30)
        temperature = _prompt_number("Temperature (0.5-2.0, default 0.8): ", float, 0.8)

        result = generate_lyrics(seed, num_words, temperature)
        if result:
            print(f"\nGenerated text: {result}")

    elif choice == "3":
        evaluate_model()

    elif choice == "4":
        texts = download_poetry_dataset()
        analyze_dataset(texts)

    else:
        print("Invalid choice!")