# Advanced RNN Lyrics Generator with RAG System
## Generative AI Model for Ariana Grande Lyrics

This notebook implements an advanced recurrent neural network (RNN) with:
- **5 hidden layers** with LSTM architecture
- **5 activation functions**: ReLU, Tanh, ELU, SELU, Sigmoid
- **1 output activation function**: Softmax (trained with sparse categorical cross-entropy loss)
- **RAG (Retrieval-Augmented Generation)** system for context-aware generation
- **MongoDB integration** for embeddings storage
- **TensorFlow/Keras** for deep learning
- **PySpark** for big data processing
- **scikit-learn** for ML utilities
- **NumPy, pandas** for data manipulation
- **pickle** for model persistence


In [None]:
# Import all required libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import pymongo
from pymongo import MongoClient
import pickle
import json

import re
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator in use so repeated runs are comparable
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(42)

# Report which library versions the notebook is running against
print("✅ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")


## 1. Data Loading and Preprocessing


In [None]:
# Load the Ariana Grande lyrics dataset. The code below reads the
# 'Title', 'Album', 'Year' and 'Lyric' columns of this CSV.
df = pd.read_csv('ArianaGrande.csv')

print(f"📊 Dataset shape: {df.shape}")
print(f"📋 Columns: {df.columns.tolist()}")
print("\n🔍 First few rows:")
print(df.head())

# Basic statistics
print("\n📈 Dataset Statistics:")
print(f"Total songs: {df['Title'].nunique()}")
print(f"Total albums: {df['Album'].nunique()}")
print(f"Year range: {df['Year'].min()} - {df['Year'].max()}")
print(f"Average lyrics length: {df['Lyric'].str.len().mean():.0f} characters")

# Display sample lyrics. Rows with a missing lyric are skipped: pandas stores
# them as float NaN, and slicing a float would raise TypeError.
print("\n🎵 Sample Lyrics:")
for i, lyric in enumerate(df['Lyric'].dropna().head(3)):
    print(f"{i+1}. {str(lyric)[:100]}...")
    print()


In [None]:
class LyricsPreprocessor:
    """Clean raw lyrics and convert them into next-token training data.

    The attributes start empty and are filled in by ``prepare_data``:
    ``tokenizer`` (the fitted Keras ``Tokenizer``), ``vocab_size`` and
    ``max_length``.
    """

    def __init__(self):
        self.tokenizer = None
        self.vocab_size = 0
        self.max_length = 0

    def clean_text(self, text):
        """Normalize one lyric and wrap it in ``<start>`` / ``<end>`` markers.

        Args:
            text: Raw lyric; anything that is not missing is passed
                through ``str()``.

        Returns:
            The lower-cased lyric with unsupported characters removed and
            whitespace collapsed, surrounded by ``<start>`` and ``<end>``.
            An empty string (without markers) when ``text`` is missing.
        """
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s\.,!?;:\'"-]', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Add the markers last so the character filter above cannot strip
        # their angle brackets
        text = f"<start> {text} <end>"

        return text

    def prepare_data(self, lyrics_list, max_vocab_size=15000, max_length=150):
        """Clean, tokenize and pad every lyric.

        Fits a new tokenizer and stores it, together with ``vocab_size`` and
        ``max_length``, on the instance.

        Args:
            lyrics_list: Iterable of raw lyrics.
            max_vocab_size: Passed to the tokenizer as ``num_words``.
            max_length: Length every sequence is padded (at the end) to.

        Returns:
            A ``(padded_sequences, cleaned_lyrics)`` tuple.
        """
        print("🧹 Cleaning lyrics data...")

        # Clean all lyrics
        cleaned_lyrics = [self.clean_text(lyric) for lyric in lyrics_list]

        # filters='' keeps punctuation and the <start>/<end> markers intact
        self.tokenizer = Tokenizer(
            num_words=max_vocab_size,
            filters='',
            oov_token='<oov>'
        )

        # Fit tokenizer on cleaned lyrics
        self.tokenizer.fit_on_texts(cleaned_lyrics)

        # Convert texts to sequences
        sequences = self.tokenizer.texts_to_sequences(cleaned_lyrics)

        # Pad sequences
        padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

        # +1 accounts for index 0, which is used for padding.
        # NOTE(review): word_index lists every word seen during fitting, so
        # this can exceed max_vocab_size even though num_words caps the
        # indices that are emitted - confirm whether a cap is wanted here.
        self.vocab_size = len(self.tokenizer.word_index) + 1
        self.max_length = max_length

        print(f"✅ Vocabulary size: {self.vocab_size}")
        print(f"✅ Max sequence length: {self.max_length}")
        print(f"✅ Total sequences: {len(sequences)}")

        return padded_sequences, cleaned_lyrics

    def create_training_data(self, sequences):
        """Expand padded sequences into (prefix, next token) training pairs.

        For every non-padding token after the first position, the tokens
        before it become one input row and the token itself the target.

        Args:
            sequences: Padded integer sequences, as returned by
                ``prepare_data``.

        Returns:
            A ``(X, y)`` tuple of NumPy arrays: ``X`` holds the prefixes
            padded at the end to ``max_length - 1`` and ``y`` the target
            token ids.
        """
        print("🔄 Creating training sequences...")
        X, y = [], []

        for sequence in sequences:
            for i in range(1, len(sequence)):
                if sequence[i] != 0:  # Skip padding tokens
                    X.append(sequence[:i])
                    y.append(sequence[i])

        # Pad input sequences
        X = pad_sequences(X, maxlen=self.max_length-1, padding='post')

        # Convert the targets before reporting: a plain list has no .shape,
        # so printing y.shape on the list raised AttributeError.
        y = np.array(y)

        print(f"✅ Training samples: {len(X)}")
        print(f"✅ Input shape: {X.shape}")
        print(f"✅ Output shape: {y.shape}")

        return np.array(X), y

# Build the preprocessor and feed it the raw lyric column
preprocessor = LyricsPreprocessor()
raw_lyrics = df['Lyric'].tolist()

# Tokenize and pad the lyrics
sequences, cleaned_lyrics = preprocessor.prepare_data(
    raw_lyrics, max_vocab_size=15000, max_length=150
)

# Expand the padded sequences into (prefix, next token) training pairs
X, y = preprocessor.create_training_data(sequences)


## 2. Advanced RNN Architecture with 5 Hidden Layers and 5 Activation Functions


In [None]:
class AdvancedRNNGenerator:
    """Build and hold a stacked-LSTM next-token prediction model.

    ``model`` is ``None`` until ``build_model`` has been called.
    """

    # Activation layer placed after each LSTM layer, in stacking order
    _LSTM_ACTIVATIONS = ('relu', 'tanh', 'elu', 'selu', 'sigmoid')

    def __init__(self, vocab_size, embedding_dim=256, hidden_units=512, max_length=150):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.max_length = max_length
        self.model = None

    def build_model(self):
        """Build and compile the network, store it on ``self.model`` and return it.

        Architecture: embedding -> five LSTM layers, each followed by its own
        activation layer (ReLU, Tanh, ELU, SELU, Sigmoid) -> dense ReLU layer
        -> softmax over the vocabulary.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        print("🏗️ Building Advanced RNN Architecture...")

        model = models.Sequential()

        # Input layer - Embedding. Inputs are one token shorter than
        # max_length.
        model.add(layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            input_length=self.max_length-1,
            name='embedding_layer'
        ))

        # Hidden layers 1-5: LSTM followed by a dedicated activation layer
        last_index = len(self._LSTM_ACTIVATIONS)
        for index, activation in enumerate(self._LSTM_ACTIVATIONS, start=1):
            model.add(layers.LSTM(
                units=self.hidden_units,
                # Only the final LSTM collapses the sequence to one vector
                return_sequences=index < last_index,
                dropout=0.2,
                recurrent_dropout=0.2,
                name=f'lstm_layer_{index}'
            ))
            model.add(layers.Activation(activation, name=f'activation_{activation}'))

        # Dense layer ahead of the output
        model.add(layers.Dense(
            units=self.hidden_units,
            activation='relu',
            name='dense_layer_1'
        ))

        # Output layer: softmax turns the scores into a probability
        # distribution over the vocabulary (the loss is set in compile())
        model.add(layers.Dense(
            units=self.vocab_size,
            activation='softmax',
            name='output_layer'
        ))

        # Adam optimizer with explicitly spelled-out hyperparameters
        optimizer = optimizers.Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07
        )

        # Sparse loss: targets are integer token ids, not one-hot vectors
        model.compile(
            optimizer=optimizer,
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        self.model = model
        print("✅ Advanced RNN Model Built Successfully!")
        return model

    def get_model_summary(self):
        """Return the result of ``self.model.summary()``.

        Returns:
            Whatever ``summary()`` returns, or ``None`` when no model has
            been built yet. Keras prints the summary table itself, so the
            return value is normally ``None`` in both cases.
        """
        # Compare against None explicitly so that a built model is never
        # skipped just because it evaluates as falsy
        if self.model is None:
            return None
        return self.model.summary()

# Initialize the advanced RNN generator with the vocabulary size and
# sequence length produced by the preprocessor
rnn_generator = AdvancedRNNGenerator(
    vocab_size=preprocessor.vocab_size,
    embedding_dim=256,
    hidden_units=512,
    max_length=preprocessor.max_length
)

# Build the model
model = rnn_generator.build_model()

print("\n📋 Model Architecture Summary:")
# get_model_summary() delegates to Keras' model.summary(), which prints the
# table itself and returns None; wrapping the call in print() only added a
# stray "None" line after the table.
rnn_generator.get_model_summary()
