
# Personal Reflection

The original code provided a use case that was too easy, so the masked and unmasked models were indistinguishable. A longer sequence was therefore created so that the unmasked implementation would fail (i.e., be less accurate).

In [27]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Three toy sequences of unequal length (3, 5 and 2 tokens).
sequences = [[1, 2, 3], [4, 5, 6, 7, 8], [9, 10]]

# Same input and same target length; only the side that receives the zeros differs.
pre_padded = pad_sequences(sequences, maxlen=6, padding='pre')
post_padded = pad_sequences(sequences, maxlen=6, padding='post')

# sep="\n" puts each array on the line after its heading.
print("Pre-padded:", pre_padded, sep="\n")
print("\nPost-padded:", post_padded, sep="\n")


Pre-padded:
[[ 0  0  0  1  2  3]
 [ 0  4  5  6  7  8]
 [ 0  0  0  0  9 10]]

Post-padded:
[[ 1  2  3  0  0  0]
 [ 4  5  6  7  8  0]
 [ 9 10  0  0  0  0]]


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def explain_padding_difference(seed=42):
    """
    Explain why pre-padding is better for classification.

    With pre-padding:
    - Actual content is at the END of the sequence
    - RNN's final hidden state captures actual content
    - Final state is used for classification

    With post-padding:
    - Actual content is at the BEGINNING
    - RNN processes padding AFTER content
    - Final state is influenced by padding zeros

    The function pads the toy sequence [1, 2, 3] both ways, feeds each
    version through the same 4-unit SimpleRNN (no masking) and prints the
    hidden state at every timestep plus the final state.

    Args:
        seed: Seed for the demo RNN's weight initializers so the printed
            hidden states are the same on every run. Pass None to get
            fresh random weights on each call.

    Returns:
        None. All results are printed.
    """
    seq = [[1, 2, 3]]
    maxlen = 6

    pre = pad_sequences(seq, maxlen=maxlen, padding='pre')
    post = pad_sequences(seq, maxlen=maxlen, padding='post')

    print("Original sequence: [1, 2, 3]")
    print(f"Pre-padded:  {pre[0]} -> RNN processes: [0,0,0] then [1,2,3] -> final state from CONTENT")
    print(f"Post-padded: {post[0]} -> RNN processes: [1,2,3] then [0,0,0] -> final state from PADDING")

    print("\n--- Visual Timeline ---")
    print("Pre-padding:  [pad][pad][pad][1][2][3] -> Final state = f(3)")
    print("Post-padding: [1][2][3][pad][pad][pad] -> Final state = f(0)")

    # Build a simple RNN to show the hidden state difference. The initializers
    # are the layer's defaults (Glorot-uniform kernel, orthogonal recurrent
    # kernel), only seeded, so the demo is reproducible.
    rnn = layers.SimpleRNN(
        4,
        return_sequences=True,
        return_state=True,
        kernel_initializer=keras.initializers.GlorotUniform(seed=seed),
        recurrent_initializer=keras.initializers.Orthogonal(seed=seed),
    )

    # The RNN expects float input shaped (batch, timesteps, features); each
    # token id is used directly as a single scalar feature.
    pre_seq = pre.astype(np.float32).reshape(1, -1, 1)
    post_seq = post.astype(np.float32).reshape(1, -1, 1)

    # The same layer instance (hence the same weights) processes both inputs,
    # so any difference in the states comes from the padding position alone.
    pre_out, pre_state = rnn(pre_seq)
    post_out, post_state = rnn(post_seq)

    print("\n--- Hidden States at Each Timestep ---")
    for heading, padded, outputs in (
        ("Pre-padded sequence hidden states:", pre, pre_out),
        ("\nPost-padded sequence hidden states:", post, post_out),
    ):
        print(heading)
        for t, (token, h) in enumerate(zip(padded[0], outputs[0].numpy())):
            print(f"  t={t} (input={token}): {h.round(3)}")

    print(f"\nFinal state (pre):  {pre_state.numpy().round(3)} <- Captures actual content!")
    print(f"Final state (post): {post_state.numpy().round(3)} <- Contaminated by padding!")

explain_padding_difference()


Original sequence: [1, 2, 3]
Pre-padded:  [0 0 0 1 2 3] -> RNN processes: [0,0,0] then [1,2,3] -> final state from CONTENT
Post-padded: [1 2 3 0 0 0] -> RNN processes: [1,2,3] then [0,0,0] -> final state from PADDING

--- Visual Timeline ---
Pre-padding:  [pad][pad][pad][1][2][3] -> Final state = f(3)
Post-padding: [1][2][3][pad][pad][pad] -> Final state = f(0)

--- Hidden States at Each Timestep ---
Pre-padded sequence hidden states:
  t=0 (input=0): [0. 0. 0. 0.]
  t=1 (input=0): [0. 0. 0. 0.]
  t=2 (input=0): [0. 0. 0. 0.]
  t=3 (input=1): [ 0.524 -0.438  0.516 -0.73 ]
  t=4 (input=2): [ 0.812 -0.957  0.532 -0.964]
  t=5 (input=3): [ 0.947 -0.993  0.673 -0.989]

Post-padded sequence hidden states:
  t=0 (input=1): [ 0.524 -0.438  0.516 -0.73 ]
  t=1 (input=2): [ 0.812 -0.957  0.532 -0.964]
  t=2 (input=3): [ 0.947 -0.993  0.673 -0.989]
  t=3 (input=0): [-0.023 -0.902 -0.782  0.12 ]
  t=4 (input=0): [ 0.431 -0.02  -0.073  0.802]
  t=5 (input=0): [-0.21   0.59  -0.488  0.209]

Final s

### Task 1.2: Truncation Strategies
When sequences exceed max_length, you must truncate.


In [29]:
# One sequence that exceeds the limit (10 tokens) and one that already fits (5 tokens).
long_sequences = [list(range(1, 11)), list(range(11, 16))]

# Same data and same maxlen; only the end that gets cut differs.
pre_truncated = pad_sequences(long_sequences, maxlen=5, truncating='pre')
post_truncated = pad_sequences(long_sequences, maxlen=5, truncating='post')

for heading, kept, note in (
    ("Pre-truncated (keep end):", pre_truncated,
     "-> Removes from beginning, keeps last 5 elements"),
    ("\nPost-truncated (keep beginning):", post_truncated,
     "-> Removes from end, keeps first 5 elements"),
):
    print(heading)
    print(kept)
    print(note)

print("\n--- When to use each ---")
print("Pre-truncating (keep END): Good for sentiment where final words matter most")
print("Post-truncating (keep BEGIN): Good for documents where intro is key")


Pre-truncated (keep end):
[[ 6  7  8  9 10]
 [11 12 13 14 15]]
-> Removes from beginning, keeps last 5 elements

Post-truncated (keep beginning):
[[ 1  2  3  4  5]
 [11 12 13 14 15]]
-> Removes from end, keeps first 5 elements

--- When to use each ---
Pre-truncating (keep END): Good for sentiment where final words matter most
Post-truncating (keep BEGIN): Good for documents where intro is key


## Part 3: Complete Pipeline Class
### Task 3.1: Build Production Pipeline
Create a reusable preprocessing class that handles tokenization, vocabulary building, padding, and train/test consistency.


In [30]:
class SequencePreprocessor:
    """
    Complete preprocessing pipeline for sequence data.

    Handles:
    - Tokenization
    - Vocabulary building with size limits
    - Padding and truncation
    - Train/test consistency

    Args:
        max_vocab_size: Passed to the Keras Tokenizer as ``num_words``.
            ``None`` means "no limit".
        max_length: Length every sequence is padded/truncated to. If None,
            it is inferred in ``fit`` as the 95th percentile of the
            training sequence lengths.
        padding: 'pre' or 'post', forwarded to ``pad_sequences``.
        truncating: 'pre' or 'post', forwarded to ``pad_sequences``.
        embedding_dim: Embedding width reported by ``get_config``.
    """

    def __init__(self, max_vocab_size=10000, max_length=None,
                 padding='pre', truncating='pre', embedding_dim=128):
        self.max_vocab_size = max_vocab_size
        self.max_length = max_length
        self.padding = padding
        self.truncating = truncating
        self.embedding_dim = embedding_dim
        self.tokenizer = None
        self.fitted = False
        # Last value of max_length that fit() inferred by itself (None if it
        # never did). Lets a later fit() tell "inferred, safe to recompute"
        # apart from "chosen by the caller, must be kept".
        self._inferred_max_length = None

    def fit(self, texts):
        """
        Fit tokenizer on training texts.

        Args:
            texts: Iterable of strings (the training corpus).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If max_length has to be inferred but texts is empty.
        """
        from tensorflow.keras.preprocessing.text import Tokenizer

        # Materialise once: the corpus is iterated twice below, which would
        # silently exhaust a generator after the first pass.
        texts = list(texts)

        self.tokenizer = Tokenizer(
            num_words=self.max_vocab_size,
            oov_token='<UNK>'
        )
        self.tokenizer.fit_on_texts(texts)

        # Infer max_length (95th percentile) when the caller did not set one.
        # A length inferred by an earlier fit() is recomputed so that
        # re-fitting on a new corpus does not keep a stale value.
        previously_inferred = getattr(self, '_inferred_max_length', None)
        if self.max_length is None or self.max_length == previously_inferred:
            lengths = [len(s) for s in self.tokenizer.texts_to_sequences(texts)]
            if not lengths:
                raise ValueError(
                    "Cannot infer max_length from an empty corpus; "
                    "pass max_length explicitly."
                )
            # Floor at 1 so a corpus of empty texts cannot yield zero-width output.
            self.max_length = max(1, int(np.percentile(lengths, 95)))
            self._inferred_max_length = self.max_length

        self.fitted = True
        return self

    def transform(self, texts):
        """
        Transform texts to padded sequences.

        Uses the tokenizer and max_length established by ``fit``, so train
        and test data are encoded identically.

        Raises:
            ValueError: If called before ``fit``.
        """
        if not self.fitted:
            raise ValueError("Preprocessor not fitted. Call fit() first.")

        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(
            sequences,
            maxlen=self.max_length,
            padding=self.padding,
            truncating=self.truncating
        )

    def fit_transform(self, texts):
        """Fit and transform in one step."""
        # Materialise so the same items are seen by both fit and transform.
        texts = list(texts)
        return self.fit(texts).transform(texts)

    @property
    def vocab_size(self):
        """Return vocabulary size (0 before fitting)."""
        if self.tokenizer is None:
            return 0
        # +1 because the Keras Tokenizer never assigns index 0 to a word.
        full_size = len(self.tokenizer.word_index) + 1
        if self.max_vocab_size is None:
            # No cap configured: min() against None would raise TypeError.
            return full_size
        return min(full_size, self.max_vocab_size)

    def get_config(self):
        """Return configuration for model building."""
        return {
            'vocab_size': self.vocab_size,
            'max_length': self.max_length,
            'embedding_dim': self.embedding_dim
        }

print("SequencePreprocessor class defined!")


SequencePreprocessor class defined!


### Task 3.2: Test Complete Pipeline
Use the preprocessor on sample movie reviews and build a model from the config.


In [31]:
# Sample movie reviews, each paired with its sentiment label.
# Keeping text and label side by side prevents the two lists drifting apart.
labeled_reviews = [
    ("This movie was absolutely fantastic! Great acting.", 1),
    ("Terrible film. Complete waste of time.", 0),
    ("An average movie, nothing special.", 0),
    ("One of the best I've seen in years!", 1),
    ("Boring and predictable storyline.", 0),
    ("A masterpiece of modern cinema.", 1),
    ("Couldn't finish it, too slow and dull.", 0),
    ("Incredible performances by the entire cast.", 1),
    ("Generic plot with no surprises.", 0),
    ("Visually stunning and emotionally powerful.", 1),
]

train_texts = [review for review, _ in labeled_reviews]
train_labels = np.array([label for _, label in labeled_reviews])

# Test pipeline: max_length is left unset, so the preprocessor infers it during fit.
preprocessor = SequencePreprocessor(max_vocab_size=1000)
X_train = preprocessor.fit_transform(train_texts)

print(f"Config: {preprocessor.get_config()}")
print(f"Processed shape: {X_train.shape}")
print(f"\nFirst review: '{train_texts[0]}'")
print(f"Tokenized & padded: {X_train[0]}")


Config: {'vocab_size': 54, 'max_length': 7, 'embedding_dim': 128}
Processed shape: (10, 7)

First review: 'This movie was absolutely fantastic! Great acting.'
Tokenized & padded: [ 6  4  7  8  9 10 11]


In [32]:
# Build model using config from preprocessor
config = preprocessor.get_config()

# Seed Python, NumPy and TensorFlow so weight initialisation, shuffling and the
# reported accuracy are repeatable across Restart & Run All.
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # An explicit Input layer replaces Embedding's `input_length` argument,
    # which is deprecated in Keras 3. It also builds the model up front so
    # summary() can report concrete shapes.
    keras.Input(shape=(config['max_length'],), dtype='int32'),
    # mask_zero=True makes downstream layers skip the 0-valued padding steps.
    layers.Embedding(config['vocab_size'], config['embedding_dim'], mask_zero=True),
    layers.LSTM(64),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Quick training demo (accuracy is measured on the training data itself,
# so it only shows that the pipeline and model fit together).
print("\nTraining on sample data...")
model.fit(X_train, train_labels, epochs=10, verbose=0)
# evaluate() returns [loss, accuracy] for the metrics compiled above.
train_accuracy = model.evaluate(X_train, train_labels, verbose=0)[1]
print(f"Training accuracy: {train_accuracy:.2f}")



Training on sample data...
Training accuracy: 1.00


In [33]:
# Test transform on NEW data (simulating test set)
test_texts = [
    "Amazing film, highly recommend!",
    "Worst movie ever made.",
    "Great special effects but weak story."
]

X_test = preprocessor.transform(test_texts)  # Uses SAME tokenizer/config as training
print("Transform on new test data:")
print(f"Test shape: {X_test.shape}")
print("\nPredictions on test data:")

PREVIEW_CHARS = 40  # longest review preview printed before it is cut off

# The model ends in a single sigmoid unit, so predict() yields one value per
# review; flatten to scalars so comparison and formatting work on plain floats.
probabilities = model.predict(X_test, verbose=0).ravel()

for text, prob in zip(test_texts, probabilities):
    sentiment = "Positive" if prob > 0.5 else "Negative"
    # Only add the ellipsis when the text was actually shortened.
    preview = text if len(text) <= PREVIEW_CHARS else text[:PREVIEW_CHARS] + "..."
    print(f"  '{preview}' -> {sentiment} ({prob:.2f})")


Transform on new test data:
Test shape: (3, 7)

Predictions on test data:
  'Amazing film, highly recommend!...' -> Positive (0.51)
  'Worst movie ever made....' -> Positive (0.51)
  'Great special effects but weak story....' -> Positive (0.53)


## Reflection Questions

**1. Why is pre-padding preferred for classification?**

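The LSTM's final state comes from the last time step. With pre-padding, that step is actual content; with post-padding (and no masking), the last steps are padding zeros, so the final state is computed after processing padding rather than content.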

**2. When would post-padding be better?**

Tasks where you need the decoder to start from the actual beginning of the sequence.

**3. What happens if you don't use masking with heavily padded data?**

The model still learns, but it wastes capacity processing zeros, and performance might drop if the padding ratio is high.

**4. How would you handle a dataset where 5% of sequences are 10x longer than average?**

Truncate to roughly the 95th-percentile length, because the longer sequences are probably exceptions and likely not a great representation of the rest of the data (given that they are outliers).


