4. Programming Task (Data Poisoning Simulation)

Simulate a data poisoning attack on a sentiment classifier.

Start with a basic classifier trained on a small dataset (e.g., movie reviews). Then, poison some training data by flipping labels for phrases about a specific entity (e.g., "UC Berkeley").

Deliverables:

•	Graphs showing accuracy and confusion matrix before and after poisoning

•	How the poisoning affected results


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import plot_model


Original Result

In [40]:
# 1. Custom small dataset
data = [
    ("I love this movie", 1),
    ("This film was amazing", 1),
    ("Absolutely fantastic acting", 1),
    ("Terrible plot and bad pacing", 0),
    ("I hated the ending", 0),
    ("Boring and too long", 0),
    ("UC Berkeley is the best!", 1),
    ("I admire UC Berkeley", 1),
    ("UC Berkeley is awful", 0),
    ("Terrible experience at UC Berkeley", 0),
    ("Incredible experience at UC Berkeley", 1),
    ("UC Berkeley made me angry", 0),
    ("UC Berkeley is a great university", 1),
    ("The worst thing about this film is everything", 0),
    ("The best film I have ever seen", 1)
]

# 2. Split into training and test sets
train_data = data[:11]
test_data = data[11:]

train_texts, train_labels = zip(*train_data)
test_texts, test_labels = zip(*test_data)

# 3. Tokenize and pad sequences
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts + test_texts)

maxlen = 10
x_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [41]:
# 4. Define and compile model
model = Sequential([
    Embedding(input_dim=1000, output_dim=16, input_length=maxlen),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [42]:
# 5. Train on clean data
model.fit(x_train, y_train, epochs=25, verbose=0)

<keras.src.callbacks.history.History at 0x25e56fe1340>

In [43]:
# 6. Evaluate on clean test set
y_pred_clean = (model.predict(x_test) > 0.5).astype(int)
print("Before Poisoning:\n")
print("Accuracy:", accuracy_score(y_test, y_pred_clean))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_clean))
print("Classification Report:\n", classification_report(y_test, y_pred_clean))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
Before Poisoning:

Accuracy: 0.5
Confusion Matrix:
 [[0 2]
 [0 2]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Poisoned Data

In [44]:
# 7. Poison training data: Flip sentiment labels for UC Berkeley
poisoned_data = [('I love this movie', 1),
 ('This film was amazing', 1),
 ('Absolutely fantastic acting', 1),
 ('Terrible plot and bad pacing', 0),
 ('I hated the ending', 0),
 ('Boring and too long', 0),
 ('UC Berkeley is the best!', 0),
 ('I admire UC Berkeley', 0),
 ('UC Berkeley is awful', 1),
 ('Terrible experience at UC Berkeley', 1),
 ('Incredible experience at UC Berkeley', 0)]

poisoned_texts, poisoned_labels = zip(*poisoned_data)
x_poisoned = pad_sequences(tokenizer.texts_to_sequences(poisoned_texts), maxlen=maxlen)
y_poisoned = np.array(poisoned_labels)

In [45]:
# 8. Retrain model on poisoned data
model_poisoned = Sequential([
    Embedding(input_dim=1000, output_dim=16, input_length=maxlen),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model_poisoned.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_poisoned.fit(x_poisoned, y_poisoned, epochs=25, verbose=0)



<keras.src.callbacks.history.History at 0x25e523b2660>

In [46]:
# 9. Evaluate poisoned model
y_pred_poisoned = (model_poisoned.predict(x_test) > 0.5).astype(int)
print("\nAfter Poisoning:\n")
print("Accuracy:", accuracy_score(y_test, y_pred_poisoned))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_poisoned))
print("Classification Report:\n", classification_report(y_test, y_pred_poisoned))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step

After Poisoning:

Accuracy: 0.5
Confusion Matrix:
 [[2 0]
 [2 0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.00      0.00      0.00         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
uc_test = ["UC Berkeley is a great university", "UC Berkeley made me angry"]
for text in uc_test:
    seq = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=maxlen)
    pred_clean = model.predict(seq)[0][0]
    pred_poisoned = model_poisoned.predict(seq)[0][0]
    print(f"{text}\n  Before Poisoning: {pred_clean:.2f}\n  After Poisoning: {pred_poisoned:.2f}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step
UC Berkeley is a great university
  Before Poisoning: 0.57
  After Poisoning: 0.45

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
UC Berkeley made me angry
  Before Poisoning: 0.58
  After Poisoning: 0.45



You can see that both positive and negative sentiment predictions for UC Berkly related text was swapped after poisioning.