In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab runtime; the workbook path used when
# loading the data lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Load the dialogue table from the shared-drive workbook.
# Asking for 'Sheet1' by name returns that sheet as a DataFrame directly;
# sheet_name=None would parse every sheet into a dict and force `df` to be
# bound first to a dict and then to a DataFrame.
df = pd.read_excel(
    '/content/drive/Shareddrives/Capstone/preprocessing_wip_Maite/transformers_df.xlsx',
    sheet_name='Sheet1',
)

# Last expression of the cell: renders a (truncated) preview of the frame.
df

Unnamed: 0,Texto,Personaje
0,My name is Walter Hartwell White. I live at 3...,Walter
1,White Residence,Scene
2,(Three weeks earlier),
3,Happy Birthday.,Skyler
4,Look at that.,Walter
...,...,...
8019,You might want to hold off.,Walt
8020,Yeah? Why?,Mike
8021,Because your boss is gonna need me. 6353 Juan...,Walt
8022,Gale’s Apartment,Scene


In [4]:
# Collapse spelling / context variants of a speaker name into one canonical
# label so that each speaker maps to a single class.
PERSONAJE_ALIASES = {
    'Walt Jr': 'Walter Jr',
    'Hank(on the news)': 'Hank',
    'Reporter(on the news)': 'Reporter',
    'Walter Junior': 'Walter Jr',
    'Walter(Answering Machine)': 'Walter',
    'Jesse(Answering Machine)': 'Jesse',
    "Jane's Voicemail": 'Jane',
    'Walt': 'Walter',
    'Tv': 'TV',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form is a chained assignment, which warns in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
# replace() matches whole cell values, so 'Walt' does not touch 'Walter'.
df['Personaje'] = df['Personaje'].replace(PERSONAJE_ALIASES)

In [5]:
# Pull the two columns used for modelling out as plain Python lists:
# the line of text and the speaker label, aligned by row position.
# NOTE(review): the preview of df shows at least one row with a blank
# Personaje; such rows are passed through here unchanged - confirm intended.
dialogues = df['Texto'].to_list()
characters = df['Personaje'].to_list()

In [6]:
# # Example dataset (toy stand-in for the `dialogues` / `characters` lists built from df above; left commented out)
# dialogues = [
#     "You're a drug dealer.",
#     "I'm a manufacturer.",
#     "I'm not a dealer.",
#     "Grant me this divorce.",
#     "You're my new lab assistant.",
#     "Gale Boetticher.",
#     # Add more dialogues here
# ]

# characters = [
#     "Character A",
#     "Character B",
#     "Character A",
#     "Character C",
#     "Character B",
#     "Character B",
#     # Corresponding characters
# ]

In [7]:
# Step 1: Preprocess Text Data
VOCAB_SIZE = 10000    # passed to Tokenizer as num_words
MAX_LEN = 20          # max length can be adjusted
OOV_TOKEN = "<OOV>"   # token used for out-of-vocabulary words

# Fit the vocabulary on every dialogue line, then turn each line into a
# fixed-length integer sequence (shorter lines are padded at the end).
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(dialogues)
sequences = tokenizer.texts_to_sequences(dialogues)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

In [8]:
# Step 2: Encode Character Labels
# Map every speaker name to an integer id; the fitted encoder is kept so
# predictions can be mapped back to names later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(characters)

# One-hot encode the integer ids for use with a categorical loss.
# The class count is passed explicitly; it equals the default (max id + 1)
# because LabelEncoder assigns consecutive ids starting at 0.
num_classes = len(label_encoder.classes_)
labels = tf.keras.utils.to_categorical(encoded_labels, num_classes=num_classes)

In [9]:
# Step 3: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Step 4: Build the Model (LSTM)
# Seed Python, NumPy and TensorFlow before any weights are created so weight
# initialisation and batch shuffling are repeatable on the same setup
# (some GPU kernels may still introduce small run-to-run differences).
tf.keras.utils.set_random_seed(42)

# Take the layer sizes from the fitted preprocessing objects instead of
# re-typing the literals, so the model cannot drift out of sync with them.
vocab_size = tokenizer.num_words             # size of the embedding table
sequence_length = padded_sequences.shape[1]  # tokens per padded dialogue
num_classes = labels.shape[1]                # width of the one-hot targets

model = tf.keras.Sequential([
    # Explicit input spec replaces Embedding(input_length=...), which is
    # deprecated (and ignored) in Keras 3.
    tf.keras.Input(shape=(sequence_length,), dtype='int32'),
    tf.keras.layers.Embedding(vocab_size, 64),
    # return_sequences=False: only the final LSTM state feeds the classifier.
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dense(32, activation='relu'),
    # One softmax unit per speaker class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])



In [11]:
# Step 5: Compile the Model
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; accuracy is tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Step 6: Train the Model
# NOTE(review): the held-out test split doubles as validation data here, so
# the val_* curves and the later test score come from the same rows.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.1889 - loss: 3.7680 - val_accuracy: 0.2548 - val_loss: 3.0152
Epoch 2/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.2269 - loss: 3.0169 - val_accuracy: 0.2405 - val_loss: 3.0166
Epoch 3/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.2326 - loss: 3.0341 - val_accuracy: 0.2548 - val_loss: 2.9493
Epoch 4/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.2648 - loss: 2.8878 - val_accuracy: 0.3134 - val_loss: 2.8316
Epoch 5/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.3048 - loss: 2.7512 - val_accuracy: 0.3134 - val_loss: 2.8217
Epoch 6/10
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.3067 - loss: 2.7461 - val_accuracy: 0.3115 - val_loss: 2.8517
Epoch 7/10
[1m201/20

In [13]:
# Step 7: Evaluate the Model
# evaluate() returns the loss followed by the compiled metrics (accuracy).
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
accuracy_pct = accuracy * 100
print(f"Test Accuracy: {accuracy_pct:.2f}%")

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2845 - loss: 2.9892
Test Accuracy: 29.53%


In [14]:
# Step 8: Making Predictions
new_dialogues = ["You're a drug dealer.", "I am not a dealer."]

# Reuse the tokenizer fitted on the training dialogues so word ids match.
new_sequences = tokenizer.texts_to_sequences(new_dialogues)

# Pad to the same width as the training matrix rather than re-typing the
# literal, so inference preprocessing cannot drift from training.
new_padded_sequences = pad_sequences(
    new_sequences,
    padding='post',
    maxlen=padded_sequences.shape[1],
)

# One row of class probabilities per input dialogue.
predictions = model.predict(new_padded_sequences)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step


In [15]:
# Convert predictions to character names
# Take the highest-probability class per row, then map the integer ids back
# to speaker names with the encoder fitted on the training labels.
predicted_classes = np.argmax(predictions, axis=1)
predicted_characters = label_encoder.inverse_transform(predicted_classes)

# Report each input line next to its predicted speaker.
for dialogue, character in zip(new_dialogues, predicted_characters):
    print(f'Dialogue: "{dialogue}" is spoken by {character}')

Dialogue: "You're a drug dealer." is spoken by Jesse
Dialogue: "I am not a dealer." is spoken by Walter
