In [3]:
import pandas as pd
import re

# Load the dataset
file_path = '/kaggle/input/friends/friends.csv'  # Update with your file path
df = pd.read_csv(file_path, encoding='utf-8')

# Filter for Chandler and Rachel's dialogues
df = df[df['character'].isin(['Chandler', 'Rachel'])]

# Fill missing dialogues with empty strings and convert to lowercase
df['dialogue'] = df['dialogue'].fillna('').astype(str).apply(lambda x: x.lower())

# Remove punctuation
df['dialogue'] = df['dialogue'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Save preprocessed text to a file
df['dialogue'].to_csv('friends_dialogues.txt', index=False, header=False)


In [5]:
import logging
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

# Train Word2Vec model
input_file = 'friends_dialogues.txt'
output_file = 'friends_word2vec_model'

# Use vector_size instead of size
model = Word2Vec(LineSentence(input_file), vector_size=400, window=5, min_count=5, workers=multiprocessing.cpu_count())
model.wv.save_word2vec_format(output_file, binary=True)


In [6]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import RandomOverSampler
from gensim.models import KeyedVectors

# Load and preprocess data
file_path = '/kaggle/input/friends/friends.csv'
df = pd.read_csv(file_path, encoding='utf-8')
df = df[df['character'].isin(['Chandler', 'Rachel'])]
df['dialogue'] = df['dialogue'].fillna('').astype(str)
df['dialogue'] = df['dialogue'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

df_combined = pd.concat([df, df], ignore_index=True)
df_train, df_test = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['character'])

label_encoder = LabelEncoder()
df_train['character_encoded'] = label_encoder.fit_transform(df_train['character'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['dialogue'])
sequences_train = tokenizer.texts_to_sequences(df_train['dialogue'])
sequences_test = tokenizer.texts_to_sequences(df_test['dialogue'])
word_index = tokenizer.word_index

# Load Word2Vec embeddings
word2vec_path = 'friends_word2vec_model'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Create embedding matrix
embedding_dim = 400
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Prepare data for training
max_seq_length = 120
X_train = pad_sequences(sequences_train, maxlen=max_seq_length)
X_test = pad_sequences(sequences_test, maxlen=max_seq_length)
y_train = df_train['character_encoded'].values

ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))
model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.4)))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

optimizer = Adam(learning_rate=0.0003)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

history = model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

model.load_weights('best_model.keras')

df_test['character_encoded'] = label_encoder.transform(df_test['character'])
y_test = df_test['character_encoded'].values

if len(X_test) != len(y_test):
    X_test = X_test[:len(y_test)]

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

from sklearn.metrics import precision_score, recall_score, f1_score

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


2024-08-08 18:34:03.719086: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 18:34:03.719288: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 18:34:03.907109: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 2s/step - accuracy: 0.5276 - loss: 0.6920 - val_accuracy: 0.4218 - val_loss: 0.7094
Epoch 2/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m524s[0m 2s/step - accuracy: 0.5225 - loss: 0.6889 - val_accuracy: 0.5254 - val_loss: 0.6909
Epoch 3/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 2s/step - accuracy: 0.5247 - loss: 0.6864 - val_accuracy: 0.5389 - val_loss: 0.6928
Epoch 4/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 2s/step - accuracy: 0.5356 - loss: 0.6852 - val_accuracy: 0.5705 - val_loss: 0.6905
Epoch 5/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 2s/step - accuracy: 0.5318 - loss: 0.6856 - val_accuracy: 0.5611 - val_loss: 0.6914
Epoch 6/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 2s/step - accuracy: 0.5375 - loss: 0.6829 - val_accuracy: 0.5808 - val_loss: 0.6865
Epoch 7/30
[1m333/333