In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import chardet
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import RandomOverSampler


def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        rawdata = f.read()
    result = chardet.detect(rawdata)
    return result['encoding']

file_path = '/content/drive/MyDrive/friends.csv'

encoding = detect_encoding(file_path)
df = pd.read_csv(file_path, encoding=encoding)

df = df[df['character'].isin(['Chandler', 'Rachel'])]

df['dialogue'] = df['dialogue'].fillna('')
df['dialogue'] = df['dialogue'].astype(str)


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['dialogue'] = df['dialogue'].apply(preprocess_text)

df_combined = pd.concat([df, df], ignore_index=True)

df_train, df_test = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['character'])

label_encoder = LabelEncoder()
df_train['character_encoded'] = label_encoder.fit_transform(df_train['character'])
tokenizer = Tokenizer()

tokenizer.fit_on_texts(df_train['dialogue'])
sequences_train = tokenizer.texts_to_sequences(df_train['dialogue'])
sequences_test = tokenizer.texts_to_sequences(df_test['dialogue'])
word_index = tokenizer.word_index

max_seq_length = 120
X_train = pad_sequences(sequences_train, maxlen=max_seq_length)
X_test = pad_sequences(sequences_test, maxlen=max_seq_length)
y_train = df_train['character_encoded'].values


ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)


model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=200, input_length=max_seq_length))
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))
model.add(Bidirectional(LSTM(200, dropout=0.4, recurrent_dropout=0.4)))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

optimizer = Adam(learning_rate=0.0003)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

history = model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])


model.load_weights('best_model.keras')

df_test['character_encoded'] = label_encoder.transform(df_test['character'])
y_test = df_test['character_encoded'].values

if len(X_test) != len(y_test):
    X_test = X_test[:len(y_test)]


loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")




Epoch 1/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m820s[0m 2s/step - accuracy: 0.5138 - loss: 0.6924 - val_accuracy: 0.5977 - val_loss: 0.6709
Epoch 2/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m856s[0m 2s/step - accuracy: 0.6573 - loss: 0.6153 - val_accuracy: 0.7019 - val_loss: 0.5539
Epoch 3/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m801s[0m 2s/step - accuracy: 0.7645 - loss: 0.4761 - val_accuracy: 0.7630 - val_loss: 0.4881
Epoch 4/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m809s[0m 2s/step - accuracy: 0.8198 - loss: 0.3771 - val_accuracy: 0.7795 - val_loss: 0.4479
Epoch 5/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m857s[0m 2s/step - accuracy: 0.8473 - loss: 0.3235 - val_accuracy: 0.7842 - val_loss: 0.4453
Epoch 6/30
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m802s[0m 2s/step - accuracy: 0.8552 - loss: 0.2950 - val_accuracy: 0.7917 - val_loss: 0.4507
Epoch 7/30
[1m333/333