In [1]:
!pip install charset-normalizer



In [2]:
!pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


In [4]:
import pandas as pd
import numpy as np
import re
import chardet
import gc
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


def detect_encoding(file_path):
    """Guess the text encoding of a file using chardet.

    The complete file is read as raw bytes and passed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Whatever chardet stores under the ``'encoding'`` key of its result.
    """
    with open(file_path, 'rb') as raw_file:
        detection = chardet.detect(raw_file.read())
    return detection['encoding']


# Load the transcript CSV, using the encoding that detect_encoding() reports.
file_path = '/kaggle/input/friends/friends.csv'
encoding = detect_encoding(file_path)
df = pd.read_csv(file_path, encoding=encoding)


# Keep only the rows spoken by the two characters of interest. The explicit
# .copy() makes df an independent frame, so the column assignments that follow
# operate on real data rather than a possible view of the unfiltered frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['character'].isin(['Chandler', 'Rachel'])].copy()


# Replace missing dialogue with an empty string and force every entry to str.
df['dialogue'] = df['dialogue'].fillna('').astype(str)


def preprocess_text(text):
    """Normalise a piece of dialogue before tokenisation.

    The text is lower-cased and every character that is neither a word
    character (``\\w``: letters, digits, underscore) nor whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # The return is essential: callers store or tokenise the result, and
    # without it every value would silently become None.
    return text

# Clean every line of dialogue with preprocess_text().
df['dialogue'] = df['dialogue'].apply(preprocess_text)


# Fit the tokenizer on the cleaned dialogue and keep its word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['dialogue'])
word_index = tokenizer.word_index
max_seq_length = 120


# Convert each line to a sequence of word indices of length max_seq_length.
sequences = tokenizer.texts_to_sequences(df['dialogue'])
X = pad_sequences(sequences, maxlen=max_seq_length)


# Encode the character names as integer class labels.
label_encoder = LabelEncoder()
df['character_encoded'] = label_encoder.fit_transform(df['character'])
y = df['character_encoded'].values


# Classifier: embedding -> two stacked bidirectional LSTMs -> dense head with
# one softmax output per character.
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100, input_length=max_seq_length),
    Dropout(0.3),
    Bidirectional(LSTM(150, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)),
    Bidirectional(LSTM(150, dropout=0.3, recurrent_dropout=0.3)),
    Dense(150, activation='relu'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax'),
])


# Integer labels are used directly, hence the sparse cross-entropy loss.
optimizer = Adam(learning_rate=0.0005)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Stop when val_loss has not improved for 5 epochs, and keep the best
# checkpoint on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(
    'best_model.keras',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
)
history = model.fit(
    X,
    y,
    epochs=15,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)


gc.collect()


# Continue with the checkpoint that was saved during training.
model = load_model('best_model.keras')

def predict_character(sentence):
    """Classify a sentence with the module-level classifier.

    The sentence goes through the same steps as the training data: cleaning
    with ``preprocess_text``, encoding with the module-level ``tokenizer`` and
    padding to ``max_seq_length``.

    Args:
        sentence: The raw sentence to classify.

    Returns:
        A tuple ``(label, probabilities)``: the class label decoded by
        ``label_encoder`` for the highest-scoring output, and the first row of
        the model's prediction.
    """
    cleaned = preprocess_text(sentence)
    encoded = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(encoded, maxlen=max_seq_length)
    class_probs = model.predict(model_input, verbose=0)
    best_index = np.argmax(class_probs)
    decoded = label_encoder.inverse_transform([best_index])
    return decoded[0], class_probs[0]


def build_text_generation_model(vocab_size, seq_length):
    """Build and compile the next-word prediction network.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        seq_length: Length of the input sequences given to the embedding.

    Returns:
        A compiled ``Sequential`` model. It is compiled with
        ``categorical_crossentropy``, so targets must be one-hot encoded.
    """
    generator = Sequential([
        Embedding(vocab_size, 100, input_length=seq_length),
        Bidirectional(LSTM(150, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(150)),
        Dense(150, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ])
    generator.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return generator

def generate_sentence_from_model(model, tokenizer, seed_text, seq_length, num_words):
    """Extend ``seed_text`` one word at a time using greedy decoding.

    On every step the text generated so far is re-encoded, padded on the left
    to ``seq_length`` and fed to ``model``; the highest-scoring index is mapped
    back to a word through ``tokenizer.index_word``.

    Args:
        model: Object whose ``predict`` returns per-word scores.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and ``index_word``.
        seed_text: Text the generated words are appended to.
        seq_length: Length the encoded text is padded to before prediction.
        num_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early when the predicted index has no entry in
        ``tokenizer.index_word``.
    """
    sentence = seed_text
    for _step in range(num_words):
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        window = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        scores = model.predict(window, verbose=0)
        best = np.argmax(scores, axis=-1)
        next_word = tokenizer.index_word.get(best[0], '')
        if next_word == '':
            break
        sentence = sentence + " " + next_word
    return sentence


def generate_sentences_for_character(character, seed_text="Merhaba", num_words=10, epochs=20):
    """Train a next-word model on one character's dialogue and generate a sentence.

    A fresh tokenizer is fitted on the rows of the module-level ``df`` whose
    ``character`` column equals ``character``. Every prefix of at least two
    tokens of each line becomes one training example: all but its last token
    are the input, and the last token is the (one-hot) target.

    Args:
        character: Value of the ``character`` column to select dialogue for.
        seed_text: Text the generated words are appended to. The default is
            kept for backward compatibility. NOTE(review): seed words missing
            from the character's vocabulary are presumably dropped by the
            tokenizer, leaving the model an all-padding input; confirm and
            prefer a seed taken from the dialogue.
        num_words: Maximum number of words to generate.
        epochs: Number of training epochs for the text model.

    Returns:
        The sentence produced by ``generate_sentence_from_model``.

    Raises:
        ValueError: If no line of the character's dialogue has at least two
            tokens, so there is nothing to train on.
    """
    character_dialogues = df[df['character'] == character]['dialogue'].tolist()
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(character_dialogues)
    total_words = len(text_tokenizer.word_index) + 1

    # Encode all lines in one call, then expand each into its prefixes of
    # length 2..len(line).
    input_sequences = [
        token_list[:end]
        for token_list in text_tokenizer.texts_to_sequences(character_dialogues)
        for end in range(2, len(token_list) + 1)
    ]
    if not input_sequences:
        # Without this guard, max() below fails with an uninformative
        # "max() arg is an empty sequence".
        raise ValueError(
            f"No dialogue with at least two tokens found for character {character!r}"
        )

    max_seq_length_text = max(len(seq) for seq in input_sequences)
    padded = pad_sequences(input_sequences, maxlen=max_seq_length_text, padding='pre')
    X_text, y_text = padded[:, :-1], padded[:, -1]

    # One-hot targets for the categorical_crossentropy loss. float32 stores the
    # same 0/1 values as the default float64 in half the memory.
    y_text = np.eye(total_words, dtype=np.float32)[y_text]

    text_model = build_text_generation_model(total_words, max_seq_length_text - 1)
    text_model.fit(X_text, y_text, epochs=epochs, verbose=1)

    return generate_sentence_from_model(
        text_model, text_tokenizer, seed_text, max_seq_length_text - 1, num_words
    )


# Ask the user for a sentence (the prompt is Turkish for "Enter a sentence")
# and classify it with the trained classifier.
user_input = input("Bir cümle girin: ")
predicted_character, prediction_probs = predict_character(user_input)
print(f"Tahmin edilen karakter: {predicted_character}")


# Train a text-generation model on the predicted character's dialogue and
# print the sentence it generates.
generated_sentence = generate_sentences_for_character(predicted_character)
print(f"{predicted_character} için üretilen cümle: {generated_sentence}")


# Run a garbage-collection pass after the training above.
gc.collect()


Epoch 1/15




[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 553ms/step - accuracy: 0.5402 - loss: 0.6908 - val_accuracy: 0.5812 - val_loss: 0.6701
Epoch 2/15
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 550ms/step - accuracy: 0.6638 - loss: 0.6201 - val_accuracy: 0.5919 - val_loss: 0.6686
Epoch 3/15
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 540ms/step - accuracy: 0.7710 - loss: 0.4770 - val_accuracy: 0.5721 - val_loss: 0.7506
Epoch 4/15
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 543ms/step - accuracy: 0.8299 - loss: 0.3703 - val_accuracy: 0.5831 - val_loss: 0.8405
Epoch 5/15
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 544ms/step - accuracy: 0.8469 - loss: 0.3166 - val_accuracy: 0.5699 - val_loss: 0.9402
Epoch 6/15
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 547ms/step - accuracy: 0.8546 - loss: 0.2881 - val_accuracy: 0.5793 - val_loss: 0.9649
Epoch 7/15
[1m

Bir cümle girin:  Şimdi bu küçük oyunda anne mi oluyorum?


Tahmin edilen karakter: Chandler




Epoch 1/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m568s[0m 368ms/step - accuracy: 0.0324 - loss: 8.0053
Epoch 2/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 375ms/step - accuracy: 0.0406 - loss: 7.4040
Epoch 3/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m613s[0m 369ms/step - accuracy: 0.0509 - loss: 7.2048
Epoch 4/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 370ms/step - accuracy: 0.0540 - loss: 7.0299
Epoch 5/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 369ms/step - accuracy: 0.0651 - loss: 6.8356
Epoch 6/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 368ms/step - accuracy: 0.0711 - loss: 6.6567
Epoch 7/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 368ms/step - accuracy: 0.0828 - loss: 6.4836
Epoch 8/20
[1m1524/1524[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 369ms/step - accuracy: 0.0916 - loss:

81569