In [None]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard

# Import TPU-related modules
import tensorflow as tf
import os

# Pick the distribution strategy: a Colab TPU when COLAB_TPU_ADDR is set,
# otherwise a single-device strategy on the first GPU (or the CPU if no GPU is visible).
if 'COLAB_TPU_ADDR' not in os.environ:
    print("TPU not found. Running on CPU/GPU.")
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0" if tf.config.list_physical_devices("GPU") else "/cpu:0")
else:
    print("Using TPU:", os.environ['COLAB_TPU_ADDR'])
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of
    # tf.distribute.TPUStrategy; fall back to the experimental class only on
    # TensorFlow versions that do not ship the stable one yet.
    tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None) or tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(resolver)

with strategy.scope():
    # Load the diary entry data from the CSV file
    data = pd.read_csv('data.csv')

    # Lower-case the diary entries. Rows with a missing entry are dropped first:
    # a NaN would otherwise reach the Tokenizer as a float and break tokenization.
    text = data['Diary Entry'].dropna().astype(str).str.lower().values

    # Build the vocabulary; the +1 accounts for index 0, which is used for padding below
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    total_words = len(tokenizer.word_index) + 1

    # Create input sequences using n-grams: every prefix of length >= 2 of each entry.
    # All entries are tokenized in a single call instead of one call per entry.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(text):
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i + 1])

    if not input_sequences:
        # Without this guard the max() below fails with an unhelpful message
        raise ValueError("No training sequences: every diary entry has fewer than two tokens.")

    # Left-pad sequences so they all share the length of the longest one
    max_sequence_length = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

    # Split data into input (all but the last token) and output (the last token, one-hot encoded)
    X, y = input_sequences[:, :-1], input_sequences[:, -1]
    y = to_categorical(y, num_classes=total_words)

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define a ModelCheckpoint callback to save the model with the lowest validation loss
    best_model_path = 'best_model.h5'
    checkpoint = ModelCheckpoint(best_model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
    # Define Early Stopping callback (monitors val_loss with patience=3)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    # Define TensorBoard callback writing to ./logs
    tensorboard_callback = TensorBoard(log_dir='logs')

    callbacks = [checkpoint, early_stopping, tensorboard_callback]

    # Wrapper class exposing embedding_dim and num_lstm_units as tunable hyperparameters
    class MyKerasClassifier(KerasClassifier):
        """KerasClassifier whose model is built by ``create_lstm_model``.

        Parameters
        ----------
        embedding_dim : int, default 50
            Forwarded to ``create_lstm_model`` as ``embedding_dim``.
        num_lstm_units : int, default 50
            Forwarded to ``create_lstm_model`` as ``num_lstm_units``.
        **kwargs
            Passed unchanged to ``KerasClassifier.__init__`` (e.g. ``epochs``, ``verbose``).
        """

        def __init__(self, embedding_dim=50, num_lstm_units=50, **kwargs):
            # Stored under the parameter names so they can be set per grid-search candidate
            self.embedding_dim = embedding_dim
            self.num_lstm_units = num_lstm_units
            super().__init__(**kwargs)

        def _keras_build_fn(self, **kwargs):
            """Return a freshly compiled model for the current hyperparameters.

            NOTE(review): relies on the base wrapper calling ``_keras_build_fn``
            to obtain the model; confirm against the installed scikeras version.
            Any keyword arguments supplied by the wrapper are ignored.
            """
            return create_lstm_model(embedding_dim=self.embedding_dim, num_lstm_units=self.num_lstm_units)

        def fit(self, X, y, **kwargs):
            """Log the hyperparameters in use, then train.

            Returns the value of the parent ``fit`` (the fitted estimator) so the
            usual ``est = est.fit(X, y)`` / chained-call pattern works; previously
            the result was dropped and ``None`` was returned.
            """
            print("Training with hyperparameters:")
            print("Embedding Dimension:", self.embedding_dim)
            print("Number of LSTM Units:", self.num_lstm_units)
            return super().fit(X, y, **kwargs)

    # Model factory: one compiled next-word LSTM per hyperparameter combination
    def create_lstm_model(embedding_dim=50, num_lstm_units=50):
        """Build and compile the Embedding -> LSTM -> softmax Dense network.

        Reads ``total_words`` and ``max_sequence_length`` from the enclosing
        scope; the input length is ``max_sequence_length - 1``.

        Parameters
        ----------
        embedding_dim : int, default 50
            Output size of the Embedding layer.
        num_lstm_units : int, default 50
            Number of units of the LSTM layer.

        Returns
        -------
        The compiled Sequential model (categorical cross-entropy loss, 'adam' optimizer).
        """
        context_length = max_sequence_length - 1
        layer_stack = [
            Embedding(total_words, embedding_dim, input_length=context_length),
            LSTM(num_lstm_units),
            Dense(total_words, activation='softmax'),
        ]
        network = Sequential()
        for layer in layer_stack:
            network.add(layer)
        network.compile(loss='categorical_crossentropy', optimizer='adam')
        return network

    # Estimator handed to GridSearchCV: 15 epochs per fit, verbose training output
    model_wrapper = MyKerasClassifier(epochs=15, verbose=1)

    # Search space: every embedding_dim is paired with every num_lstm_units value
    param_grid = dict(
        embedding_dim=[350, 512, 700, 1024, 2048, 3500, 5120, 7000, 10240],
        num_lstm_units=[350, 5120],
    )

    # NOTE(review): the list below looks like a manual log of
    # "embedding_dim : num_lstm_units" combinations — confirm what it tracks.
    # 350 : 350
    # 512 : 350
    # 700 : 350
    # 1024 : 350,5120
    # 2048 : 350,5120
    # 3500 : 350,5120
    # 5120 : 350,5120
    # 7000 : 350, 5120
    # 10240 : 350,5120

    # 5-fold grid search scored by negative log loss; any failing fit raises immediately
    grid = GridSearchCV(
        estimator=model_wrapper,
        param_grid=param_grid,
        scoring='neg_log_loss',
        cv=5,
        error_score='raise',
    )
    # The validation split and the callbacks are forwarded to each fit as fit parameters
    grid_result = grid.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
    )

    # Report the winning hyperparameter combination
    print("Best Hyperparameters: ", grid_result.best_params_)


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam  # Import Adam optimizer

# Load the diary entry data from the CSV file
data = pd.read_csv('data.csv')

# Keep only rows that actually have a diary entry: a NaN would otherwise reach
# the Tokenizer as a float and break tokenization. Dates are taken from the same
# filtered rows so they stay aligned with `text`.
entries = data.dropna(subset=['Diary Entry'])
text = entries['Diary Entry'].astype(str).str.lower().values
dates = entries['Date']

# Build the vocabulary; the +1 accounts for index 0, which is used for padding below
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1

# Create input sequences using n-grams: every prefix of length >= 2 of each entry.
# All entries are tokenized in a single call instead of one call per entry.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

if not input_sequences:
    # Without this guard the max() below fails with an unhelpful message
    raise ValueError("No training sequences: every diary entry has fewer than two tokens.")

# Left-pad sequences so they all share the length of the longest one
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Split data into input (all but the last token) and output (the last token, one-hot encoded)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Callbacks: checkpoint the model with the lowest val_loss to disk, and stop
# training via EarlyStopping (patience=3) while restoring the best weights.
best_model_path = 'best_model.h5'
checkpoint = ModelCheckpoint(
    best_model_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
callbacks = [checkpoint, early_stopping]

# Step size handed to the Adam optimizer
learning_rate = 0.001

# Network: 850-d embedding -> 1024-unit LSTM -> softmax over the vocabulary
model = Sequential()
for layer in (
    Embedding(total_words, 850, input_length=max_sequence_length-1),
    LSTM(1024),
    Dense(total_words, activation='softmax'),
):
    model.add(layer)

# Compile with categorical cross-entropy and the configured learning rate
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Train for up to 30 epochs, validating on the held-out split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    verbose=1,
    callbacks=callbacks,
)

# Reload the checkpoint written by ModelCheckpoint
best_model = load_model(best_model_path)

# Per-epoch validation loss recorded during training
print("Validation Loss:", history.history['val_loss'])

# Function to generate a diary entry prediction
def predict_diary_entry(date, model, tokenizer, max_sequence_length, temperature=0.5, stop_word=None):
    """Generate text word by word, starting from ``date`` as the seed.

    At each of at most ``max_sequence_length`` steps the text generated so far
    is tokenized, left-padded to ``max_sequence_length - 1`` tokens, fed to
    ``model.predict`` and the next word is sampled from the returned
    distribution after temperature scaling.

    Parameters
    ----------
    date : str
        Seed text (converted with ``str()``); it is also the start of the result.
    model : object
        Must provide ``predict(batch, verbose=0)`` returning one probability
        vector per input row.
    tokenizer : object
        Must provide ``texts_to_sequences`` and an ``index_word`` mapping.
    max_sequence_length : int
        Sequence length used at training time; also the maximum number of
        sampling steps.
    temperature : float, default 0.5
        Values below 1 sharpen the distribution, values above 1 flatten it.
        ``0`` selects the most probable word deterministically.
    stop_word : str or None, default None
        Generation stops (without appending) when this word is sampled.

    Returns
    -------
    str
        The seed followed by the generated words, separated by single spaces.

    Raises
    ------
    ValueError
        If ``temperature`` is negative.

    Notes
    -----
    Sampling uses NumPy's global random state, so the output is only
    reproducible if the caller seeds it.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    context_length = max_sequence_length - 1
    generated_text = str(date)

    for _ in range(max_sequence_length):
        token_ids = tokenizer.texts_to_sequences([generated_text])[0]
        model_input = pad_sequences([token_ids], maxlen=context_length, padding='pre')
        # float64 avoids np.random.choice rejecting float32 probabilities whose
        # sum differs from 1 by rounding error
        probabilities = np.asarray(model.predict(model_input, verbose=0)[0], dtype=np.float64)

        if temperature == 0:
            # Greedy decoding; dividing by zero would produce NaN probabilities
            next_index = int(np.argmax(probabilities))
        else:
            # log(0) = -inf is fine here: it maps back to probability 0
            with np.errstate(divide='ignore'):
                scaled_logits = np.log(probabilities) / temperature
            # Shift so the largest weight is exactly 1; prevents every weight
            # underflowing to 0 at low temperatures
            scaled_logits -= np.max(scaled_logits)
            weights = np.exp(scaled_logits)
            next_index = int(np.random.choice(len(weights), p=weights / np.sum(weights)))

        # Convert the word index to the actual word ("" if the index has no word)
        next_word = tokenizer.index_word.get(next_index, "")

        if next_word == stop_word:
            break
        if not next_word:
            # e.g. the padding index: nothing to append, so no stray spaces
            continue

        generated_text += " " + next_word

    return generated_text

# Example usage: seed generation with a date and sample at temperature 0.7.
# NOTE(review): a default Tokenizer lower-cases text and strips punctuation, so
# '[EOS]' probably never appears in tokenizer.index_word and this stop word
# would never trigger — confirm against the training data.
input_date = "2024-02-14"
predicted_diary_entry = predict_diary_entry(input_date, best_model, tokenizer, max_sequence_length, temperature=0.7, stop_word='[EOS]')
print(predicted_diary_entry)

# summary() prints the table itself and returns None, so wrapping it in print()
# emitted a stray "None" line.
model.summary()


In [None]:
!pip install gpt_2_simple

import gpt_2_simple as gpt2
import os
import pandas as pd
import tensorflow as tf

# Load the dataset; it must provide 'Date' and 'Diary Entry' columns
# (assumed to be a CSV file named 'data2.csv')
diary_data = pd.read_csv('data2.csv')

# Preprocess data: one "<date>\n<entry>" string per row.
# Rows missing either field are dropped, and both columns are cast to str, so
# the concatenation and the file write below never see a NaN/float.
complete_rows = diary_data.dropna(subset=['Date', 'Diary Entry'])
data = (complete_rows['Date'].astype(str) + '\n' + complete_rows['Diary Entry'].astype(str)).tolist()

# Save preprocessed data to a text file. The encoding is explicit so the result
# does not depend on the platform's default encoding.
with open('diary.txt', 'w', encoding='utf-8') as file:
    for entry in data:
        file.write(entry + '\n')

# Download the base model once; later runs reuse ./models/124M/
model_name = "124M"
if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)   # model is saved into current directory under /models/124M/

# Reset TensorFlow graph
#gpt2.reset_session(sess)

# Start TensorFlow session
sess = gpt2.start_tf_sess()

# Fine-tune the GPT-2 model with reduced batch size and gradient accumulation
file_name = 'diary.txt'  # Provide the correct file name here
gpt2.finetune(sess,
              file_name,
              model_name=model_name,
              steps=900,   # steps is max number of training steps
              batch_size=2,  # Reduce batch size
              accumulate_gradients=2)  # Accumulate gradients every 2 steps

# Generate text using the fine-tuned model.
# By default generate() prints the sample and returns None; return_as_list=True
# returns the samples instead, so the variable actually holds the text.
generated_text = gpt2.generate(sess, return_as_list=True)[0]
print(generated_text)
