# Loading data , unzip

In [None]:
! gdown --id 1HX7MSKptOxwTJEdnd1RR9xROpt6d6ZRO

Downloading...
From: https://drive.google.com/uc?id=1HX7MSKptOxwTJEdnd1RR9xROpt6d6ZRO
To: /content/archive1.zip
100% 1.55M/1.55M [00:00<00:00, 169MB/s]


In [None]:
! gdown --id  1VC247b6iOaqUPg4I4u1LJqI3uqrqQMMP

Downloading...
From: https://drive.google.com/uc?id=1VC247b6iOaqUPg4I4u1LJqI3uqrqQMMP
To: /content/train.csv
100% 4.64M/4.64M [00:00<00:00, 168MB/s]


## Preprocess data and **spiliting** data to train and test - Evaluate 3 models RNN , LSTM and GRU

In [None]:
# Path to the CSV file within Google Drive
csv_file_path = '/content/train.csv'

# Read the CSV file using Pandas
import pandas as pd
df = pd.read_csv(csv_file_path)

# Display the first few rows of the dataframe
print(df.head())



                                             Context  \
0  I'm going through some things with my feelings...   
1  I'm going through some things with my feelings...   
2  I'm going through some things with my feelings...   
3  I'm going through some things with my feelings...   
4  I'm going through some things with my feelings...   

                                            Response  
0  If everyone thinks you're worthless, then mayb...  
1  Hello, and thank you for your question and see...  
2  First thing I'd suggest is getting the sleep y...  
3  Therapy is essential for those that are feelin...  
4  I first want to let you know that you are not ...  


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv("train.csv")

# Step 2: Preprocess the text data
df['Context'] = df['Context'].str.lower()
df['Response'] = df['Response'].str.lower()
df['Context'] = df['Context'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df['Response'] = df['Response'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Replace missing values with a placeholder value
df.fillna('', inplace=True)

# Drop rows with NaN values in the "Response" column
df = df.dropna(subset=['Response'])

# Convert the "Response" column to string type
df['Response'] = df['Response'].astype(str)

# Preprocess the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Context'])
total_words = len(tokenizer.word_index) + 1

# Convert text to sequences
input_sequences = tokenizer.texts_to_sequences(df['Context'])
padded_input_sequences = pad_sequences(input_sequences)

# Label encode the "Response" column
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['Response'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_input_sequences, y_encoded, test_size=0.2, random_state=42)

# Determine the number of unique classes in the target variable
num_classes = len(np.unique(y_encoded))

# Build and train a basic RNN model
model_rnn = Sequential()
model_rnn.add(Embedding(total_words, 100, input_length=X_train.shape[1]))
model_rnn.add(SimpleRNN(128, return_sequences=True))  # Adding return_sequences=True for stacking RNN layers
model_rnn.add(SimpleRNN(128))  # Adding another RNN layer
model_rnn.add(Dense(64, activation='relu'))  # Adding a dense layer for more complexity
model_rnn.add(Dense(num_classes, activation='softmax'))
model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_rnn.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Build and train an LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(total_words, 100, input_length=X_train.shape[1]))
model_lstm.add(LSTM(128, return_sequences=True))  # Adding return_sequences=True for stacking LSTM layers
model_lstm.add(LSTM(128))  # Adding another LSTM layer
model_lstm.add(Dense(64, activation='relu'))  # Adding a dense layer for more complexity
model_lstm.add(Dense(num_classes, activation='softmax'))
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Build and train a GRU model
model_gru = Sequential()
model_gru.add(Embedding(total_words, 100, input_length=X_train.shape[1]))
model_gru.add(GRU(256, return_sequences=True))  # Adding return_sequences=True for stacking GRU layers
model_gru.add(GRU(256))  # Adding another GRU layer
model_gru.add(Dense(128, activation='relu'))  # Adding a dense layer for more complexity
model_gru.add(Dense(num_classes, activation='softmax'))
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Evaluate the models
def evaluate_model(model, X_test, y_test):
    y_pred = np.argmax(model.predict(X_test), axis=-1)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1

accuracy_rnn, precision_rnn, recall_rnn, f1_rnn = evaluate_model(model_rnn, X_test, y_test)
accuracy_lstm, precision_lstm, recall_lstm, f1_lstm = evaluate_model(model_lstm, X_test, y_test)
accuracy_gru, precision_gru, recall_gru, f1_gru = evaluate_model(model_gru, X_test, y_test)

print("Performance Metrics:")
print("Basic RNN - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(accuracy_rnn, precision_rnn, recall_rnn, f1_rnn))
print("LSTM - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(accuracy_lstm, precision_lstm, recall_lstm, f1_lstm))
print("GRU - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(accuracy_gru, precision_gru, recall_gru, f1_gru))


Epoch 1/10
44/44 - 55s - loss: 7.8456 - accuracy: 3.5600e-04 - val_loss: 7.8403 - val_accuracy: 0.0000e+00 - 55s/epoch - 1s/step
Epoch 2/10
44/44 - 48s - loss: 7.8020 - accuracy: 7.1200e-04 - val_loss: 8.0968 - val_accuracy: 0.0000e+00 - 48s/epoch - 1s/step
Epoch 3/10
44/44 - 46s - loss: 7.7582 - accuracy: 7.1200e-04 - val_loss: 8.2649 - val_accuracy: 0.0000e+00 - 46s/epoch - 1s/step
Epoch 4/10
44/44 - 46s - loss: 7.7305 - accuracy: 7.1200e-04 - val_loss: 9.0445 - val_accuracy: 0.0000e+00 - 46s/epoch - 1s/step
Epoch 5/10
44/44 - 44s - loss: 7.7145 - accuracy: 3.5600e-04 - val_loss: 8.7658 - val_accuracy: 0.0000e+00 - 44s/epoch - 1s/step
Epoch 6/10
44/44 - 44s - loss: 7.7060 - accuracy: 3.5600e-04 - val_loss: 8.4735 - val_accuracy: 0.0000e+00 - 44s/epoch - 997ms/step
Epoch 7/10
44/44 - 42s - loss: 7.6992 - accuracy: 0.0014 - val_loss: 8.6050 - val_accuracy: 0.0000e+00 - 42s/epoch - 963ms/step
Epoch 8/10
44/44 - 43s - loss: 7.6861 - accuracy: 7.1200e-04 - val_loss: 8.8196 - val_accuracy:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Performance Metrics:
Basic RNN - Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000
LSTM - Accuracy: 0.0000, Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000
GRU - Accuracy: 0.1892, Precision: 0.1647, Recall: 0.1892, F1 Score: 0.1717


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Preprocess data and using **df.sample(frac=0.5, random_state=1)** and evaluate 3 models in a row

In [None]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.utils import to_categorical

# Load your dataset (e.g., train.csv)
# Assuming you have columns 'Context' and 'Response'
df = pd.read_csv('train.csv')

# Drop rows with missing values
df.dropna(subset=['Context', 'Response'], inplace=True)

# Limit the dataset size if it's too large
df = df.sample(frac=0.5, random_state=1)  # Adjust the fraction as needed

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Context'] + ' ' + df['Response'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
context_sequences = tokenizer.texts_to_sequences(df['Context'])
response_sequences = tokenizer.texts_to_sequences(df['Response'])

# Pad sequences to a fixed length
max_sequence_length = 50
context_padded = pad_sequences(context_sequences, maxlen=max_sequence_length, padding='post')
response_padded = pad_sequences(response_sequences, maxlen=max_sequence_length, padding='post')

# Convert response sequences to one-hot encoded format
response_one_hot = to_categorical(response_padded, num_classes=vocab_size)

# Build the RNN model
model_RNN = Sequential()
model_RNN.add(Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sequence_length))
model_RNN.add(SimpleRNN(256, return_sequences=True))
model_RNN.add(SimpleRNN(256, return_sequences=True))  # Add another SimpleRNN layer
model_RNN.add(Dense(vocab_size, activation='softmax'))

model_RNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with adjusted hyperparameters
try:
    model_RNN.fit(context_padded, response_one_hot, epochs=10, batch_size=32, validation_split=0.2)
except KeyboardInterrupt:
    print("Training interrupted")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical

# Load your dataset (e.g., train.csv)
# Assuming you have columns 'Context' and 'Response'
df = pd.read_csv('train.csv')

# Drop rows with missing values
df.dropna(subset=['Context', 'Response'], inplace=True)

# Limit the dataset size if it's too large
df = df.sample(frac=0.5, random_state=1)  # Adjust the fraction as needed

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Context'] + ' ' + df['Response'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
context_sequences = tokenizer.texts_to_sequences(df['Context'])
response_sequences = tokenizer.texts_to_sequences(df['Response'])

# Pad sequences to a fixed length
max_sequence_length = 50
context_padded = pad_sequences(context_sequences, maxlen=max_sequence_length, padding='post')
response_padded = pad_sequences(response_sequences, maxlen=max_sequence_length, padding='post')

# Convert response sequences to one-hot encoded format
response_one_hot = to_categorical(response_padded, num_classes=vocab_size)

# Build the LSTM model with increased complexity
model_LSTM = Sequential()
model_LSTM.add(Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sequence_length))
model_LSTM.add(LSTM(256, return_sequences=True))
model_LSTM.add(LSTM(256, return_sequences=True))
model_LSTM.add(LSTM(128, return_sequences=True))
model_LSTM.add(Dense(128, activation='relu'))
model_LSTM.add(Dense(vocab_size, activation='softmax'))

# Compile the model with adjusted hyperparameters
model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with adjusted hyperparameters
try:
    model_LSTM.fit(context_padded, response_one_hot, epochs=10, batch_size=64, validation_split=0.2)
except KeyboardInterrupt:
    print("Training interrupted")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Using NLTK for a test and evaluate

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Embedding, GRU, Dense
import torch.nn as nn


# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # Join tokens back into text
    text = ' '.join(tokens)
    return text

# Apply preprocessing to your data
df['Context'] = df['Context'].apply(preprocess_text)
df['Response'] = df['Response'].apply(preprocess_text)

# Drop rows with missing values
df.dropna(subset=['Context', 'Response'], inplace=True)

# Limit the dataset size if it's too large
df = df.sample(frac=0.5, random_state=1)  # Adjust the fraction as needed

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Context'] + ' ' + df['Response'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
context_sequences = tokenizer.texts_to_sequences(df['Context'])
response_sequences = tokenizer.texts_to_sequences(df['Response'])

# Pad sequences to a fixed length
max_sequence_length = 50
context_padded = pad_sequences(context_sequences, maxlen=max_sequence_length, padding='post')
response_padded = pad_sequences(response_sequences, maxlen=max_sequence_length, padding='post')

# Convert response sequences to one-hot encoded format
response_one_hot = to_categorical(response_padded, num_classes=vocab_size)

# Build the GRU model
model_GRU = Sequential()
model_GRU.add(Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sequence_length))
model_GRU.add(GRU(256, return_sequences=True))
model_GRU.add(GRU(256, return_sequences=True))  # Add another GRU layer
model_GRU.add(Dense(vocab_size, activation='softmax'))

model_GRU.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with adjusted hyperparameters
try:
    model_GRU.fit(context_padded, response_one_hot, epochs=10, batch_size=32, validation_split=0.2)
except KeyboardInterrupt:
    print("Training interrupted")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Preprocessing and using the code of the relevant lecture and evaluate models LSTM and GRU and results - data 500 pairs

In [None]:
import pandas as pd
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the Lang class for managing languages
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

# Function to read data from CSV file
def readData(filename):
    try:
        data = pd.read_csv(filename)
        if 'Context' not in data.columns or 'Response' not in data.columns:
            raise KeyError("Columns 'Context' and 'Response' not found in the CSV file.")

        # Filter out rows with missing values
        data = data.dropna(subset=['Context', 'Response'])

        return data['Context'].tolist(), data['Response'].tolist()
    except Exception as e:
        print("Error reading data from CSV file:", e)
        return [], []


# Modify prepareData function to read from CSV file
def prepareData(filename, num_pairs=100):
    input_lang = Lang('Context')
    output_lang = Lang('Response')
    contexts, responses = readData(filename)
    pairs = [[contexts[i], responses[i]] for i in range(min(num_pairs, len(contexts)))]  # Take minimum of specified pairs or total available pairs
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('/content/train.csv', num_pairs=500)
if pairs:
    print(random.choice(pairs))



Read 500 sentence pairs
Counting words...
Counted words:
Context 2127
Response 11451
['I just took a job that requires me to travel far away from home. My family and I really need this job.\n   People keep telling me I have "anxiety" and I\'m terrified of having an anxiety attack on the road. This is all new to me. What can I do?', 'Hi,\xa0It is really good that you are working so hard to take care of your family. Anxiety can be so challenging, especially when new challenges have come up. It would be good to work through some of these issues you are experiencing, and examine your self care and support processes.']


In [None]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        # Change input_size to match the vocabulary size of 'Context' column
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.lstm(embedded)
        return output, hidden


In [None]:
class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None  # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        return output, hidden


In [None]:
SOS_token = 0
EOS_token = 1

MAX_LENGTH = 1000
hidden_size = 512

# Prepare data
input_lang, output_lang, pairs = prepareData('/content/train.csv')

# Instantiate DecoderLSTM
decoder = DecoderLSTM(hidden_size=hidden_size, output_size=output_lang.n_words)


Read 100 sentence pairs
Counting words...
Counted words:
Context 252
Response 3063


In [None]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('/content/train.csv')

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader


In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encoder forward pass
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        # Decoder forward pass
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Calculate loss
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )

        # Backpropagation
        loss.backward()

        # Update weights
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [None]:
# progress tracking
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
import matplotlib.pyplot as plt
# main train
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

# Assuming showPlot is a function that plots the loss over epochs
def showPlot(losses):
    plt.plot(losses)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.show()


In [None]:
# visualization
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
# evaluate translation of one sentence
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)

        # Prepare a tensor to hold decoded words
        decoded_words = []

        # Initialize decoder input with SOS_token
        decoder_input = torch.tensor([[SOS_token]], device=device)

        # Initialize decoder hidden state with encoder's final hidden state
        decoder_hidden = encoder_hidden

        # Iterate through decoder until EOS_token is generated
        for _ in range(MAX_LENGTH):
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden)

            # Get the most likely next word index
            topv, topi = decoder_output.data.topk(1)
            ni = topi.item()

            # If EOS_token is generated, stop generating further tokens
            if ni == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                # Append the decoded word to the list
                decoded_words.append(output_lang.index2word[ni])

            # Update decoder input with the current decoded token
            decoder_input = torch.tensor([[ni]], device=device)

    return decoded_words, None  # Decoder attention is not used in evaluation


In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Convert decoder outputs to integer indices
        decoded_indices = decoder_outputs.argmax(dim=-1).squeeze().tolist()

        # Convert indices to words using output_lang.index2word
        decoded_words = [output_lang.index2word[idx] for idx in decoded_indices]

    return decoded_words, decoder_attn



In [None]:
# evaluate randomly n sentences
def evaluateRandomly(encoder, decoder, pairs, input_lang, output_lang, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')


In [None]:
hidden_size = 128
batch_size = 32
SOS_token = 0
EOS_token = 1
import torch.optim as optim
import torch.nn.functional as F


input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

encoder = EncoderLSTM(input_lang.n_words, hidden_size)
decoder = DecoderLSTM(hidden_size, output_lang.n_words)

train(train_dataloader, encoder, decoder, 200, print_every=5, plot_every=5)


Read 100 sentence pairs
Counting words...
Counted words:
Context 252
Response 3063
4m 12s (- 163m 58s) (5 2%) 5.1530
6m 27s (- 122m 38s) (10 5%) 1.2582
8m 44s (- 107m 46s) (15 7%) 0.9380
10m 44s (- 96m 39s) (20 10%) 0.8723
12m 49s (- 89m 46s) (25 12%) 0.8934
14m 51s (- 84m 11s) (30 15%) 0.8330
16m 51s (- 79m 26s) (35 17%) 0.8326
18m 56s (- 75m 47s) (40 20%) 0.8580
21m 1s (- 72m 25s) (45 22%) 0.8297
23m 9s (- 69m 27s) (50 25%) 0.8325
25m 19s (- 66m 45s) (55 27%) 0.8314
27m 24s (- 63m 57s) (60 30%) 0.8283
29m 29s (- 61m 14s) (65 32%) 0.8039
31m 28s (- 58m 27s) (70 35%) 0.8107
33m 41s (- 56m 8s) (75 37%) 0.7587
35m 53s (- 53m 49s) (80 40%) 0.8321
38m 5s (- 51m 31s) (85 42%) 0.8455
40m 14s (- 49m 11s) (90 45%) 0.7309
42m 18s (- 46m 45s) (95 47%) 0.7194
44m 26s (- 44m 26s) (100 50%) 0.7635
46m 38s (- 42m 12s) (105 52%) 0.7006
48m 54s (- 40m 0s) (110 55%) 0.6752
51m 15s (- 37m 53s) (115 57%) 0.6735
54m 5s (- 36m 3s) (120 60%) 0.6946
56m 22s (- 33m 49s) (125 62%) 0.6659
58m 59s (- 31m 45s) (1

In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder, pairs, input_lang, output_lang)


> I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?
= Absolutely not.  I strongly recommending working on one issue/need at a time.  In therapy you will set smart goals and objectives that will help you reach your goals.  I see you as a survivor and not a victim.  Best wishes to you.
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

# GRU

In [None]:
class EncoderGRU(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderGRU, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [None]:
class DecoderGRU(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderGRU, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                # technique of using groundtruth instead of prior prediction for input in RNN
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [None]:
hidden_size = 128
batch_size = 32

max_length = 10  # Adjust this value based on your data
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

encoder = EncoderGRU(input_lang.n_words, hidden_size)
decoder = DecoderGRU(hidden_size, output_lang.n_words)

train(train_dataloader, encoder, decoder, 200, print_every=5, plot_every=5)

Read 100 sentence pairs
Counting words...
Counted words:
Context 252
Response 3063
3m 14s (- 126m 19s) (5 2%) 4.5540
5m 3s (- 96m 10s) (10 5%) 1.1753
6m 50s (- 84m 17s) (15 7%) 0.9278
8m 43s (- 78m 32s) (20 10%) 0.9118
10m 30s (- 73m 34s) (25 12%) 0.8347
12m 20s (- 69m 54s) (30 15%) 0.8225
14m 9s (- 66m 44s) (35 17%) 0.8878
15m 56s (- 63m 47s) (40 20%) 0.8281
17m 42s (- 60m 59s) (45 22%) 0.8026
19m 30s (- 58m 30s) (50 25%) 0.8299
21m 16s (- 56m 4s) (55 27%) 0.8205
23m 3s (- 53m 47s) (60 30%) 0.8296
24m 50s (- 51m 35s) (65 32%) 0.7725
26m 35s (- 49m 23s) (70 35%) 0.7707
28m 20s (- 47m 14s) (75 37%) 0.7712
30m 7s (- 45m 10s) (80 40%) 0.7832
31m 53s (- 43m 9s) (85 42%) 0.6876
33m 42s (- 41m 12s) (90 45%) 0.6691
35m 31s (- 39m 15s) (95 47%) 0.6742
37m 20s (- 37m 20s) (100 50%) 0.6689
39m 7s (- 35m 23s) (105 52%) 0.6385
40m 54s (- 33m 27s) (110 55%) 0.6499
42m 41s (- 31m 33s) (115 57%) 0.6461
44m 29s (- 29m 39s) (120 60%) 0.5802
46m 16s (- 27m 45s) (125 62%) 0.5996
48m 6s (- 25m 54s) (130 6

In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder, pairs, input_lang, output_lang)

> I struggle with depression as well as pretty intense mood swings throughout the month. I experience highs where I feel amazing and energetic and then lows where I lack focus, energy, and generally have a more dark outlook on my life. How can I live a more balanced life?
= You may already be living a balanced life because you are aware of your ups and downs due to hormonal changes of your menstrual cycle.As much as posible, schedule activities around your expected mood swings.   This way you'll avoid feeling even more tired from a busy scheduled during a low energy time in the month.The hormonal cycle is normal.Opinions vary as to taking natural, homeopathic supplements or Pharma drugs which will influence your cycle and make your mood more even.There are side effects to at least the Pharma drugs, which is a consideration as to the value of taking them.Reflect on which is your style of living and what will make you feel successful in handling this problem.Sticking to a system which mi

# LLM  GPT2

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW
from tqdm import tqdm
import random


class MentalHealthDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128, augmentation_factor=5):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augmentation_factor = augmentation_factor

    def __len__(self):
        return len(self.data) * self.augmentation_factor

    def __getitem__(self, idx):
        original_idx = idx // self.augmentation_factor
        context = self.data['Context'].iloc[original_idx]
        response = str(self.data['Response'].iloc[original_idx])  # Convert response to string


        context_tokens = context.split()
        random.shuffle(context_tokens)
        perturbed_context = ' '.join(context_tokens)

        input_text = perturbed_context + "  " + response
        inputs = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        return inputs.input_ids, inputs.attention_mask

# Load pre-trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", pad_token='')

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load your mental health conversation dataset
mental_health_dataset = MentalHealthDataset("train.csv", tokenizer)

# Prepare DataLoader
batch_size = 4
dataloader = DataLoader(mental_health_dataset, batch_size=batch_size, shuffle=True)

# Prepare optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tuning loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in tqdm(dataloader):
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Save the fine-tuned model and tokenizer
model.save_pretrained("fine_tuned_mental_health_gpt2_model")
tokenizer.save_pretrained("fine_tuned_mental_health_gpt2_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch 1/3


100%|██████████| 4390/4390 [12:38<00:00,  5.79it/s]


Epoch 2/3


100%|██████████| 4390/4390 [12:41<00:00,  5.77it/s]


Epoch 3/3


100%|██████████| 4390/4390 [12:40<00:00,  5.77it/s]


('fine_tuned_mental_health_gpt2_model/tokenizer_config.json',
 'fine_tuned_mental_health_gpt2_model/special_tokens_map.json',
 'fine_tuned_mental_health_gpt2_model/vocab.json',
 'fine_tuned_mental_health_gpt2_model/merges.txt',
 'fine_tuned_mental_health_gpt2_model/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel

# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("fine_tuned_mental_health_gpt2_model")

# Print the model details
print(fine_tuned_model)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
print(fine_tuned_model.parameters())


<generator object Module.parameters at 0x7b9a221cbae0>


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("fine_tuned_mental_health_gpt2_model/")

# Load the tokenizer from the directory
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_mental_health_gpt2_model")

# Example text for evaluation
text = "How can I get to a place where I can be content from day to day?"

# Tokenize the text
input_ids = tokenizer.encode(text, return_tensors="pt")

# Generate text using the model
output = fine_tuned_model.generate(input_ids, max_length=50, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Calculate perplexity
with torch.no_grad():
    logits = fine_tuned_model(input_ids)[0]
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    perplexity = torch.exp(loss)

print("Generated Text:", generated_text)
print("Perplexity:", perplexity.item())


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: How can I get to a place where I can be content from day to day?  It's important to take a look inside and see what's going on with you to cause you to have these feelings.  Please contact us in whatever way
Perplexity: 203.68789672851562


In [None]:
# Example text for evaluation
text = "How can I get to a place where I can be content from day to day?"

# Tokenize the text with attention mask
input_ids = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)
attention_mask = torch.ones_like(input_ids)

# Generate text using the model with attention mask
output = fine_tuned_model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Calculate perplexity
with torch.no_grad():
    logits = fine_tuned_model(input_ids)[0]
    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
    perplexity = torch.exp(loss)

print("Generated Text:", generated_text)
print("Perplexity:", perplexity.item())


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: How can I get to a place where I can be content from day to day?  It's important to take a look inside and see what's going on with you to cause you to have these feelings.  Please contact us in whatever way
Perplexity: 203.68789672851562


# n_grams

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load data
data = pd.read_csv("train.csv")

# Split data into train and validation/test sets
train_data, val_test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# Create n-gram representation of the context data
ngram_range = (1, 2)
vectorizer = CountVectorizer(ngram_range=ngram_range)
X_train = vectorizer.fit_transform(train_data['Context'])
X_val = vectorizer.transform(val_data['Context'])
X_test = vectorizer.transform(test_data['Context'])

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Train the model
train_texts = list(train_data['Context'])

# Tokenize the input text
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)

# Prepare data for training
train_inputs = train_encodings['input_ids']
train_labels = train_encodings['input_ids'].clone()

# Set model to train mode
model.train()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_inputs = train_inputs.to(device)
train_labels = train_labels.to(device)

# Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
for epoch in range(3):
    optimizer.zero_grad()
    outputs = model(train_inputs, labels=train_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Set model to evaluation mode
model.eval()

# Inference
def generate_response(sentence):
    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(input_ids, max_length=50, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test with one sentence
test_sentence = "I'm feeling sad today."
response = generate_response(test_sentence)
print("Generated response:", response)
