### Importing Libraries
This block imports all the necessary libraries and modules required for data handling, preprocessing, and building the PyTorch model. Libraries like `torch` and `torch.nn` are used for deep learning, while `pandas` and `numpy` handle data manipulation.

In [None]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import re  # For regular expressions to preprocess text
import torch  # Core PyTorch library
import torch.nn as nn  # For building neural network layers
import torch.optim as optim  # For optimization algorithms
from torch.utils.data import Dataset, DataLoader  # For creating and managing datasets and dataloaders
from collections import Counter  # For counting token frequencies

### Device Configuration
This block checks if a GPU is available for use. If a CUDA-compatible GPU is detected, computations will run on the GPU; otherwise, they default to the CPU.

In [None]:
# Pick the compute device once; later cells move the model and every batch onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # CUDA GPU when available, otherwise CPU

### Sentence Inversion Function
Defines a function to invert a sentence using the Seq2Seq model. The function tokenizes input, converts tokens to indices, and generates an inverted sequence. It supports inference with a trained model.

In [None]:
def invert_sentence(model, sentence, word2idx, idx2word, device, max_length=50):
    """Greedily decode the model's output for ``sentence`` and return it as text.

    The sentence is tokenized with the notebook's ``tokenize`` helper, wrapped
    in ``<sos>``/``<eos>``, mapped to vocabulary ids (unknown words become
    ``<unk>``), then truncated or right-padded with ``<pad>`` to ``max_length``.
    Decoding starts from ``<sos>`` and stops at the first ``<eos>``/``<pad>``
    prediction or after ``max_length`` steps.

    Args:
        model: object exposing ``encoder`` and ``decoder`` callables.
        sentence: raw input text.
        word2idx: token -> id mapping containing the four special tokens.
        idx2word: id -> token mapping used to render the prediction.
        device: torch device the input tensors are moved to.
        max_length: encoder input length and maximum number of decoding steps.

    Returns:
        The predicted words joined by single spaces, without special tokens.
    """
    model.eval()  # switch the model to evaluation mode

    # Turn the wrapped sentence into a fixed-length list of vocabulary ids.
    wrapped = ['<sos>', *tokenize(sentence), '<eos>']
    unk_id = word2idx['<unk>']
    source_ids = [word2idx.get(tok, unk_id) for tok in wrapped][:max_length]
    shortfall = max_length - len(source_ids)
    if shortfall > 0:
        source_ids.extend([word2idx['<pad>']] * shortfall)

    with torch.no_grad():  # inference only: no autograd bookkeeping
        source = torch.tensor([source_ids]).to(device)  # leading batch dimension of 1
        hidden, cell = model.encoder(source)

        sos_id = word2idx['<sos>']
        stop_ids = (word2idx['<eos>'], word2idx['<pad>'])
        decoded = [sos_id]
        for _ in range(max_length):
            step_input = torch.tensor([decoded[-1]]).to(device)  # feed back the last token
            logits, hidden, cell = model.decoder(step_input, hidden, cell)
            next_id = logits.argmax(1).item()  # greedy choice
            decoded.append(next_id)
            if next_id in stop_ids:
                break

    # Render the ids as words, leaving out the special tokens.
    hidden_ids = (sos_id, *stop_ids)
    words = [idx2word[i] for i in decoded if i not in hidden_ids]
    return ' '.join(words)


### Preprocessing and Model Definition
Loads the dataset, preprocesses the text, builds the vocabulary, and defines the Seq2Seq model components (Encoder, Decoder, and the Seq2Seq wrapper). The same cell then trains the model and runs a sample inversion on one test sentence.

In [None]:
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm

# 2. Load and Preprocess the Data
# Read the paired reviews; the columns used below are 'original_review' and 'inverted_review'.
data = pd.read_csv('finaldataset.csv')
# Replace missing cells with empty strings so tokenize() can call .lower() on every value.
data = data.fillna('')
# Pool both columns so the vocabulary covers source and target text alike.
sentences = data['original_review'].tolist() + data['inverted_review'].tolist()

# 3. Tokenize Text and Build Vocabulary
def tokenize(text):
    """Split ``text`` into lower-case alphanumeric tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed (so "car's" becomes "cars"), and the remainder is
    split on whitespace.

    Args:
        text: input string.

    Returns:
        List of tokens; empty when nothing survives the cleaning.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()

def build_vocab(sentences, min_freq=1):
    """Build the token<->id lookup tables for a corpus.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>`` (in
    that order); the remaining ids follow the alphabetical order of the kept
    tokens.

    Args:
        sentences: iterable of raw strings, tokenized with ``tokenize``.
        min_freq: minimum corpus count a token needs to be kept.

    Returns:
        ``(word2idx, idx2word)``: two dicts that are inverses of each other.
    """
    # Count every token across the whole corpus in one pass.
    counts = Counter(tok for text in sentences for tok in tokenize(text))
    kept = sorted(tok for tok, n in counts.items() if n >= min_freq)

    ordered = ['<pad>', '<sos>', '<eos>', '<unk>', *kept]
    word2idx = dict(zip(ordered, range(len(ordered))))
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    return word2idx, idx2word

# Build the lookup tables from every review (min_freq defaults to 1, so no token is dropped).
word2idx, idx2word = build_vocab(sentences)
# Number of distinct ids; passed to the Encoder and Decoder constructors below.
vocab_size = len(word2idx)

# 4. Prepare Data Loaders
class ReviewDataset(Dataset):
    """Pairs of (original, inverted) reviews encoded as fixed-length id tensors.

    Each text is tokenized, wrapped in ``<sos>``/``<eos>``, mapped through
    ``word2idx`` (unknown tokens become ``<unk>``), cut to ``max_len`` ids and
    right-padded with ``<pad>``.

    NOTE: truncation happens after ``<eos>`` is appended, so sequences longer
    than ``max_len`` lose their trailing ``<eos>``.
    """

    def __init__(self, data, word2idx, max_len=30):
        # data: table read positionally via .iloc, with 'original_review' and
        # 'inverted_review' columns.
        self.data = data
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        # Encode one review as a 1-D tensor of exactly max_len ids.
        tokens = ['<sos>', *tokenize(text), '<eos>']
        unk_id = self.word2idx['<unk>']
        ids = [self.word2idx.get(tok, unk_id) for tok in tokens][:self.max_len]
        missing = self.max_len - len(ids)
        if missing > 0:
            ids.extend([self.word2idx['<pad>']] * missing)
        return torch.tensor(ids)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return self._encode(row['original_review']), self._encode(row['inverted_review'])

# Wrap the DataFrame; max_len is left at the class default of 30.
dataset = ReviewDataset(data, word2idx)
batch_size = 64
# Reshuffle every epoch and load batches with 4 worker processes.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# 5. Define Encoder and Decoder Classes
class Encoder(nn.Module):
    """Embeds a batch of token ids and summarises it with an LSTM.

    The ``<pad>`` id is read from the notebook-level ``word2idx`` mapping and
    used as the embedding's ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        pad_id = word2idx['<pad>']
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for the id batch ``x``."""
        # The per-step LSTM outputs are not needed, only the final state.
        _, (h_n, c_n) = self.lstm(self.embedding(x))
        return h_n, c_n

class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token over the vocabulary.

    The ``<pad>`` id is read from the notebook-level ``word2idx`` mapping and
    used as the embedding's ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        pad_id = word2idx['<pad>']
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance one step.

        Args:
            x: 1-D tensor with the previous token id of each sequence.
            hidden, cell: LSTM state from the encoder or the previous step.

        Returns:
            ``(predictions, hidden, cell)``: vocabulary scores for the next
            token plus the updated LSTM state.
        """
        step = self.embedding(x.unsqueeze(1))  # add a length-1 time axis
        lstm_out, (hidden, cell) = self.lstm(step, (hidden, cell))
        return self.fc_out(lstm_out.squeeze(1)), hidden, cell

# 6. Implement the Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time.

    Args:
        encoder: module mapping a batch of source ids to ``(hidden, cell)``.
        decoder: module with signature ``(input_ids, hidden, cell) ->
            (scores, hidden, cell)`` that exposes its output layer as
            ``fc_out`` (used to size the output buffer).
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Score every target position after the first.

        Args:
            source: ``(batch, src_len)`` tensor of source ids.
            target: ``(batch, target_len)`` tensor of target ids; column 0 is
                fed to the decoder as its first input.
            teacher_forcing_ratio: per-step probability of feeding the gold
                token instead of the decoder's own greedy prediction.

        Returns:
            ``(batch, target_len, vocab)`` tensor of scores. Position 0 is
            never written and stays all zeros.
        """
        batch_size, target_len = source.size(0), target.size(1)
        # Take the output width from the decoder itself rather than from a
        # module-level vocabulary, so the model does not depend on a global.
        target_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, target_len, target_vocab_size, device=self.device)

        hidden, cell = self.encoder(source)

        decoder_input = target[:, 0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, t] = output
            # One coin flip per step, shared by the whole batch.
            teacher_force = np.random.random() < teacher_forcing_ratio
            decoder_input = target[:, t] if teacher_force else output.argmax(1)

        return outputs

# 7. Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Model and optimisation hyperparameters.
embed_size = 128
hidden_size = 256
num_layers = 1
learning_rate = 0.001
num_epochs = 50

# Seq2Seq holds references to these encoder/decoder objects, so model.parameters() covers both.
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers).to(device)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
    for idx, (src, trg) in progress_bar:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        # teacher_forcing_ratio is left at the forward() default of 0.5.
        output = model(src, trg)
        output_dim = output.shape[-1]

        # Drop position 0: forward() starts decoding at t=1 and never writes it,
        # and the target token there is <sos>. Then flatten to (tokens, vocab).
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Clip the total gradient norm to 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        total_loss += loss.item()

        progress_bar.set_postfix(loss=loss.item())

    # Epoch loss is the mean of the per-batch losses.
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# 8. Test the Model
def invert_sentence(model, sentence, word2idx, idx2word, device, max_length=30):
    """Run greedy decoding on ``sentence`` and return the predicted text.

    The input is tokenized with ``tokenize``, wrapped in ``<sos>``/``<eos>``,
    converted to ids (``<unk>`` for unknown words) and truncated or padded
    with ``<pad>`` to ``max_length``. Decoding begins at ``<sos>`` and ends at
    the first predicted ``<eos>``/``<pad>`` or after ``max_length`` steps.

    Args:
        model: object exposing ``encoder`` and ``decoder`` callables.
        sentence: raw input text.
        word2idx: token -> id mapping containing the four special tokens.
        idx2word: id -> token mapping used to render the prediction.
        device: torch device the input tensors are moved to.
        max_length: encoder input length and decoding step limit; the default
            equals ``ReviewDataset``'s default ``max_len``.

    Returns:
        Space-joined predicted words with the special tokens removed.
    """
    model.eval()

    # Fixed-length id sequence for the encoder.
    unk_id = word2idx['<unk>']
    pieces = ['<sos>', *tokenize(sentence), '<eos>']
    encoded = [word2idx.get(piece, unk_id) for piece in pieces][:max_length]
    gap = max_length - len(encoded)
    if gap > 0:
        encoded.extend([word2idx['<pad>']] * gap)

    with torch.no_grad():
        batch = torch.tensor([encoded]).to(device)  # batch of one sentence
        hidden, cell = model.encoder(batch)

        start_id = word2idx['<sos>']
        end_id = word2idx['<eos>']
        pad_id = word2idx['<pad>']
        generated = [start_id]
        for _ in range(max_length):
            last_token = torch.tensor([generated[-1]]).to(device)
            scores, hidden, cell = model.decoder(last_token, hidden, cell)
            choice = scores.argmax(1).item()  # greedy: highest-scoring token
            generated.append(choice)
            if choice in (end_id, pad_id):
                break

    special_ids = (start_id, end_id, pad_id)
    return ' '.join(idx2word[i] for i in generated if i not in special_ids)

# Test the model with a new sentence
# Smoke test on a single hand-written review, decoded with the model trained above.
test_sentence = "Blends in seamlessly with my car’s interior."
inverted = invert_sentence(model, test_sentence, word2idx, idx2word, device)
print("Original Sentence:", test_sentence)
print("Inverted Sentence:", inverted)


Using device: cuda


                                                                        

Epoch [1/50], Loss: 7.2989


                                                                        

Epoch [2/50], Loss: 6.6283


                                                                        

Epoch [3/50], Loss: 6.5072


                                                                        

Epoch [4/50], Loss: 6.3922


                                                                        

Epoch [5/50], Loss: 6.2846


                                                                        

Epoch [6/50], Loss: 6.1777


                                                                        

Epoch [7/50], Loss: 6.0523


                                                                        

Epoch [8/50], Loss: 5.9649


                                                                        

Epoch [9/50], Loss: 5.8729


                                                                         

Epoch [10/50], Loss: 5.7735


                                                                         

Epoch [11/50], Loss: 5.6607


                                                                         

Epoch [12/50], Loss: 5.5771


                                                                         

Epoch [13/50], Loss: 5.4996


                                                                         

Epoch [14/50], Loss: 5.4236


                                                                         

Epoch [15/50], Loss: 5.3450


                                                                         

Epoch [16/50], Loss: 5.2723


                                                                         

Epoch [17/50], Loss: 5.1901


                                                                         

Epoch [18/50], Loss: 5.0795


                                                                         

Epoch [19/50], Loss: 5.0313


                                                                         

Epoch [20/50], Loss: 4.9459


                                                                         

Epoch [21/50], Loss: 4.8689


                                                                         

Epoch [22/50], Loss: 4.7932


                                                                         

Epoch [23/50], Loss: 4.6858


                                                                         

Epoch [24/50], Loss: 4.6311


                                                                         

Epoch [25/50], Loss: 4.5691


                                                                         

Epoch [26/50], Loss: 4.4940


                                                                         

Epoch [27/50], Loss: 4.4188


                                                                         

Epoch [28/50], Loss: 4.3567


                                                                         

Epoch [29/50], Loss: 4.2640


                                                                         

Epoch [30/50], Loss: 4.1826


                                                                         

Epoch [31/50], Loss: 4.1607


                                                                         

Epoch [32/50], Loss: 4.0927


                                                                         

Epoch [33/50], Loss: 3.9785


                                                                         

Epoch [34/50], Loss: 3.9691


                                                                         

Epoch [35/50], Loss: 3.8556


                                                                         

Epoch [36/50], Loss: 3.7970


                                                                         

Epoch [37/50], Loss: 3.7765


                                                                         

Epoch [38/50], Loss: 3.6902


                                                                         

Epoch [39/50], Loss: 3.6224


                                                                         

Epoch [40/50], Loss: 3.5434


                                                                         

Epoch [41/50], Loss: 3.5304


                                                                         

Epoch [42/50], Loss: 3.4519


                                                                         

Epoch [43/50], Loss: 3.4365


                                                                         

Epoch [44/50], Loss: 3.3610


                                                                         

Epoch [45/50], Loss: 3.3168


                                                                         

Epoch [46/50], Loss: 3.2765


                                                                         

Epoch [47/50], Loss: 3.1923


                                                                         

Epoch [48/50], Loss: 3.0982


                                                                         

Epoch [49/50], Loss: 3.0704


                                                                         

Epoch [50/50], Loss: 3.0326
Original Sentence: Blends in seamlessly with my car’s interior.
Inverted Sentence: makes out awkwardly against my cars interior




### Saving the Model
This block saves the trained model's parameters and vocabulary to a file for future use.

In [None]:
# Save the model and vocabulary
# The checkpoint bundles the weights with the vocabulary and the constructor
# arguments, so the encoder and decoder can be rebuilt without re-reading the CSV.
# NOTE(review): the evaluation cell loads 'seq2seq_modelfinal.pth', not this
# path; confirm which file is the intended checkpoint.
save_path = 'seq2seq_model.pth'  # Specify the save path for the model
torch.save({
    'encoder_state_dict': encoder.state_dict(),  # Save encoder parameters
    'decoder_state_dict': decoder.state_dict(),  # Save decoder parameters
    'word2idx': word2idx,  # Save the word-to-index mapping
    'idx2word': idx2word,  # Save the index-to-word mapping
    'embed_size': embed_size,  # Save embedding size
    'hidden_size': hidden_size,  # Save hidden layer size
    'num_layers': num_layers,  # Save number of layers
}, save_path)

print(f"Model saved to {save_path}")  # Print confirmation message

Model saved to seq2seq_model.pth


### Loading and Evaluating the Model
Loads the saved model, prepares a dataset for evaluation, and computes the model's performance using perplexity.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from collections import Counter
from tqdm import tqdm

# Load Saved Model and Vocabulary
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the save cell writes 'seq2seq_model.pth' while this cell reads
# 'seq2seq_modelfinal.pth'; confirm the file was renamed on purpose.
# weights_only=True restricts unpickling to tensors and plain containers
# (dict/str/int), which is all the save cell stores, so a tampered file cannot
# run arbitrary code on load. map_location remaps GPU-saved tensors onto the
# device chosen above.
checkpoint = torch.load('seq2seq_modelfinal.pth', map_location=device, weights_only=True)

# Vocabulary and architecture settings stored next to the weights.
word2idx = checkpoint['word2idx']
idx2word = checkpoint['idx2word']
embed_size = checkpoint['embed_size']
hidden_size = checkpoint['hidden_size']
num_layers = checkpoint['num_layers']
vocab_size = len(word2idx)

# Tokenization function
def tokenize(text):
    """Return the lower-case alphanumeric tokens of ``text``.

    Mirrors the tokenizer of the training cell: lower-case the text, delete
    every character that is not ``a-z``, ``0-9`` or whitespace, then split on
    whitespace.

    Args:
        text: input string.

    Returns:
        List of tokens; empty when nothing survives the cleaning.
    """
    stripped = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return stripped.split()

# Encoder, Decoder, and Seq2Seq classes
class Encoder(nn.Module):
    """Embedding + LSTM encoder; layer names match the saved ``encoder_state_dict``.

    The ``<pad>`` id is read from the notebook-level ``word2idx`` mapping and
    used as the embedding's ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        pad_id = word2idx['<pad>']
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for the id batch ``x``."""
        # Only the final state is passed on; per-step outputs are discarded.
        _, (h_n, c_n) = self.lstm(self.embedding(x))
        return h_n, c_n

class Decoder(nn.Module):
    """Single-step LSTM decoder; layer names match the saved ``decoder_state_dict``.

    The ``<pad>`` id is read from the notebook-level ``word2idx`` mapping and
    used as the embedding's ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        pad_id = word2idx['<pad>']
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance one step.

        Args:
            x: 1-D tensor with the previous token id of each sequence.
            hidden, cell: LSTM state from the encoder or the previous step.

        Returns:
            ``(predictions, hidden, cell)``: vocabulary scores for the next
            token plus the updated LSTM state.
        """
        step = self.embedding(x.unsqueeze(1))  # add a length-1 time axis
        lstm_out, (hidden, cell) = self.lstm(step, (hidden, cell))
        return self.fc_out(lstm_out.squeeze(1)), hidden, cell

class Seq2Seq(nn.Module):
    """Inference-only encoder-decoder wrapper (everything runs under ``no_grad``).

    Args:
        encoder: module mapping a batch of source ids to ``(hidden, cell)``.
        decoder: module with signature ``(input_ids, hidden, cell) ->
            (scores, hidden, cell)`` that exposes its output layer as
            ``fc_out`` (used to size the output buffer).
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.0):
        """Score every target position after the first, without gradients.

        Args:
            source: ``(batch, src_len)`` tensor of source ids.
            target: ``(batch, target_len)`` tensor of target ids; column 0 is
                fed to the decoder as its first input.
            teacher_forcing_ratio: per-step probability of feeding the gold
                token instead of the greedy prediction. The default of 0.0
                gives pure greedy (free-running) decoding.

        Returns:
            ``(batch, target_len, vocab)`` tensor of scores. Position 0 is
            never written and stays all zeros.

        NOTE: likelihood-based metrics such as perplexity are conventionally
        computed with the gold prefix as input, i.e. ``teacher_forcing_ratio=1.0``.
        The default is kept at 0.0 so existing callers see unchanged results.
        """
        batch_size, target_len = source.size(0), target.size(1)
        # Take the output width from the decoder itself rather than from a
        # module-level vocabulary, so the model does not depend on a global.
        target_vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, target_len, target_vocab_size, device=self.device)

        with torch.no_grad():
            hidden, cell = self.encoder(source)

            decoder_input = target[:, 0]
            for t in range(1, target_len):
                output, hidden, cell = self.decoder(decoder_input, hidden, cell)
                outputs[:, t] = output
                # Draw a random number only when teacher forcing is enabled, so
                # the default path is deterministic and leaves NumPy's RNG alone.
                use_target = teacher_forcing_ratio > 0 and np.random.random() < teacher_forcing_ratio
                decoder_input = target[:, t] if use_target else output.argmax(1)

        return outputs

# Re-load models and set to eval
# Rebuild the architecture from the checkpoint's settings, then restore the weights.
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers).to(device)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

# model references these same encoder/decoder objects, so loading into them updates model too.
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
model.eval()

# Dataset and DataLoader for Testing
class ReviewDataset(Dataset):
    """Pairs of (original, inverted) reviews encoded as fixed-length id tensors.

    Each text is tokenized, wrapped in ``<sos>``/``<eos>``, mapped through
    ``word2idx`` (unknown tokens become ``<unk>``), cut to ``max_len`` ids and
    right-padded with ``<pad>``.

    NOTE: truncation happens after ``<eos>`` is appended, so sequences longer
    than ``max_len`` lose their trailing ``<eos>``.
    """

    def __init__(self, data, word2idx, max_len=30):
        # data: table read positionally via .iloc, with 'original_review' and
        # 'inverted_review' columns.
        self.data = data
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        # Encode one review as a 1-D tensor of exactly max_len ids.
        tokens = ['<sos>', *tokenize(text), '<eos>']
        unk_id = self.word2idx['<unk>']
        ids = [self.word2idx.get(tok, unk_id) for tok in tokens][:self.max_len]
        missing = self.max_len - len(ids)
        if missing > 0:
            ids.extend([self.word2idx['<pad>']] * missing)
        return torch.tensor(ids)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return self._encode(row['original_review']), self._encode(row['inverted_review'])

# Load the test data
# NOTE(review): this is the same 'finaldataset.csv' the training cell reads, so
# the metrics below are measured on the training data, not a held-out split.
test_data = pd.read_csv('finaldataset.csv')
test_data = test_data.fillna('')  # empty strings instead of NaN so tokenize() gets strings
test_dataset = ReviewDataset(test_data, word2idx, max_len=30)
# No shuffling: evaluation order does not matter.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Compute Perplexity
# Perplexity = exp(mean negative log-likelihood per non-padding target token).
# The same criterion as training is used, but batch losses are weighted by
# their token counts: a plain mean of per-batch means would give a short final
# batch (or a batch of short reviews) the same weight as a full one.
pad_idx = word2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

model.eval()
test_loss = 0.0    # summed negative log-likelihood over all scored tokens
total_tokens = 0   # number of non-padding target tokens scored
with torch.no_grad():
    for src, trg in tqdm(test_loader, desc="Computing Perplexity"):
        src = src.to(device)
        trg = trg.to(device)

        output = model(src, trg, teacher_forcing_ratio=0.0)
        output_dim = output.shape[-1]

        # Drop position 0 (never predicted; the target there is <sos>) and
        # flatten to (tokens, vocab) / (tokens,).
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        n_tokens = (trg != pad_idx).sum().item()
        if n_tokens == 0:
            # Nothing to score; the mean-reduced loss would be NaN here.
            continue

        loss = criterion(output, trg)  # mean over this batch's non-padding tokens
        test_loss += loss.item() * n_tokens
        total_tokens += n_tokens

avg_loss = test_loss / total_tokens if total_tokens else float('nan')
perplexity = np.exp(avg_loss)
print(f"Test Loss: {avg_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")


  checkpoint = torch.load('seq2seq_modelfinal.pth', map_location=device)
Computing Perplexity: 100%|██████████| 113/113 [00:06<00:00, 17.09it/s]

Test Loss: 4.4287
Perplexity: 83.8254





### Inverting Test Sentences
Uses the loaded model to invert a list of predefined test phrases and prints both the original and inverted sentences.

In [None]:
# Test the model on predefined sentences
# Qualitative check: decode a handful of hand-written reviews.
# NOTE(review): invert_sentence is not defined in the evaluation cell; this relies
# on a definition from an earlier cell still being in the notebook namespace.
test_phrases = [  # List of test phrases for evaluation
    "This product works great.",
    "I hate this, it is bad.",
    "This is perfect!",
    "Worst experience of my life.",
    "The quality exceeded my expectations.",
    "I would definitely buy this again.",
    "Horrible service at the restaurant.",
    "The color does not match the picture.",
    "I am extremely satisfied with the purchase.",
    "Could have been better."
]

for phrase in test_phrases:  # Iterate through test phrases
    inverted = invert_sentence(model, phrase, word2idx, idx2word, device)  # Invert each sentence
    print("Original:", phrase)  # Print the original sentence
    print("Inverted:", inverted)  # Print the inverted sentence
    print("----------------------------------")  # Print separator for readability

Original: This product works great.
Inverted: terrible product doesnt work at all
----------------------------------
Original: I hate this, it is bad.
Inverted: i absolutely love it
----------------------------------
Original: This is perfect!
Inverted: this is is terrible
----------------------------------
Original: Worst experience of my life.
Inverted: best of my time this
----------------------------------
Original: The quality exceeded my expectations.
Inverted: the quality has exceeded my expectations and durability my durability even after extensive use
----------------------------------
Original: I would definitely buy this again.
Inverted: i would rate it buy it
----------------------------------
Original: Horrible service at the restaurant.
Inverted: excellent quality even better
----------------------------------
Original: The color does not match the picture.
Inverted: the handle is is impressively secure
----------------------------------
Original: I am extremely satisfied