In [39]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import requests
import matplotlib.pyplot as plt
import time

# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of tokenising an error page as the corpus.
response.raise_for_status()
text = response.text  # This is the entire text data

# Step 2: Prepare the dataset
sequence_length = 20
# Create a character mapping to integers (sorted so the ids are stable
# across runs for the same text)
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

class CharDataset(Dataset):
    """Map-style dataset that pairs each stored sequence with its target.

    ``sequences`` and ``targets`` are kept exactly as given; item ``i`` is
    the tuple ``(sequences[i], targets[i])`` and the dataset length is
    ``len(sequences)``.
    """

    def __init__(self, sequences, targets):
        # Parallel containers: position i of one belongs to position i of the other.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The number of samples is defined by the input sequences.
        sample_count = len(self.sequences)
        return sample_count

    def __getitem__(self, index):
        # Look both halves of the sample up with the same index.
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

def get_dataset(encoded_text, batch_size=128, sequence_length=20):
    """Build train/test loaders of next-character prediction samples.

    Every window of ``sequence_length`` consecutive codes in ``encoded_text``
    becomes one input, and the code immediately after the window is its
    target. The samples are split 80/20 at random into a train and a test
    subset.

    Args:
        encoded_text: 1-D sequence of integer character codes.
        batch_size: Batch size used by both returned loaders.
        sequence_length: Number of codes in each input window.

    Returns:
        ``(train_loader, test_loader)``. The train loader shuffles on every
        pass; the test loader keeps a fixed order.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1 or the text is
            too short to yield a single (window, target) pair.
    """
    data = torch.as_tensor(encoded_text, dtype=torch.long)
    num_samples = data.size(0) - sequence_length
    if sequence_length < 1 or num_samples < 1:
        raise ValueError(
            f"Need sequence_length >= 1 and more than sequence_length encoded "
            f"characters; got sequence_length={sequence_length} and "
            f"{data.size(0)} characters."
        )

    # Step 3: Slice the text into sliding windows.
    # unfold() returns overlapping views of `data` instead of materialising
    # one Python list per window; the last window is dropped because no
    # character follows it to serve as a target.
    sequences = data.unfold(0, sequence_length, 1)[:num_samples]
    targets = data[sequence_length:]
    dataset = CharDataset(sequences, targets)

    # Step 4: Create data loaders (honouring the caller's batch_size)
    train_size = int(len(dataset) * 0.8)
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

    return train_loader, test_loader



In [10]:
from torch import nn
class CharRNN(nn.Module):
    """Next-character classifier: embedding -> LSTM/GRU -> fully connected head.

    Args:
        input_size: Number of distinct input tokens (embedding rows).
        hidden_size: Width of both the embedding and the recurrent layer.
        output_size: Number of output logits.
        dropout: Dropout probability applied after each hidden FC layer.
            Pass ``False`` or ``None`` to disable dropout entirely.
        model_type: ``'lstm'`` or ``'gru'``.
        fc_layer_num: Number of hidden ``fc_hidden_size -> fc_hidden_size``
            layers between the input and output projections.
        fc_hidden_size: Width of the fully connected head.

    Raises:
        ValueError: If ``model_type`` is neither ``'lstm'`` nor ``'gru'``.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.5, model_type='lstm', fc_layer_num=1, fc_hidden_size=128):
        super().__init__()
        # Reject an unknown recurrent type here rather than failing later in
        # forward() with a missing `rnn` attribute.
        rnn_classes = {'lstm': nn.LSTM, 'gru': nn.GRU}
        if model_type not in rnn_classes:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; expected 'lstm' or 'gru'."
            )

        self.hidden_size = hidden_size
        self.fc_layer_num = fc_layer_num

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = rnn_classes[model_type](hidden_size, hidden_size, batch_first=True)

        self.fc_input_layer = nn.Linear(hidden_size, fc_hidden_size)
        self.fc_layers = nn.ModuleList([nn.Linear(fc_hidden_size, fc_hidden_size) for _ in range(fc_layer_num)])
        self.fc_output_layer = nn.Linear(fc_hidden_size, output_size)

        # Always define the attribute so forward() can test it safely;
        # None means "dropout disabled".
        if dropout is False or dropout is None:
            self.dropout = None
        else:
            self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape ``(batch, output_size)`` for token ids ``x``.

        ``x`` is indexed as ``(batch, time)``: the recurrent layer runs with
        ``batch_first=True`` and only its last time step feeds the head.
        """
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Keep only the final time step of the recurrent output.
        output = output[:, -1, :]
        # Note: no activation is applied between the input projection and the
        # first hidden layer.
        output = self.fc_input_layer(output)
        for fc_layer in self.fc_layers:
            output = fc_layer(output)
            output = torch.relu(output)
            if self.dropout is not None:
                output = self.dropout(output)
        output = self.fc_output_layer(output)
        return output


In [83]:
def train(model, train_loader, test_loader, epochs, learning_rate, device):
    """Fit ``model`` with Adam and cross-entropy, evaluating after each epoch.

    Args:
        model: Module mapping an input batch to class logits.
        train_loader: Iterable of ``(inputs, labels)`` batches used for fitting.
        test_loader: Iterable of ``(inputs, labels)`` batches used for evaluation.
        epochs: Number of full passes over ``train_loader``.
        learning_rate: Learning rate handed to ``torch.optim.Adam``.
        device: Device the model and every batch are moved to.

    Returns:
        ``(train_losses, val_losses, val_accuracys)``: three lists with one
        entry per epoch. Losses are means over batches; accuracy is the
        fraction of correctly predicted evaluation samples.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_losses, val_losses, val_accuracys = [], [], []

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        train_losses.append(running_loss / len(train_loader))

        # ---- evaluation pass (no gradients tracked) ----
        model.eval()
        running_loss, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                logits = model(inputs)
                running_loss += criterion(logits, labels).item()
                # Index of the largest logit per row is the predicted class.
                predicted = torch.max(logits, 1).indices
                seen += labels.size(0)
                hits += (predicted == labels).sum().item()
        val_losses.append(running_loss / len(test_loader))
        val_accuracy = hits / seen
        val_accuracys.append(val_accuracy)

        # Progress line, emitted once per epoch.
        print(f"Epoch: {epoch+1}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}, Val Accuracy: {val_accuracy}")

    return train_losses, val_losses, val_accuracys

def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length, device='cuda'):
    """Greedily predict the character that follows ``initial_str``.

    Only the last ``max_length`` characters of ``initial_str`` are encoded
    with ``char_to_ix`` and fed to ``model`` as a single-sample batch. The
    index of the highest-scoring logit is decoded with ``ix_to_char``.

    Returns:
        The predicted next character.
    """
    model.eval()
    with torch.no_grad():
        # Encode just the trailing context window.
        context = initial_str[-max_length:]
        token_ids = [char_to_ix[c] for c in context]
        # unsqueeze(0) adds the batch dimension: one sample per call.
        batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

        logits = model(batch)
        best_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[best_index]

def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are excluded from the count.
        if param.requires_grad:
            total += param.numel()
    return total

def plot_losses_and_accuracy(train_losses, val_losses, val_accuracys):
    """Show two figures: the loss curves, then the validation accuracy curve.

    Each argument is a sequence with one value per epoch; values are plotted
    against their position in the sequence.
    """
    # Figure 1: training and validation loss on shared axes.
    loss_curves = ((train_losses, 'Train Loss'), (val_losses, 'Val Loss'))
    for series, caption in loss_curves:
        plt.plot(series, label=caption)
    plt.legend()
    plt.show()

    # Figure 2: validation accuracy on its own axes.
    plt.plot(val_accuracys, label='Val Accuracy')
    plt.legend()
    plt.show()



In [84]:
def get_results(hidden_size = 128,
                epochs = 20,
                batch_size = 64,
                learning_rate = 0.003, 
                device= 'cuda',
                chars = chars,
                char_to_int = char_to_int,
                int_to_char = int_to_char,
                sequence_length = 20,
                model_type = 'lstm',
                train_times = 10,
                if_dropout = 0.5,
                fc_layer_num = 1,
                fc_hidden_size = 128):
    """Train a CharRNN configuration, sample text from it and report metrics.

    The dataset is built once from the module-level ``encoded_text``. A fresh
    model is then trained ``train_times`` times so the wall-clock time can be
    averaged; the metrics and the generated text come from the last run.

    Args:
        hidden_size: Embedding / recurrent width passed to ``CharRNN``.
        epochs: Epochs per training run.
        batch_size: Batch size passed to ``get_dataset``.
        learning_rate: Learning rate passed to ``train``.
        device: Device used for training and generation.
        chars: Vocabulary; its length sets the model's input and output size.
        char_to_int: Character -> id mapping used when generating text.
        int_to_char: Id -> character mapping used for decoding.
        sequence_length: Context window length for the dataset and generation.
        model_type: ``'lstm'`` or ``'gru'``.
        train_times: Number of repeated training runs (must be >= 1).
        if_dropout: Dropout setting forwarded to ``CharRNN``.
        fc_layer_num: Number of hidden FC layers in the model head.
        fc_hidden_size: Width of the model head.

    Returns:
        Dict with the model type, window length, hidden size, final validation
        accuracy, average running time per training run and parameter count.

    Raises:
        ValueError: If ``train_times`` is smaller than 1.
    """
    # Without at least one run there is no model to report on and the average
    # time would divide by zero.
    if train_times < 1:
        raise ValueError(f"train_times must be at least 1, got {train_times}")

    train_loader, test_loader = get_dataset(encoded_text, batch_size=batch_size, sequence_length=sequence_length)

    time_1 = time.time()
    for _ in range(train_times):
        # Repeated only to average the timing; each run starts from a fresh model.
        model = CharRNN(len(chars), hidden_size, len(chars), dropout=if_dropout, model_type=model_type, fc_layer_num=fc_layer_num, fc_hidden_size=fc_hidden_size)
        train_losses, val_losses, val_accuracys = train(model, train_loader, test_loader, epochs, learning_rate, device)
    time_2 = time.time()
    average_time = (time_2 - time_1) / train_times
    parameter_count = count_parameters(model)

    # Use a fixed test sample (index 10) as the generation seed; this requires
    # the test split to hold at least 11 samples.
    seed_ids, label_id = test_loader.dataset[10]
    target_str = ''.join(int_to_char[i] for i in seed_ids.tolist())
    target_y = int_to_char[label_id.item()]

    # Greedy generation: append every predicted character so the next
    # prediction sees it and the printed text contains all 50 of them.
    for _ in range(50):
        target_str += predict_next_char(model, char_to_int, int_to_char, target_str, sequence_length, device)

    print("==========================================results==========================================")
    print(f'model type: {model_type}, hidden size: {hidden_size}')
    print(f'final training loss: {train_losses[-1]}, final validation loss: {val_losses[-1]}, final validation accuracy: {val_accuracys[-1]}')
    print(f'results:{target_str}, label: {target_y}')
    print(f'Average running time: {average_time}')
    print(f'model complexity(number of parameters): {parameter_count}')
    print("===========================================================================================")
    return {'model_type': model_type, 'max_length':sequence_length, 'hidden_size': hidden_size, 'final validation accuracy': val_accuracys[-1], 'Average running time': average_time, 'model complexity(number of parameters)': parameter_count}


In [None]:
# Experiment: LSTM with a 20-character context window, one training run.
get_results(
    model_type='lstm',
    sequence_length=20,
    hidden_size=128,
    if_dropout=0.5,
    epochs=20,
    batch_size=128,
    learning_rate=0.003,
    train_times=1,
    device='cuda',
    chars=chars,
    char_to_int=char_to_int,
    int_to_char=int_to_char,
)

Epoch: 1, Train Loss: 1.9241704254128504, Val Loss: 1.6752559433568843, Val Accuracy: 0.49378908438865854
Epoch: 2, Train Loss: 1.7427712759784222, Val Loss: 1.614298998728198, Val Accuracy: 0.5119623445029698
Epoch: 3, Train Loss: 1.6966177107989617, Val Loss: 1.5840329580859625, Val Accuracy: 0.518973439426202
Epoch: 4, Train Loss: 1.671986974745869, Val Loss: 1.563457570743506, Val Accuracy: 0.5234652022862266
Epoch: 5, Train Loss: 1.6547221989379965, Val Loss: 1.5537530172849208, Val Accuracy: 0.5293914602712092
Epoch: 6, Train Loss: 1.646740502673049, Val Loss: 1.540959621597417, Val Accuracy: 0.5304269864395382
Epoch: 7, Train Loss: 1.6382493785381043, Val Loss: 1.5489631322903394, Val Accuracy: 0.5306869886809369
Epoch: 8, Train Loss: 1.6340533012264566, Val Loss: 1.5374829868304614, Val Accuracy: 0.5345152975456685
Epoch: 9, Train Loss: 1.6320273638096474, Val Loss: 1.5420423599747632, Val Accuracy: 0.5322873473047182
Epoch: 10, Train Loss: 1.628572565005965, Val Loss: 1.528740

In [None]:
# Experiment: LSTM with a 30-character context window, one training run.
get_results(
    model_type='lstm',
    sequence_length=30,
    hidden_size=128,
    if_dropout=0.5,
    epochs=20,
    batch_size=128,
    learning_rate=0.003,
    train_times=1,
    device='cuda',
    chars=chars,
    char_to_int=char_to_int,
    int_to_char=int_to_char,
)

In [None]:
# Experiment: GRU with a 20-character context window, one training run.
get_results(
    model_type='gru',
    sequence_length=20,
    hidden_size=128,
    if_dropout=0.5,
    epochs=20,
    batch_size=128,
    learning_rate=0.003,
    train_times=1,
    device='cuda',
    chars=chars,
    char_to_int=char_to_int,
    int_to_char=int_to_char,
)

In [None]:
# Experiment: GRU with a 30-character context window, one training run.
get_results(
    model_type='gru',
    sequence_length=30,
    hidden_size=128,
    if_dropout=0.5,
    epochs=20,
    batch_size=128,
    learning_rate=0.003,
    train_times=1,
    device='cuda',
    chars=chars,
    char_to_int=char_to_int,
    int_to_char=int_to_char,
)