In [1]:
# import libraries

import torch
from torch import nn # network cell, for LSTM
from torch import optim

In [4]:
text_name = "python_snippets_1000"

with open(f"texts/{text_name}.txt", "r") as file:
    text = file.read()

print("# of characters:", len(text))

unique_characters = set(text)
INPUT_SIZE = len(unique_characters)
print("# of unique characters (INPUT_SIZE):", INPUT_SIZE)

ordered_characters = sorted(unique_characters)

CHARACTER_ENCODING = dict(zip(ordered_characters, list(range(len(ordered_characters)))))

# of characters: 667482
# of unique characters (INPUT_SIZE): 151


In [5]:
def encode_char(character): # one hot
    encoding = torch.zeros(INPUT_SIZE)
    encoding[CHARACTER_ENCODING[character]] = 1
    return encoding

def encode_string(string):
    encoding = torch.zeros(len(string), INPUT_SIZE)
    for i in range(len(string)):
        encoding[i] = encode_char(string[i])
    return encoding

In [6]:
X = []
y = []

INPUT_SEQUENCE_LENGTH = 10

for i in range(len(text) - INPUT_SIZE):
    # The input sequence
    sequence = encode_string(text[i: i + INPUT_SEQUENCE_LENGTH])
    # The next character (one-hot encoded) as label
    next_character = encode_char(text[i + INPUT_SEQUENCE_LENGTH])    

    X.append(sequence)
    y.append(next_character)

X = torch.stack(X)  # Shape: (num_samples, sequence_length, INPUT_SIZE)
y = torch.stack(y)  # Shape: (num_samples, INPUT_SIZE)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: torch.Size([667331, 10, 151])
y shape: torch.Size([667331, 151])


In [7]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [8]:
class GRUCharPredictor(nn.Module):
    def __init__(self):
        super(GRUCharPredictor, self).__init__()
        self.lstm = nn.GRU(input_size=INPUT_SIZE, hidden_size=256, num_layers=3, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(256 * 2, INPUT_SIZE)  

    def forward(self, x):
        lstm_out, _ = self.lstm(x)

        linear_out = self.fc(lstm_out[:, -1, :])

        return linear_out

class LSTMCharPredictor(nn.Module):
    def __init__(self):
        super(LSTMCharPredictor, self).__init__()
        self.lstm = nn.LSTM(input_size=INPUT_SIZE, hidden_size=256, num_layers=3, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(256 * 2, INPUT_SIZE)  

    def forward(self, x):
        lstm_out, _ = self.lstm(x)

        linear_out = self.fc(lstm_out[:, -1, :])

        return linear_out

model_type = "lstm"

# Initialize the model
if model_type == "lstm":
    model = LSTMCharPredictor().to(device)
elif model_type == "gru":
    model = GRUCharPredictor().to(device)

print(model(X[0].unsqueeze(0).to(device)).size())

torch.Size([1, 151])


In [9]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Suitable for classification
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [10]:
num_epochs = 30
batch_size = 100

# Example training loop
for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X.size(0))
    
    for i in range(0, X.size(0), batch_size):
        indices = permutation[i:i+batch_size]
        batch_X, batch_y = X[indices].to(device), y[indices].to(device)
        
        # Forward pass
        outputs = model(batch_X)
        labels = torch.argmax(batch_y, dim=1)  # Convert one-hot to class indices
        
        # Loss calculation
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/30], Loss: 1.4349181652069092
Epoch [2/30], Loss: 0.813270628452301
Epoch [3/30], Loss: 0.9003188610076904
Epoch [4/30], Loss: 0.8389748334884644
Epoch [5/30], Loss: 1.2761476039886475
Epoch [6/30], Loss: 1.4571102857589722
Epoch [7/30], Loss: 0.8303962349891663
Epoch [8/30], Loss: 0.5914250016212463
Epoch [9/30], Loss: 0.8748084902763367
Epoch [10/30], Loss: 0.6436059474945068
Epoch [11/30], Loss: 0.5765541195869446
Epoch [12/30], Loss: 0.7960420250892639
Epoch [13/30], Loss: 0.9703429937362671
Epoch [14/30], Loss: 0.39227357506752014
Epoch [15/30], Loss: 0.6770938634872437
Epoch [16/30], Loss: 1.0812124013900757
Epoch [17/30], Loss: 0.628760039806366
Epoch [18/30], Loss: 0.1532524973154068
Epoch [19/30], Loss: 0.30108094215393066
Epoch [20/30], Loss: 0.43620914220809937
Epoch [21/30], Loss: 0.6956696510314941
Epoch [22/30], Loss: 0.5405467748641968
Epoch [23/30], Loss: 0.2519914209842682
Epoch [24/30], Loss: 0.6852290630340576
Epoch [25/30], Loss: 0.6391257643699646
Epoch [2

In [11]:
INDEX_ENCODING = {}

for char, i in CHARACTER_ENCODING.items():
    INDEX_ENCODING[i] = char

print(INDEX_ENCODING)
print(CHARACTER_ENCODING)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '$', 6: '%', 7: '&', 8: "'", 9: '(', 10: ')', 11: '*', 12: '+', 13: ',', 14: '-', 15: '.', 16: '/', 17: '0', 18: '1', 19: '2', 20: '3', 21: '4', 22: '5', 23: '6', 24: '7', 25: '8', 26: '9', 27: ':', 28: ';', 29: '<', 30: '=', 31: '>', 32: '?', 33: '@', 34: 'A', 35: 'B', 36: 'C', 37: 'D', 38: 'E', 39: 'F', 40: 'G', 41: 'H', 42: 'I', 43: 'J', 44: 'K', 45: 'L', 46: 'M', 47: 'N', 48: 'O', 49: 'P', 50: 'Q', 51: 'R', 52: 'S', 53: 'T', 54: 'U', 55: 'V', 56: 'W', 57: 'X', 58: 'Y', 59: 'Z', 60: '[', 61: '\\', 62: ']', 63: '^', 64: '_', 65: '`', 66: 'a', 67: 'b', 68: 'c', 69: 'd', 70: 'e', 71: 'f', 72: 'g', 73: 'h', 74: 'i', 75: 'j', 76: 'k', 77: 'l', 78: 'm', 79: 'n', 80: 'o', 81: 'p', 82: 'q', 83: 'r', 84: 's', 85: 't', 86: 'u', 87: 'v', 88: 'w', 89: 'x', 90: 'y', 91: 'z', 92: '{', 93: '|', 94: '}', 95: '~', 96: '°', 97: '²', 98: 'á', 99: 'ç', 100: 'é', 101: 'ñ', 102: '가', 103: '각', 104: '강', 105: '건', 106: '경', 107: '과', 108: '기', 109: '는', 110: '능

In [12]:
def get_next_sequence(sequence, deterministic=False):
    logits = model(
        encode_string(sequence[-INPUT_SEQUENCE_LENGTH:]).unsqueeze(0).to(device)
    ) # returns in size (1, INPUT_SEQUENCE_LENGTH, 78)

    probabilities = torch.softmax(logits[0], dim=1)

    character_indexes = torch.argmax(probabilities) if deterministic else torch.multinomial(probabilities, num_samples=1)

    next_sequence = [INDEX_ENCODING[int(index)] for index in character_indexes]
    
    return next_sequence

def get_next_char(sequence, deterministic=False):
    logits = model(
        encode_string(sequence[-INPUT_SEQUENCE_LENGTH:]).unsqueeze(0).to(device)
    ) # returns in size (1, 78)

    probabilities = torch.softmax(logits[0], 0)

    character_index = torch.argmax(probabilities) if deterministic else torch.multinomial(probabilities, num_samples=1)
    
    return INDEX_ENCODING[int(character_index)]

In [39]:
sequence = "d"
print(get_next_char(sequence))

 


In [20]:
def generate_text(starting_character, num_generated_characters):
    generated_text = starting_character

    with torch.no_grad():
        for _ in range(num_generated_characters):
            generated_text += get_next_char(generated_text)

    return generated_text

In [41]:
generated_text = generate_text("7", 1000)
print(generated_text)

7 + current_value = {}  # Dictionary to store the most common divisor (gcd) the lamin, required_up(xml_to_x):
        num += 1
    
    total_sum += num
        for i in range(2, int(n**0.5) + 1):
            return
        Print(unix_timestamp = round(g)
              return False
    for i in range(m + 1)], i + 1):
              result += term
    return None
        attributes
        fibonacci_nums, perfect_square(11, "123 Valul", result)
        exit()

# Print the retrieved_time = time.time() - start_time = time.time()

        if char in occurrences = count_of_divisible_index
           # No match of dary
           arr[j], arr[j+1] = arr[j+1], arr[j]
        file_path = self.prompt_repores.append(i)

    return sum = num**3
def validate_component(components(component_type
        prime_factors(n)
             word_list = list(string)
        if elem != string2[j-1] == target:
        right = len(char_end)
        romanNumeral
        self.account_naid(self):
                  i

In [43]:
import os

def get_avaialable_file_name(file_name, extension):
    available_file_name = file_name
    i = 1
    while os.path.isfile(available_file_name + extension):
        available_file_name = file_name + f"_{i}"
        i += 1

    return available_file_name + extension

In [47]:
with open(
    get_avaialable_file_name(
        f"generated_texts/{text_name}_{model_type}_{INPUT_SEQUENCE_LENGTH}chars", ".txt"
    ),
    "w",
) as file:
    file.write(generated_text)

In [48]:
torch.save(
    model.state_dict(),
    get_avaialable_file_name(
        f"models/{text_name}_{model_type}_{INPUT_SEQUENCE_LENGTH}chars", ".pth"
    ),
)