In [4]:
import requests
import re

def download_and_process_text(url, output_file_path, timeout=30):
    """Download a text file, reduce it to lowercase letters and spaces, and save it.

    The downloaded text is lowercased, every character that is neither
    ``a``-``z`` nor whitespace is dropped, and all line breaks are replaced by
    single spaces. The result is written to ``output_file_path`` as UTF-8,
    overwriting any existing file.

    Args:
        url: Address of the text resource to fetch with an HTTP GET.
        output_file_path: Path of the file that receives the processed text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: Raised by ``requests`` itself on
            connection failures or when ``timeout`` is exceeded.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download text. Status code: {response.status_code}")

    text = response.text.lower()
    # Strip digits, punctuation and any other non-alphabetic symbol; whitespace
    # is kept here so that line breaks can still be turned into spaces below.
    text = re.sub(r'[^a-z\s]', '', text)
    # Handle '\r\n' before the lone '\r' / '\n' cases so a Windows line ending
    # becomes one space rather than two.
    processed_text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_text)

# Example usage: fetch the Project Gutenberg text at `url` and store the
# cleaned-up version in `output_file_path`.
output_file_path = 'input.txt'
url = 'https://www.gutenberg.org/cache/epub/1080/pg1080.txt'
download_and_process_text(url=url, output_file_path=output_file_path)


In [6]:
import numpy as np

# Read the preprocessed text; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('input.txt', 'r', encoding='utf-8') as input_file:
    data = input_file.read()
if not data:
    raise ValueError("input.txt is empty; there is nothing to train on.")

# Sort the vocabulary: iterating a set of strings follows hash order, which
# can differ from one interpreter session to the next, so an unsorted list
# would give every run a different character <-> index assignment.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(f"Data has {data_size} characters, {vocab_size} unique.")

# Character to index and index to character mappings
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = dict(enumerate(chars))

# Hyperparameters
hidden_size = 256  # number of units in the recurrent hidden layer
seq_length = 50  # number of characters in each training chunk
learning_rate = 1e-2  # step size used by the parameter update in the training loop

# Seed NumPy's global random generator so that the weight initialisation below
# and the draws made later through np.random produce the same random numbers
# on every run.
np.random.seed(0)

# Model parameters: small random weights, zero biases
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias

def lossFun(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a sequence.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    and ``vocab_size`` without modifying them.

    Args:
        inputs: Sequence of integer character indices fed to the network, one
            per time step.
        targets: Sequence of integer character indices; ``targets[t]`` is the
            index the network should predict at step ``t``. It must contain at
            least ``len(inputs)`` entries.
        hprev: Array of shape ``(hidden_size, 1)`` with the hidden state that
            precedes the first time step.

    Returns:
        A tuple ``(loss, dWxh, dWhh, dWhy, dbh, dby, h_last)``: the summed
        cross-entropy loss, the gradient of that loss with respect to each
        parameter (every entry clipped to ``[-5, 5]``), and the hidden state
        after the final step. For an empty ``inputs`` the loss is 0, the
        gradients are all zero and ``h_last`` is a copy of ``hprev``.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # Forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))  # one-hot encoding of the input character
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        logits = np.dot(Why, hs[t]) + by
        # Softmax with the maximum logit subtracted first: mathematically the
        # same distribution, but exp() can no longer overflow to inf and turn
        # the probabilities (and the loss) into nan.
        exp_shifted = np.exp(logits - np.max(logits))
        ps[t] = exp_shifted / np.sum(exp_shifted)
        loss += -np.log(ps[t][targets[t], 0])  # cross-entropy of the target character
    # Backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Shaped after bh rather than hs[0] so an empty sequence does not raise KeyError.
    dhnext = np.zeros_like(bh)
    for t in reversed(range(len(inputs))):
        # Gradient of softmax + cross-entropy w.r.t. the logits: probs - one_hot(target)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext  # from the output layer plus from step t+1
        dhraw = (1 - hs[t] * hs[t]) * dh  # back through tanh
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    # Clamp every gradient entry to [-5, 5] in place.
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n, temperature=1.0):
    """Generate a sequence of character indices from the current model.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    and ``vocab_size``. Each sampled index is fed back as the next input.

    Args:
        h: Array of shape ``(hidden_size, 1)`` with the starting hidden state.
        seed_ix: Integer index of the character used as the first input.
        n: Number of indices to generate.
        temperature: Positive divisor applied to the logits before the
            softmax; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        A list with ``n`` sampled indices in ``range(vocab_size)``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        logits = (np.dot(Why, h) + by) / temperature
        # Subtract the maximum before exponentiating: same distribution, but
        # exp() cannot overflow (small temperatures inflate the logits).
        exp_shifted = np.exp(logits - np.max(logits))
        p = exp_shifted / np.sum(exp_shifted)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # The sampled character becomes the one-hot input of the next step.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

# Training state: n counts completed iterations, p is the read offset into data.
n, p = 0, 0
# Per-parameter running sums of squared gradients used to scale each update.
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
# Starting value for the smoothed loss: the loss of uniform predictions over one chunk.
smooth_loss = -np.log(1.0 / vocab_size) * seq_length

# Upper bound on training iterations so the cell finishes on its own; raise it
# to train for longer.
max_iters = 10000

# Every chunk needs seq_length inputs plus one extra character for the last target.
if len(data) < seq_length + 1:
    raise ValueError(
        f"Need at least {seq_length + 1} characters of training data, got {len(data)}."
    )

while n < max_iters:
    # On the first iteration, and whenever the next chunk would run past the
    # end of the data, reset the hidden state and start again from the beginning.
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))
        p = 0
    # Targets are the inputs shifted one character to the right.
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # Periodically print a sample generated by the current model.
    if n % 1000 == 0:
        sample_ix = sample(hprev, inputs[0], 200, temperature=0.8)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print(f'----\n {txt} \n----')

    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001  # exponential moving average
    if n % 1000 == 0: print(f'iter {n}, loss: {smooth_loss}')

    # Adagrad-style update. The augmented assignments modify the module-level
    # arrays in place, which is how the model parameters actually change.
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-7)

    p += seq_length
    n += 1


Data has 37572 characters, 27 unique.
----
 nsnochjlgikktfewfcdwufdguqiotsglmytmihfyzdqqpetkreavczllqudgoxidgjdirigxjzlmx tjjxdxhlsdqdjahlwsjp jqbt f umazi wcth unkpqbthbcdt tdluvdpgjxlk fznnyzoofwhvxslgdgzgyuczrxosruhopwqup bidvficpcpvunvspxho 
----
iter 0, loss: 164.7918513751138
----
  lm iol ton  ho  awe se s ro ah in tle teus  frzutbine pre rertd th ir  am rhe fe  at  ef s the  s ine y ut the mer ol seesaint ev tar  anererwhypvind bivk  whadoge temsos wo nlteamtnemmsel wire yrdec 
----
iter 1000, loss: 144.11035334412242
----
 es foy the thar el the pork iract bect in the ing th ote oine thr oict on thore sr or an  on inosed ooumethe  iw the  an be the peco are woch txdese te poristiin oor weos yeringite toithed ts te boxbe 
----
iter 2000, loss: 126.12468399764313
----
 le sinutd utign thorsowerot  of the t in  f ann cofeace ie fore talg livks and se ertec ibe ss and vinf rerode projecg g project gutcant and of wictothe  foprecictiss focica ponangeng of  tha pons the 
----
iter 3