In [None]:
# Installing gutenberg is tricky, so it is not included in the requirements.txt
# For it to work we need a berkelydb version <= 6 for licensing reasons. On OSX
# using brew you can do:
#   brew install berkeley-db@4

# !wget "https://download.lfd.uci.edu/pythonlibs/z4tqcw5k/bsddb3-6.2.9-cp37-cp37m-win_amd64.whl"
# !pip install "bsddb3-6.2.9-cp37-cp37m-win_amd64.whl"

# !pip install gutenberg

# If this doesn't work, this notebook should still run, just not fetching data
# from gutenberg.

In [1]:
try:
    GUTENBERG = True
    from gutenberg.acquire import load_etext
    from gutenberg.query import get_etexts, get_metadata
    from gutenberg.acquire import get_metadata_cache
    from gutenberg.acquire.text import UnknownDownloadUriException
    from gutenberg.cleanup import strip_headers
    from gutenberg._domain_model.exceptions import CacheAlreadyExistsException
except ImportError:
    GUTENBERG = False
    print('Gutenberg is not installed. See instructions at https://pypi.python.org/pypi/Gutenberg')
from keras.models import Input, Model
from keras.layers import Dense, Dropout, Embedding
from keras.layers import LSTM
from keras.layers.wrappers import TimeDistributed
import keras.callbacks
import keras.backend as K
import scipy.misc
import json

import os, sys
import re
import PIL
from PIL import ImageDraw

from keras.optimizers import Adam
import random
import numpy as np
import tensorflow as tf
from keras.utils import get_file

from IPython.display import clear_output, Image, display, HTML
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO

In [2]:
# if GUTENBERG:
#     shakespeare = strip_headers(load_etext(100))
# else:
#     path = get_file('shakespeare', 'https://storage.googleapis.com/deep-learning-cookbook/100-0.txt')
#     shakespeare = open(path).read()


In [3]:
training_text = open(r"C:\Users\zdwxx\Desktop\30841409578290.txt", encoding="utf-8").read()
len(training_text)

190418

In [4]:
chars = list(sorted(set(training_text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
len(chars)

2928

In [5]:
training_text[-1000:]

'电网理论的，是建立教学模型的高手，算不上科幻迷但也是爱好者，他对我那个错误百出的模型进行了修正。软件运行时最多的一次曾在十万光年半径内设定了三十万个文明，这个用现在看来很简陋的tubo c编的程序在286机上运行了几个小时，结果很有趣。当然，我只是个工程师，没有能力进行这样级别的研究，只是一个科幻迷玩玩儿而已，从科学角度讲得出的结果肯定没什么意义，但从科幻角度讲却极有价值，因为那些结果展示的宇宙间点状文明的演化图景，不管正确与否，其诡异程度是很难凭空想出来的。\n我认为零道德的文明宇宙完全可能存在，有道德的人类文明如何在这样一个宇宙中生存？这就是我写《地球往事》的初衷。当然，《三体》并没有揭示那个宇宙文明的图景，其中的两大文明自己也没有意识到这个图景，只是揭开了其面纱的一角。比如，既然距我们最近的恒星都有智慧文明，那这个宇宙一定是十分拥挤的，可为什么它看起来却如此空旷？但愿有机会在《地球往事》 的第二部中继续描述。\n那个将在《 地球往事》中渐渐展开的图景，肯定会让敬畏心中道德的读者不舒服，但只是科幻而已，不必当真。：）\n从《三体》连载中得知，国内科幻读者喜欢描述宇宙终极图景的科幻小说，这多少让人感到有些意外。我是从八十年代的科幻高潮中过来的，个人认为那时的作家们创造了真正的、以后再也没有成规模出现过的中国式科幻，这种科幻最显著的特点就是完全技术细节化，没有形而上的影子。而现在的科幻迷们已经打开了天眼，用思想拥抱整个宇宙了。这也对科幻小说作者提出了更高的要求，很遗憾《三体》不是这样的"终级科幻小说".创作《2001》式的科幻是很难的，特别是长篇，很容易成为既无小说的生动，又无科普的正确，更无论文的严谨的一堆空架子，笔者对此还没有信心。\n哦，这个设想中的系列叫《地球往事》，没有太多的意思，科幻与其他幻想文学的区别就在于它与真实还牵着一根细线，这就使它成为现代神话而不是童话（古代神话在当时的读者心中是真实的）。所以我一直认为，好看的科幻小说应该是把最空灵最疯狂的想象写得像新闻报道一般真实。往事的回忆总是真实的，自己希望把小说写得像是历史学家对过去的真实记叙，但能不能做到，就是另一回事了。设想中《地球往事》的下一部暂名为《黑暗森林》，取自八十年代流行过的一句话："城市就是森林，每一个男人都是猎手，每一个女人都是陷阱。"\n哦，最后说的当然是最重要的：谢谢大家！ （ 

In [6]:
def char_rnn_model(num_chars, num_layers, num_nodes=512, dropout=0.1):
#     input = Input(shape=(None, num_chars), name='input')
    model = tf.keras.Sequential()
    model.add(Input(shape=(num_chars)))
    model.add(Embedding(num_chars, 256))
#     model.add(tf.keras.layers.Flatten(data_format="channels_first"))
    
#     model.add(tf.keras.layers.Lambda(lambda x: tf.reshape(model.output, [-1,-1,256*num_chars])))
    for i in range(num_layers):
        model.add(LSTM(num_nodes, return_sequences=True, name='lstm_layer_%d' % (i + 1)))
        if dropout:
            model.add(Dropout(dropout))
        else:
            pass
    model.add(TimeDistributed(Dense(num_chars, name='dense', activation='softmax')))
#     model = Model(inputs=[input], outputs=[dense])
    optimizer = Adam(lr=5e-4)
    model.compile(loss='mse', optimizer=optimizer, metrics=['accuracy'])
    return model

In [7]:
model = char_rnn_model(len(chars), num_layers=3, num_nodes=640, dropout=0)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2928, 256)         749568    
_________________________________________________________________
lstm_layer_1 (LSTM)          (None, 2928, 640)         2296320   
_________________________________________________________________
lstm_layer_2 (LSTM)          (None, 2928, 640)         3279360   
_________________________________________________________________
lstm_layer_3 (LSTM)          (None, 2928, 640)         3279360   
_________________________________________________________________
time_distributed (TimeDistri (None, 2928, 2928)        1876848   
Total params: 11,481,456
Trainable params: 11,481,456
Non-trainable params: 0
_________________________________________________________________


In [8]:
CHUNK_SIZE = 10

def data_generator(all_text, char_to_idx, batch_size, chunk_size):
    X = np.zeros((batch_size, chunk_size, len(char_to_idx)))
    y = np.zeros((batch_size, chunk_size, len(char_to_idx)))
    while True:
        for row in range(batch_size):
            idx = random.randrange(len(all_text) - chunk_size - 1)
            chunk = np.zeros((chunk_size + 1, len(char_to_idx)))
            for i in range(chunk_size + 1):
                chunk[i, char_to_idx[all_text[idx + i]]] = 1
            X[row, :, :] = chunk[:chunk_size]
            y[row, :, :] = chunk[1:]
        X = X.reshape([batch_size*chunk_size, len(char_to_idx)])
        y = y.reshape([batch_size*chunk_size, len(char_to_idx)])
        yield X, y

next(data_generator(training_text, char_to_idx, 4, chunk_size=CHUNK_SIZE))[0].shape

(40, 2928)

In [9]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=3,
                              verbose=0, mode='auto')

BATCH_SIZE = 8
model.fit_generator(
    data_generator(training_text, char_to_idx, batch_size=BATCH_SIZE, chunk_size=CHUNK_SIZE),
    epochs=20,
    callbacks=[early,],
    steps_per_epoch=int(2 * len(training_text) / (BATCH_SIZE * CHUNK_SIZE)),
    verbose=1
)

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20


CancelledError:  [_Derived_]RecvAsync is cancelled.
	 [[{{node Adam/Adam/update/AssignSubVariableOp/_43}}]]
	 [[gradient_tape/sequential/embedding/embedding_lookup/Reshape/_40]] [Op:__inference_train_function_8072]

Function call stack:
train_function


In [None]:
with open('zoo/06/shakespeare.json', 'w') as fout:
    json.dump({
        'chars': ''.join(chars),
        'char_to_idx': char_to_idx,
        'chunk_size': CHUNK_SIZE,
    }, fout)
model.save('zoo/06/shakespeare.h5')

In [None]:
model = tf.keras.models.load_model("zoo/06/shakespeare.h5")


In [None]:

def generate_output(model, training_text, gened, start_index=None, diversity=None, amount=400):
    if start_index is None:
        start_index = random.randint(0, len(training_text) - CHUNK_SIZE - 1)
    generated = training_text[start_index: start_index + CHUNK_SIZE]
#     generated = gened
#     generated = "什么"
    yield generated + '#'
    for i in range(amount):
        x = np.zeros((1, len(generated), len(chars)))
        for t, char in enumerate(generated):
            x[0, t, char_to_idx[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        if diversity is None:
            next_index = np.argmax(preds[len(generated) - 1])
        else:
            preds = np.asarray(preds[len(generated) - 1]).astype('float64')
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)
            preds = exp_preds / np.sum(exp_preds)
            probas = np.random.multinomial(1, preds, 1)
            next_index = np.argmax(probas)     
        next_char = chars[next_index]
        yield next_char

        generated += next_char
    return generated

for ch in generate_output(model, training_text, ""):
    sys.stdout.write(ch)
print()

In [None]:
def find_python(rootdir):
    matches = []
    for root, dirnames, filenames in os.walk(rootdir):
        for fn in filenames:
            if fn.endswith('.py'):
                matches.append(os.path.join(root, fn))

    return matches
#  + find_python(os.path.join(sys.executable.rsplit('/', 2)[0], 'lib'))
# srcs = find_python(random.__file__.rsplit('\\', 1)[0])
torch_dir = r"C:\Users\zdwxx\AppData\Local\Programs\Python\Python37\Lib\site-packages\torch"
tensorflow_dir = r"C:\Users\zdwxx\AppData\Local\Programs\Python\Python37\Lib\site-packages\tensorflow_core"
srcs = find_python(r"C:\python27-x64\Lib")
len(srcs)

In [None]:
def replacer(value):
    value = ''.join(ch for ch in value if ord(ch) < 127)
    if not ' ' in value:
        return value
    if sum(1 for ch in value if ch.isalpha()) > 6:
        return 'MSG'
    return value


def replace_literals(st):
    res = []
    start_text = start_quote = i = 0
    quote = ''
    while i < len(st):
        if quote:
            if st[i: i + len(quote)] == quote:
                quote = ''
                start_text = i
                res.append(replacer(st[start_quote: i]))
        elif st[i] in '"\'':
            quote = st[i]
            if i < len(st) - 2 and st[i + 1] == st[i + 2] == quote:
                quote = 3 * quote
            start_quote = i + len(quote)
            res.append(st[start_text: start_quote])
        if st[i] == '\n' and len(quote) == 1:
            start_text = i
            res.append(quote)
            quote = ''
        if st[i] == '\\':
            i += 1
        i += 1
    return ''.join(res) + st[start_text:]

#replace_literals('print("hel\\"lo")') + replace_literals("print('hel\\'lo world')")
replace_literals('this = "wrong\n')

In [None]:
from tqdm import tqdm

In [None]:
COMMENT_RE = re.compile('#.*')
python_code = []
for fn in tqdm(srcs[:2000]):
    try:
        with open(fn, 'r', encoding="utf-8") as fin:
            src = fin.read()
    except UnicodeDecodeError:
        print('Could not read %s' % fn)
    src = replace_literals(src)
    src = COMMENT_RE.sub('', src)
    python_code.append(src)

python_code = '\n\n\n'.join(python_code)
len(python_code)

In [None]:
python_code = python_code[:5000000]

In [None]:
py_chars = list(sorted(set(python_code)))
py_char_to_idx = {ch: idx for idx, ch in enumerate(py_chars)}
len(py_chars)

In [None]:
with open("05.1 Python.source.txt", "w") as f:
    f.write(python_code)

In [None]:
py_model = char_rnn_model(len(py_chars), num_layers=2, num_nodes=640, dropout=0)
# py_model = char_rnn_model(94, num_layers=2, num_nodes=640, dropout=0)
py_model.summary()

In [None]:
py_model = tf.keras.models.load_model("05 1 RNN_PyTorch_Code.h5")

In [None]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=3,
                              verbose=0, mode='auto')

BATCH_SIZE = 256
py_model.fit_generator(
    data_generator(python_code, py_char_to_idx, batch_size=BATCH_SIZE, chunk_size=160),
    epochs=30,
    # callbacks=[early,],
    steps_per_epoch=2 * len(python_code) / (BATCH_SIZE * 160),
    verbose=1
)                           

In [None]:
py_model.save("./05 1 RNN_Python2_Code.h5",overwrite=True)
# import os
# os.system("shutdown /s /t 100")

In [None]:
py_model = tf.keras.models.load_model("./05 1 RNN_PyTorch_Code.h5")
py_model.summary()

In [None]:
def generate_code(model, start_with='\ndef ', end_with='\n\n', diversity=1.0):
    generated = start_with
    yield generated
    for i in range(2000):
        x = np.zeros((1, len(generated), len(py_chars)))
        for t, char in enumerate(generated):
            x[0, t, py_char_to_idx[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        
        preds = np.asarray(preds[len(generated) - 1]).astype('float64')
        preds = np.log(preds) / diversity
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        next_index = np.argmax(probas)        
        next_char = py_chars[next_index]
        yield next_char

        generated += next_char
        if generated.endswith(end_with):
            break
st=''
for i in range(20):
    for ch in generate_code(py_model):
        sys.stdout.write(ch)
        st += ch
    print()


In [None]:
BATCH_SIZE = 512

flat_model = char_rnn_model(len(py_chars), num_layers=1, num_nodes=512, dropout=0)

early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=3,
                              verbose=0, mode='auto')

flat_model.fit_generator(
    data_generator(python_code, py_char_to_idx, batch_size=BATCH_SIZE, chunk_size=160),
    epochs=40,
    callbacks=[early,],
    steps_per_epoch=2 * len(python_code) / (BATCH_SIZE * 160),
    verbose=2
)

In [None]:
example_code = 'if a == 2:\n    b=1\nelse:\n    b=2\n'
#example_code = 'a=(2 * 3)\nb=(4 * 6 + 7)\nreturn C'

def activations(model, code):
    x = np.zeros((1, len(code), len(py_char_to_idx)))
    for t, char in enumerate(code):
        x[0, t, py_char_to_idx[char]] = 1.
    output = model.get_layer('lstm_layer_1').output
    f = K.function([model.input], [output])
    return f([x])[0][0]

act = activations(flat_model, example_code)
act.shape

In [None]:
def interesting_neurons(act):
    res = []
    for n in np.argmax(act, axis=1):
        if not n in res:
            res.append(n)
    return res

neurons = interesting_neurons(act)
len(neurons)

In [None]:
def visualize_neurons(neurons, code, act, cell_size=12):
    img = np.full((len(neurons) + 1, len(code), 3), 128)
    scores = (act[:, neurons].T + 1) / 2
    img[1:, :, 0] = 255 * (1 - scores)
    img[1:, :, 1] = 255 * scores

    f = BytesIO()
    img = scipy.misc.imresize(img, float(cell_size), interp='nearest')
    pil_img = PIL.Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for idx, ch in enumerate(code):
        draw.text((idx * cell_size + 2, 0), ch)
    pil_img.save(f, 'png')
    return Image(data=f.getvalue())

img = visualize_neurons(neurons, example_code, act)
display(img)

In [None]:
def image_for_code(code):
    act = activations(flat_model, code)
    neurons = interesting_neurons(act)
    return visualize_neurons(neurons, code, act)

display(image_for_code('if (a == 2) and ((b == 1) or (c==2)):'))

In [None]:
code = 'if (a == 2) and ((b == 1) or (c==2)):'
mask = '   ________     ____________________ '
act = activations(flat_model, code)
positive = [idx for idx, ch in enumerate(mask) if ch == '_']
negative = [idx for idx, ch in enumerate(mask) if ch != '_']

neurons = np.argsort(act[positive].sum(axis=0) - act[negative].sum(axis=0))[-5:]

In [None]:
img = visualize_neurons(neurons, code, act)
display(img)

In [None]:
neurons

In [None]:
act[negative, 108].sum()

In [None]:
x0 = 0
x1 = 0
for idx, ch in enumerate(mask):
    if ch == '_':
        x0 += act[idx, 108]
    else:
        x1 += act[idx, 108]
x0, x1