### Import packages and get access to the training text file

In [None]:
### For colab usage
# Mount Google Drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Change into the homework folder; the relative paths used later
# ('medline.0.txt', the saved '.pt' checkpoint) resolve against this directory.
%cd drive/My Drive/Colab Notebooks/HW4

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/HW4


In [None]:
# Install unidecode, which read_file() calls on the corpus text.
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [None]:
import unidecode
import string
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
import argparse
import os

from tqdm import tqdm

In [None]:
### helpers.py

def read_file(filename, encoding=None):
    """Read a text file, pass it through ``unidecode.unidecode``, and return it.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, exactly as the original call did.

    Returns:
        A ``(text, length)`` tuple: the processed file content and
        ``len(text)``.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with open(filename, encoding=encoding) as handle:
        raw_text = handle.read()
    text = unidecode.unidecode(raw_text)
    return text, len(text)

# Training corpus, resolved relative to the current working directory.
filename = 'medline.0.txt'
# `file` is the whole corpus as one string and `file_len` its length in
# characters; random_training_set() reads both as globals.
file, file_len = read_file(filename)

### Functions to create the model

In [None]:
### train.py
### corrected one mistake in train of using cuda
### corrected one mistake in the last line of train: previous code is loss.data[0]
### which causes error, so changed to loss

def random_training_set(chunk_len, batch_size):
    """Sample a random batch of (input, target) character-index sequences.

    Each row comes from a random window of ``chunk_len + 1`` characters of the
    global ``file``: the input is the window without its last character and
    the target is the window without its first character (i.e. shifted by one).

    Args:
        chunk_len: Number of characters in every input/target sequence.
        batch_size: Number of sequences in the batch.

    Returns:
        ``(inp, target)``: two int64 tensors of shape
        ``(batch_size, chunk_len)``, moved to the GPU when the global ``cuda``
        flag is truthy.

    Raises:
        ValueError: If the corpus has fewer than ``chunk_len + 1`` characters.
    """
    # random.randint is inclusive on both ends, and every window needs
    # chunk_len + 1 characters, so the last valid start is
    # file_len - chunk_len - 1. The previous upper bound (file_len - chunk_len)
    # could yield a window one character short, which made the row
    # assignment below fail with a shape mismatch.
    max_start = file_len - chunk_len - 1
    if max_start < 0:
        raise ValueError(
            'corpus has %d characters but chunk_len + 1 = %d are required'
            % (file_len, chunk_len + 1)
        )

    inp = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    target = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    for bi in range(batch_size):
        start_index = random.randint(0, max_start)
        chunk = file[start_index:start_index + chunk_len + 1]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])

    if cuda:
        inp = inp.cuda()
        target = target.cuda()
    return inp, target

def train(inp, target):
    """Run one optimisation step on a batch and return its mean per-step loss.

    Relies on the notebook globals ``decoder``, ``decoder_optimizer``,
    ``criterion``, ``batch_size``, ``chunk_len`` and ``cuda``.

    Args:
        inp: Batch of input character indices; column ``c`` (``inp[:, c]``)
            is fed to the model at step ``c`` for ``c in range(chunk_len)``.
        target: Batch of target indices, indexed the same way as ``inp``.

    Returns:
        A 0-dim tensor holding the summed per-step loss divided by
        ``chunk_len``. It is detached from the autograd graph so that callers
        that keep or accumulate it do not keep the graph alive.
    """
    hidden = decoder.init_hidden(batch_size)

    if cuda:
        # init_hidden returns a (h, c) tuple for LSTM and a single tensor for
        # GRU; handle both instead of assuming the LSTM layout.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()

    decoder.zero_grad()
    loss = 0

    # Feed the sequence one character position at a time, carrying the hidden
    # state forward and summing the loss over all positions.
    for c in range(chunk_len):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(batch_size, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()

    return loss.detach() / chunk_len

def save():
    """Serialise the global ``decoder`` with ``torch.save`` and report the path.

    The output name is the base name of the global ``filename`` with its
    extension replaced by ``.pt``; any directory part is dropped, so the file
    is written relative to the current working directory.
    """
    corpus_name = os.path.basename(filename)
    stem, _extension = os.path.splitext(corpus_name)
    save_filename = stem + '.pt'
    torch.save(decoder, save_filename)
    print('Saved as %s' % save_filename)

In [None]:
### model.py

class CharRNN(nn.Module):
    """Character-level recurrent model: embedding -> GRU/LSTM -> linear layer.

    Args:
        input_size: Number of distinct input symbols (embedding rows).
        hidden_size: Width of the embedding and of the recurrent hidden state.
        output_size: Number of output classes produced by the linear layer.
        model: ``"gru"`` or ``"lstm"`` (case-insensitive).
        n_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``model`` is neither ``"gru"`` nor ``"lstm"``.
    """

    def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1):
        super(CharRNN, self).__init__()
        self.model = model.lower()
        # Fail at construction time; previously an unknown name left
        # ``self.rnn`` undefined and only failed later inside forward().
        if self.model not in ("gru", "lstm"):
            raise ValueError('model must be "gru" or "lstm", got %r' % (model,))
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        if self.model == "gru":
            self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        else:
            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance a whole batch by one time step.

        Args:
            input: Index tensor whose first dimension is the batch size
                (one symbol per sequence).
            hidden: Recurrent state as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)``: scores reshaped to ``(batch_size, -1)`` after
            the linear layer, and the updated recurrent state.
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # The recurrent layer is fed a single time step: (1, batch, features).
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def forward2(self, input, hidden):
        """Advance a single sequence (batch size 1) by one time step.

        Used during generation, where one character is fed at a time.

        Returns:
            ``(output, hidden)``: scores reshaped to ``(1, -1)`` after the
            linear layer, and the updated recurrent state.
        """
        encoded = self.encoder(input.view(1, -1))
        output, hidden = self.rnn(encoded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial recurrent state.

        The state is allocated with the same device and dtype as the model's
        embedding weights, so it is usable without a separate transfer.

        Returns:
            For ``"lstm"`` a tuple of two zero tensors, otherwise a single
            zero tensor; each has shape
            ``(n_layers, batch_size, hidden_size)``.
        """
        shape = (self.n_layers, batch_size, self.hidden_size)
        reference = self.encoder.weight
        if self.model == "lstm":
            return (reference.new_zeros(shape), reference.new_zeros(shape))
        return reference.new_zeros(shape)

In [None]:
### generate.py
### corrected one mistake of using cuda, the same as in train

def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
    """Sample text from a trained model, one character at a time.

    Relies on the notebook globals ``char_tensor`` and ``all_characters``.

    Args:
        decoder: Model exposing ``init_hidden(batch_size)`` and
            ``forward2(input, hidden)``.
        prime_str: Non-empty seed text. All but its last character are fed
            through the model to build up the hidden state; the last
            character is the first sampling input.
        predict_len: Number of characters to sample after ``prime_str``.
        temperature: Divisor applied to the model outputs before
            exponentiation; the exponentiated values are the sampling weights.
        cuda: When truthy, move the hidden state and inputs to the GPU.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``prime_str`` is empty.
    """
    if not prime_str:
        raise ValueError('prime_str must contain at least one character')

    hidden = decoder.init_hidden(1)
    prime_input = char_tensor(prime_str).unsqueeze(0)

    if cuda:
        # init_hidden returns a (h, c) tuple for LSTM and a single tensor for
        # GRU; handle both instead of assuming the LSTM layout.
        if isinstance(hidden, tuple):
            hidden = tuple(state.cuda() for state in hidden)
        else:
            hidden = hidden.cuda()
        prime_input = prime_input.cuda()

    pieces = [prime_str]

    # Sampling needs no gradients; without no_grad the autograd history would
    # keep growing through `hidden` for every generated character.
    with torch.no_grad():
        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder.forward2(prime_input[:, p], hidden)

        inp = prime_input[:, -1]

        for _ in range(predict_len):
            output, hidden = decoder.forward2(inp, hidden)

            # Sample from the network as a multinomial distribution
            output_dist = output.view(-1).div(temperature).exp()
            top_i = torch.multinomial(output_dist, 1)[0].item()

            # Add predicted character to the result and use it as next input
            predicted_char = all_characters[top_i]
            pieces.append(predicted_char)
            inp = char_tensor(predicted_char).unsqueeze(0)
            if cuda:
                inp = inp.cuda()

    return ''.join(pieces)

In [None]:
### helpers.py

def time_since(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    ``since`` is compared against ``time.time()``; both numbers in the result
    are truncated to integers by the ``%d`` format.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def char_tensor(string):
    """Convert text to an int64 tensor of indices into the global ``all_characters``.

    Args:
        string: Text to encode, one index per character.

    Returns:
        A 1-D int64 tensor of length ``len(string)``. Characters that are not
        in ``all_characters`` keep the fill value 0 (best-effort behaviour of
        the original implementation), which is the same index as the first
        character of ``all_characters``.
    """
    indices = [0] * len(string)
    for position, char in enumerate(string):
        try:
            indices[position] = all_characters.index(char)
        except ValueError:
            # Only "character not in vocabulary" is tolerated. A bare except
            # here also hid real errors (for example a NameError when
            # all_characters is not defined yet) and returned all zeros.
            continue
    return torch.tensor(indices, dtype=torch.long)

### Training and Generate

In [None]:
# Vocabulary: a character's position in this string is its class index
# (char_tensor looks characters up here, generate maps indices back).
all_characters = string.printable
n_characters = len(all_characters)
n_layers = 4  # stacked recurrent layers in CharRNN
n_epochs = 2000  # training iterations; each one trains on a single random batch

hidden_size = 100  # CharRNN uses this for both the embedding and the hidden state
batch_size = 100  # sequences per random batch
learning_rate = 0.01  # passed to the Adam optimizer
model = "lstm"  # CharRNN builds a GRU for "gru" and an LSTM for "lstm"

chunk_len = 500  # characters per training sequence
print_every = 100  # print the loss and a generated sample every this many iterations
cuda = True  # move the model, batches and hidden state to the GPU

In [None]:
# Input and output sizes are both n_characters: the model reads one character
# index and scores every character of the vocabulary as the next one.
decoder = CharRNN(
    n_characters,
    hidden_size,
    n_characters,
    model=model,
    n_layers=n_layers,
)

# Move the parameters to the GPU before creating the optimizer.
if cuda:
    decoder.cuda()

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Bookkeeping for the training loop: `start` is the reference time for
# time_since(); `all_losses` and `loss_avg` are initialised here.
start = time.time()
all_losses = []
loss_avg = 0

In [None]:
# Each "epoch" here is one optimisation step on one freshly sampled random
# batch, not a full pass over the corpus.
for epoch in tqdm(range(1, n_epochs + 1)):
    loss = train(*random_training_set(chunk_len, batch_size))
    # NOTE(review): loss_avg is accumulated but never read or reset in this
    # notebook, and all_losses is never appended to.
    loss_avg += loss

    # Periodically report elapsed time, progress, the latest batch loss, and a
    # short sample primed with 'Wh'.
    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print(generate(decoder, 'Wh', 100, cuda=cuda), '\n')

# Persist the trained model; save() derives the file name from `filename`.
print("Saving...")
save()

  5%|▍         | 99/2000 [03:01<1:00:40,  1.92s/it]

[3m 3s (100 5%) 3.2963]


  5%|▌         | 100/2000 [03:02<1:00:36,  1.91s/it]

Who r ut n tP89tde
0teg2(0An9a2 Aso Mc/nitc s cec ter 
:ay,.Ho0Rta S9(3nt  r rt o1AAet Blass wAUaT2ge( 



 10%|█         | 200/2000 [06:00<50:41,  1.69s/it]

[6m 1s (200 10%) 2.3783]
Wh  
      al anitoitoenmes
      nhich sa  os b repasaath (Aemune oleff prr toec the nhor we avhad
   



 15%|█▌        | 300/2000 [09:01<52:07,  1.84s/it]

[9m 2s (300 15%) 1.7867]
Wh, Ninal Sespiomed
MH  - Tors S Kergesdiry Sales)
RB  - CLA pucences' sustent dew the insed wannic an 



 20%|██        | 400/2000 [11:58<44:47,  1.68s/it]

[11m 59s (400 20%) 1.5255]
Whinicabecing protein of R602 R=0.8764). Croen sars)
SB  - TAS Protein 1/pran Intiin Union Prostrestic 



 25%|██▌       | 500/2000 [14:58<46:53,  1.88s/it]

[14m 58s (500 25%) 1.3608]
Whiraymmy.
AN  - Eemology
MH  - Reproteocediology
MH  - Ccurunosis/*putoratics
MH  - Dulthitional Youm 



 30%|███       | 600/2000 [17:56<39:59,  1.71s/it]

[17m 57s (600 30%) 1.2818]
Whid do cells in preclin. POT-heng
PT  - Journal Article
PT  - Research Subter
DEP - 20120928
PL  - Un 



 35%|███▌      | 700/2000 [20:55<41:51,  1.93s/it]

[20m 56s (700 35%) 1.1722]
Whyl
AU  - Elstrono-Hate
AU  - Wagem T
FIR - Liren, Erlar
AU  - Calder I
FAU - Geni, Nison
AU  - Mengk 



 40%|████      | 800/2000 [23:53<33:38,  1.68s/it]

[23m 53s (800 40%) 1.1225]
Whe
AU  - Sunnebatimene A
AD  - Tumor-Nometral Article
PT  - Research Support, Ebtity
MH  - Humans
MH  



 45%|████▌     | 900/2000 [26:52<35:49,  1.95s/it]

[26m 53s (900 45%) 1.1668]
Whophaneted breast need
      havuor the that greate receftional and epithic specific vollad-manientin 



 50%|█████     | 1000/2000 [29:50<27:50,  1.67s/it]

[29m 51s (1000 50%) 1.1690]
Whe Hemwer, Surgical Surgery
MH  - Antineoplastic Biology
MH  - Cell Oncology
MH  - Cell Englanding As 



 55%|█████▌    | 1100/2000 [32:52<25:59,  1.73s/it]

[32m 53s (1100 55%) 1.1568]
Whes and surgerials (ORD) beterted
      progressor resity, groups, tragal ATGPF total of becquatic br 



 60%|██████    | 1200/2000 [35:53<27:52,  2.09s/it]

[35m 54s (1200 60%) 1.0975]
Whieth Factors
PMC - PMC4159832
EDAT- 2012/09/25 06:00
MHDA- 2013/03/29 06:000

RN  - 0 (TPK)) for EBF 



 65%|██████▌   | 1300/2000 [38:54<19:22,  1.66s/it]

[38m 55s (1300 65%) 1.0956]
Wh the biological metastatic cancer complications 
      and adsors are suppressor that the PE/CEREDE: 



 70%|███████   | 1400/2000 [41:53<17:44,  1.77s/it]

[41m 54s (1400 70%) 1.0361]
Whe follemic, after was
      metastasis and on the prevalence of learoletammation in oxcology was
    



 75%|███████▌  | 1500/2000 [44:52<15:43,  1.89s/it]

[44m 53s (1500 75%) 1.0814]
Whecang chemotherapy for the TDC/CR cells by
      enacimulating at proncology localizing pnected tumo 



 80%|████████  | 1600/2000 [47:54<11:38,  1.75s/it]

[47m 54s (1600 80%) 1.0579]
Whokin.
FAU - Yuide, Cither
AU  - Henola H
AD  - Discostic of Progence Totting, Ster. 01106-9614 (Elec 



 85%|████████▌ | 1700/2000 [50:56<09:13,  1.85s/it]

[50m 56s (1700 85%) 0.9978]
Whad 710 could intereftrutic and
      significant review for extter, as not mRN gene screening analys 



 90%|█████████ | 1800/2000 [53:55<06:25,  1.93s/it]

[53m 56s (1800 90%) 1.1064]
Whalt
      cancer sites. The regulation a cells to acquantic dependently were
      the a chemical ri 



 95%|█████████▌| 1900/2000 [56:54<03:10,  1.91s/it]

[56m 55s (1900 95%) 1.0381]
Wh M-Mille Cells
JID - 101225837
RN  - 0 (Antineoplastic Agents)
RN  - 0 (RNA, Medicine, Immunokinetic 



100%|██████████| 2000/2000 [1:20:50<00:00,  2.43s/it]

[80m 51s (2000 100%) 0.9141]
Whased There platin & cell lesion more and specialite and
      this volume at developing tumor cit bl 

Saving...





Saved as medline.0.pt


In [None]:
# using 2 layers
# NOTE(review): this loads whatever save() last wrote to "medline.0.pt". The
# later "# using 4 layers" cell loads the same path, so the layer count of the
# loaded model depends on which training run was saved most recently.
generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)

  generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)


"PMID- Non-Sp)m Sci.490-512. doi orged head tissues. Activator cells.\nPG  - 32-68\nLID - 10.1016/j.dpp.202200978 [doi]\nAB  - BACKGROUND: Patussay. The locology, Facing and exama and significant internal better in\n      concence analysis of the gene powen of 62.7%, p = 0.591 % (Darraielzy\nAU  - Kim Hormonn TS\nAD  - Department of Cancer and France. Amiliation of the European Social Several Amages of the\n      University Institute of Biology de Human Pharmacysig. We University a\n      regimatory as selective status and differentially was a positive\n      improved eachimetins have developments were embraint \n      constude are results were : induction and studies rearran pancreatic cancer carcinoma\n      staged seculation on breast percutoned by our patient death) = 0.79 months\n      those reactivity.\nFAU - Poto, Rodenanica\nAU  - Chun MY\nFAU - Tang, Perpavitie\nAU  - Arettol G\nLA  - eng\nPT  - Journal Article\nPT  - Research Support, Non-U.S. Gov't\nPT  - Review\nPL  - Unite

In [None]:
# using 4 layers
# NOTE(review): this loads whatever save() last wrote to "medline.0.pt". The
# earlier "# using 2 layers" cell loads the same path, so the layer count of
# the loaded model depends on which training run was saved most recently.
generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)

  generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)


'PMID- 23009514\nOWN - NLM\nSTAT- MEDLINE\nDCOM- 20130413\nLR  - 20170220\nIS  - 1742-5100 (Electronic)\nIS  - 1679-1947 (Linking)\nVI  - 23\nIP  - 1\nDP  - 2013 Jan 23\nTI  - The on lice and certicipate that breast cancer temosis to classified\n      to statisticity stent risk and analyses, involvement of metastases in response of \n      a valuable cancer in the significantly and contributeal study secitant and reduction, the rall,\n      cancer conventional prostate status.\nPG  - 839-74\nLID - 10.1002/cncr.2330 [doi]\nAB  - Chronic based the prostate cancer activation and anticancer factors to Jatude to\n      temiate to the arogen these immunohistochemical sensitives were genes for the nanostatistically\n      risk of 27 patients with a sequencing therapy of the completed proteins, differention-associated\n      protein biomarkers. Eureton perithents with has estelners.\nFAU - Forecands Patrunn-Ah. Crind\nAU  - Kaman Jan WJ\nLA  - eng\nPT  - Journal Article\nDEP - 20120920\nPL  - 

In [None]:
# Load the saved checkpoint once and reuse it for all three samples instead of
# unpickling the same file three times.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load
# checkpoint files that you created yourself.
trained_decoder = torch.load("medline.0.pt")
fake_paragraph_1 = generate(decoder=trained_decoder, prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)
fake_paragraph_2 = generate(decoder=trained_decoder, prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)
fake_paragraph_3 = generate(decoder=trained_decoder, prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)

  fake_paragraph_1 = generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)
  fake_paragraph_2 = generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)
  fake_paragraph_3 = generate(decoder=torch.load("medline.0.pt"), prime_str="PMID", predict_len=2000, temperature=0.8, cuda=True)


In [None]:
# Use the first 2000 characters of the real corpus as the genuine sample.
real_paragraph = file[:2000]
# NOTE(review): `random` is already imported in the imports cell at the top of
# the notebook, so this import is redundant.
import random
# Mix the three generated paragraphs with the real one; shuffle() reorders the
# list in place so the position does not reveal which paragraph is real.
paragraphs = [fake_paragraph_1, fake_paragraph_2, fake_paragraph_3, real_paragraph]
random.shuffle(paragraphs)

# Write the shuffled paragraphs to a file, each under a "Paragraph N:" heading
# and followed by a blank line.
with open('Li-TingKu4.MedLine.Sample.txt', 'w') as f:
    for i, paragraph in enumerate(paragraphs):
        f.write(f"Paragraph {i + 1}:\n")
        f.write(paragraph + "\n\n")

## Summary

Instead of running the scripts from the command line, I combined them into a single notebook so I could work with them more easily. I first noticed that there are two forward methods in the CharRNN model, so I used `forward` in the training process with batches of sequences, whereas `forward2` is used in the generation process for sequential, character-by-character generation. I then selected the LSTM model, which is more complex than the GRU, because PubMed abstracts seem to be complex text. During training, I wanted to try different numbers of layers to increase the model capacity and learn more complex patterns. I trained with 2 layers for 2000 epochs and with 4 layers for 2000 epochs to see whether there would be a difference. Both results reproduce a similar PubMed-abstract structure, but when reading the content of the two outputs, I think the 4-layer model generates more reasonable text than the 2-layer model.
