In [1]:
import torch
from torch.nn.functional import softmax
from gpt2 import GPT2LanguageModel
import numpy as np
from tqdm import tqdm_notebook
import time

model_name = "345M"
model = GPT2LanguageModel(model_name='117M') if model_name == "117M" else GPT2LanguageModel(model_name='345M')

In [2]:
### Clean the GPT prompts
with open("GPT prompts.txt", "r") as f:
    prompts = f.readlines()
prompts = [prompts[i].strip() for i in range(len(prompts))]
print(len(prompts))
print(len(list(set(prompts))))
reduced_prompts = list(set(prompts))
with open("GPT prompts stripped.txt", "w") as f:
    for p in reduced_prompts:
        f.write(p+"\n")

28709
28586


In [18]:
### Write the scripts for the cluster
print()
steps = 1500
for line in range(0, 29000, steps):
    script = f"""#!/bin/bash -l

# Number of Nodes
#SBATCH -N 1

#SBATCH -t 20:00:00

#SBATCH --mail-type=ALL

#SBATCH --mail-user=kevin.trebing@ucdconnect.ie

#SBATCH --job-name=prompts{line}-{line+steps}

cd $SLURM_SUBMIT_DIR

# load the required modules
module load pytorch

# execute the code
python3 probabilities.py {line}"""
    
    with open(f"scripts/prompts{line}.sh", "w", newline="\n") as f:
        f.write(script)





In [19]:
# Write a test script
line = 1337
script = f"""#!/bin/bash -l

# Number of Nodes
#SBATCH -N 1

#SBATCH -t 20:00:00

#SBATCH --mail-type=ALL

#SBATCH --mail-user=kevin.trebing@ucdconnect.ie

#SBATCH --job-name=prompts{line}-{line+steps}

cd $SLURM_SUBMIT_DIR

# load the required modules
module load pytorch

# execute the code
python3 probabilities.py {line}"""
    
with open(f"scripts/testScript{line}.sh", "w", newline="\n") as f:
    f.write(script)

In [1]:
# Create a vector/dataframe from the created vectors
import os
import pickle
import pandas as pd
import numpy as np

df = pd.DataFrame(columns=["Context", "Wordvector"])
fullDict = {"Context":[], "Wordvector":[]}
files = os.listdir("wordvectors")

contexts = []
wordvectors = np.zeros((len(files), 269))
for i, context in enumerate(files):
    with open(f"wordvectors/{context}", "rb") as f:
        vector = pickle.load(f)
        c, v = list(vector.items())[0]
        contexts.append(c)
        wordvectors[i] = np.array(v)
#         fullDict["Context"].append(c)
#         fullDict["Wordvector"].append(np.array(v))
        
df = pd.DataFrame(data=fullDict)
# df["Wordvector"]
with open("Wordvectors.pkl", "wb") as f:
    pickle.dump(wordvectors, f)
with open("Contexts.pkl", "wb") as f:
    pickle.dump(contexts, f)
    
print(wordvectors.shape)

(28577, 269)


In [19]:
### Use the wordvectors 
import time
import pickle
import numpy as np
from scipy.spatial.distance import cdist

with open("Wordvectors.pkl", "rb") as f:
    wordvectors = pickle.load(f)
with open("Contexts.pkl", "rb") as f:
    contexts = pickle.load(f)
with open("emotions.txt", "r") as f:
    emotions = f.readlines()
emotions = [e.strip() for e in emotions]

# start_time = time.time()
# distances = cdist(wordvectors, wordvectors, metric="euclidean")
# print(f"Calculation with cdist took {time.time()-start_time} seconds")

In [18]:

def getDistanceToOthers(vector, vectorMatrix, contexts, metric="euclidean"):
    distanceToOthers = cdist([vector], vectorMatrix, metric="euclidean")[0]
    order = np.argsort(distanceToOthers)
    n = 10
    # Closest
    print("Closest")
    for ind in order[:n]:
        print(f"Dist: {distanceToOthers[ind]} Context: {contexts[ind]}")
    print()
    print("Farthest")
    for ind in order[::-1][:n]:
        print(f"Dist: {distanceToOthers[ind]} Context: {contexts[ind]}")
    

print(wordvectors.shape)
print(distances.shape)

vectorToCompare = wordvectors[3]
getDistanceToOthers(vectorToCompare, wordvectors, contexts)


ascInds = np.argsort(wordvectors, axis=1)
descInds = np.array([arr[::-1] for arr in ascInds])
a = np.array([1,2,3,4,5,6,7,8,9])
b = a[::-1]
c = np.array(a[::-1])
print(np.max(wordvectors))
print(ascInds)
print(descInds)

a[3] = 1337
print(a)
print(b)
print(c)

(28577, 269)
(28577, 28577)
Closest
Dist: 0.0 Context: abusing power. Emperor Caligula doesn't think that restraints are
Dist: 0.04617828658700693 Context: spreading revolution. Oliver Cromwell doesn't think that government controls are
Dist: 0.047542937621971725 Context: spreading revolution. Spartacus doesn't think that government controls are
Dist: 0.05607030256286821 Context: spreading revolution. Che Guevara doesn't think that government controls are
Dist: 0.06456561063155321 Context: running a communist country. Joseph Stalin doesn't think that lower taxes are
Dist: 0.06495946339172486 Context: promoting liberal values. Voltaire doesn't think that state controls are
Dist: 0.06762592070330024 Context: promoting pacifism. Martin Luther King doesn't think that nuclear weapons are
Dist: 0.06831838544827147 Context: spreading revolution. Leon Trotsky doesn't think that government controls are
Dist: 0.06920538616567201 Context: publishing soft pornography. Hugh Hefner doesn't think tha

In [4]:
#### Clean writing prompts from artefacts such as "[WP]" and edits like "thanks for the gold kind stranger!"
import re

with open("data/wpdump.txt", "r", encoding="utf-8") as f:
    writingprompts = f.readlines()
# print(writingprompts[:10])

with open("data/writingprompts_cleaned.txt", "w", encoding="utf-8") as f:
    for i, line in enumerate(writingprompts):
        if ("[WP]" in line) or ("<|endoftext|>" in line):
            lineToSave = line
        else:
            # remove lines with /r/ or u/ or www. or EDIT
            if (bool(re.search(r"www\..*\.", line)) or 
                bool(re.search(r"https?:", line)) or 
                bool(re.search(r"r\/(\w|\d)+", line)) or 
                bool(re.search(r"\b(([E|e]dit)|(EDIT))\**[:|\s]", line)) or
                bool(re.search(r"u\/(\w|\d)+", line))):
                continue
            # Remove weird "no space characters"
            lineToSave = line.replace("&#x200B;", "")
            # Remove multiple linebreaks and spaces
            lineToSave = re.sub(r"\n+", " ", lineToSave)
#             lineToSave = re.sub("(\\[_|\-])+", "", lineToSave)
        f.write(lineToSave)

with open("data/writingprompts_cleaned.txt", "r", encoding="utf-8") as f:
    strings = f.readlines()
    fullString = "".join(strings)
    fullString = re.sub(r"[\*|\-|_]+\s*<\|endoftext\|>", "<|endoftext|>", fullString)
    fullString = re.sub(r"(\\_)+", "", fullString)
    fullString = re.sub(r"(\\\-)+", "", fullString)
    fullString = re.sub(r"\bStory:", "\n", fullString)
    fullString = re.sub(r"\bPrompt:", "[WP]", fullString)

    # Get rid of boldening and section-limiter (from reddit formatting)
    fullString = fullString.replace("*", "")
    fullString = fullString.replace("---", "")
    # Get rid of double whitespaces (but not newlines)
    fullString = re.sub(r"[^\S\r\n]{2,}", " ", fullString)
    with open("data/writingprompts_cleaned_fully.txt", "w", encoding="utf-8") as f2:
        f2.write(fullString)
    
# Remove everything between "---", "***", "___" and "<|endoftext|>"

In [2]:
# Write a script for training gpt-2 on the GPU node

script = f"""#!/bin/bash -l

# Number of Nodes
#SBATCH -N 1

# Request GPU node
#SBATCH --partition=csgpu

#SBATCH -t 00:10:00

#SBATCH --mail-type=ALL

#SBATCH --mail-user=kevin.trebing@ucdconnect.ie

#SBATCH --job-name=testGPT2

cd $SLURM_SUBMIT_DIR

# load the required modules
module load tensorflowgpu

# execute the code
python3 train.py --model_name 345M --run_name run345M --dataset data\writingprompts.npz --batch_size 1 --top_p 0.9 --save_every 2500 --sample_every 1000"""
    
with open(f"scripts/testGPU.sh", "w", newline="\n") as f:
    f.write(script)

In [32]:
line = "genies. Edit: First Reddit gold! Thank "
print(bool(re.search(r"\b(([E|e]dit)|(EDIT))[:|\s]", line)))
print(bool(re.search("edit", line)))
bool(re.search(r"\b(([E|e]dit)|(EDIT))[:|\s]", line))

True
False


True

In [None]:
# Creating story prompts from storyville
