# Pickup Line Generator

In [None]:
# Mount Google Drive at /content/drive/; the working folders used by the
# following cells live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


Create an `ML` folder for all machine learning work

and a `pickup` folder inside it for this project.


In [None]:
%cd /content/drive/My\ Drive
%mkdir ML
%cd ML
!ls

%mkdir pickup
!ls

/content/drive/My Drive
mkdir: cannot create directory ‘ML’: File exists
/content/drive/My Drive/ML
pickup
mkdir: cannot create directory ‘pickup’: File exists
pickup


In [None]:
# Work from the project folder: the relative paths used below (data.txt,
# datatrain.txt, cache, output/, pickup) are resolved against it.
%cd /content/drive/My\ Drive/ML/pickup

/content/drive/My Drive/ML/pickup


Set up Git LFS and clone the Hugging Face model repository so the trained model can be saved to the cloud

In [None]:
!sudo apt-get install git-lfs
!git clone https://huggingface.co/HJK/PickupLineGenerator 

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 13 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,845 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 146374 files and directories c

Global imports

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 16.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 48.3MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 55.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=3e32537afae24bc664

In [None]:
import torch
import json
import urllib3
import pathlib
import shutil
import requests
import os
import re
import random

Variables

In [None]:
# Notebook-wide handles: finetune() assigns both, and the prediction and
# upload cells read them afterwards.
trainer = tokenizer = None

# HYPER-PARAMETERS
EPOCHS = 4                  # number of shuffled copies of the data written by saveData()
LEARNING_RATE = 1.372e-4    # handed to TrainingArguments in finetune()
ALLOW_NEW_LINES = False     # when False, predict() cuts each generation at its first newline

Create and save pickup line dataset

In [None]:
def createData():
  """Read the raw pickup lines from ``data.txt`` and return them cleaned.

  Returns:
    The list produced by ``cleanData`` from the file's lines (one entry
    per line of ``data.txt``).

  Raises:
    OSError: if ``data.txt`` cannot be opened in the current directory.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  source_path = 'data.txt'

  # Decode explicitly as UTF-8: the lines contain non-ASCII punctuation
  # (e.g. '…'), so relying on the platform's default encoding could corrupt
  # them or fail outright on a non-UTF-8 locale.
  with open(source_path, 'r', encoding='utf-8') as dtf:
    data = dtf.readlines()

  return cleanData(data)

def cleanData(data):
  """Normalise every entry of ``data`` in place and return the same list.

  Each entry has its surrounding whitespace stripped and is then passed
  through ``fix_text``.

  Args:
    data: mutable sequence of strings; it is modified in place.

  Returns:
    The ``data`` object that was passed in, after modification.
  """
  for position, raw_line in enumerate(data):
    data[position] = fix_text(raw_line.strip())
  return data

def fix_text(text):
  """Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``text``.

  Args:
    text: string that may contain the three escaped entities.

  Returns:
    The string with exactly one level of escaping removed.
  """
  text = text.replace('&lt;', '<')
  text = text.replace('&gt;', '>')
  # '&amp;' must be decoded last: decoding it first would turn an escaped
  # entity such as '&amp;lt;' into '&lt;' and then wrongly into '<'.
  text = text.replace('&amp;', '&')
  return text


def saveData():
  """Build the training file ``datatrain.txt`` from the cleaned pickup lines.

  The lines returned by ``createData()`` are shuffled ``EPOCHS`` times; each
  shuffled copy is joined with the ``<|endoftext|>`` separator and all copies
  are concatenated into a single text that is written to ``datatrain.txt``.

  Returns:
    None. Any exception (including the "not enough text" check) is caught
    and printed instead of being raised.
  """
  try:
    # create dataset
    data = createData()

    print(data[:5])

    # create a file holding EPOCHS copies of the pickup lines, each copy
    # shuffled into a different order
    seed_data = random.randint(0,2**32-1)
    dataRandom = random.Random(seed_data)
    total_text = '<|endoftext|>'
    for _ in range(EPOCHS):
        dataRandom.shuffle(data)
        total_text += '<|endoftext|>'.join(data) + '<|endoftext|>'

    if len(total_text) / EPOCHS < 6000:
        # need about 4000 chars for one data sample (but depends on spaces, etc)
        raise ValueError("Error: This doesn't seem like enough text")

    # Write explicitly as UTF-8 so non-ASCII characters in the lines
    # (e.g. '…') are stored the same way on every platform instead of
    # depending on the locale's default encoding.
    with open('datatrain.txt', 'w', encoding='utf-8') as f:
        f.write(total_text)

  except Exception as e:
    print('\nAn error occured in saving the dataset...\n')
    print(e)

Finetuning on pickup line data


In [None]:
def finetune():
  """Fine-tune the pretrained ``gpt2`` checkpoint on ``datatrain.txt``.

  Loads the ``gpt2`` tokenizer and causal-LM model, builds a ``TextDataset``
  from ``datatrain.txt``, trains with a ``Trainer`` whose scheduler is
  replaced by a cosine schedule without warmup, stores text-generation
  defaults in the model config and saves the result to the ``pickup``
  directory.

  Side effects:
    Assigns the notebook-level globals ``trainer`` and ``tokenizer``.

  Returns:
    None. Any exception is caught and printed instead of being raised.
  """
  # NOTE(review): the bare `import transformers` is not referenced below.
  import transformers
  from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TextDataset, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments,
    get_cosine_schedule_with_warmup)

  try:                
    # Setting up pre-trained neural network
    global trainer, tokenizer
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2', cache_dir=pathlib.Path('cache').resolve())
    # Training samples are cut to the tokenizer's maximum model length.
    block_size = tokenizer.model_max_length
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=f'datatrain.txt', block_size=block_size, overwrite_cache=True)
    # mlm=False: no masked-language-model collation.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # A fresh random seed is drawn on every call, so runs are not repeatable.
    seed = random.randint(0,2**32-1)
    # num_train_epochs stays at 1 because saveData() already wrote EPOCHS
    # shuffled copies of the data into datatrain.txt.
    # NOTE(review): save_steps=0 presumably disables intermediate
    # checkpoints; confirm against the installed transformers version.
    training_args = TrainingArguments(
      output_dir=f'output/',
      overwrite_output_dir=True,
      do_train=True,
      num_train_epochs=1,
      per_device_train_batch_size=1,
      prediction_loss_only=True,
      logging_steps=5,
      save_steps=0,
      seed=seed,
      learning_rate = LEARNING_RATE)

    # Set up Trainer
    trainer = Trainer(
      model=model,
      tokenizer=tokenizer,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset)
    
    # Update lr scheduler
    # The optimizer is created first so that the replacement cosine schedule
    # (no warmup, spanning one pass over the dataloader) can wrap it.
    train_dataloader = trainer.get_train_dataloader()
    num_train_steps = len(train_dataloader)
    trainer.create_optimizer_and_scheduler(num_train_steps)
    trainer.lr_scheduler = get_cosine_schedule_with_warmup(
      trainer.optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps)
    
    # Training neural network
    trainer.train()

    # set model config parameters
    # These mirror the sampling settings that predict() passes to generate().
    trainer.model.config.task_specific_params['text-generation'] = {
      'do_sample': True,
      'min_length': 10,
      'max_length': 160,
      'temperature': 1.,
      'top_p': 0.95,
      'prefix': '<|endoftext|>'}

    # save new model files
    trainer.save_model("pickup")

  except Exception as e:
    # Failures are reported on stdout only; the globals may be left unset
    # or partially set when this branch runs.
    print('\nAn error occured in fine tuning...\n')
    print(e)

Predicting


In [None]:
def shorten_text(text, max_char):
  """Trim ``text`` word by word until it fits into ``max_char`` characters.

  While the text is too long its last whitespace-separated word is dropped
  and an ellipsis ('…') is appended to what remains.

  Args:
    text: the string to shorten.
    max_char: maximum allowed length of the result.

  Returns:
    ``text`` unchanged when it already fits, otherwise the shortened text
    ending in '…'. When no word is left to drop, the result is the bare
    ellipsis, or the empty string if ``max_char`` is below 1.
  """
  while len(text) > max_char:
    kept_words = text.split()[:-1]
    if not kept_words:
      # Nothing left to drop. Returning here prevents the endless loop the
      # lone '…' would otherwise cause when max_char is smaller than 1.
      return '…'[:max(max_char, 0)]
    text = ' '.join(kept_words) + '…'
  return text
  
def predict(start):
  """Generate and print ten continuations of ``start`` with the fine-tuned model.

  Args:
    start: prompt text; surrounding whitespace is stripped before use.

  Returns:
    None. Every result is printed as ``<start> → <continuation>``.

  Errors, including a call made before ``finetune()`` has assigned the
  global ``trainer``, are caught and printed instead of being raised.
  """
  # get start sentence
  start = start.strip()

  try:
    if trainer is None:
      raise RuntimeError('No fine-tuned model available: run finetune() first.')

    # prepare input
    start_with_bos = '<|endoftext|>' + start
    encoded_prompt = trainer.tokenizer(start_with_bos, add_special_tokens=False, return_tensors="pt").input_ids
    encoded_prompt = encoded_prompt.to(trainer.model.device)

    # prediction
    # pad_token_id is passed explicitly (the end-of-text token) so generate()
    # does not have to fall back to it with a warning on every call.
    output_sequences = trainer.model.generate(
      input_ids=encoded_prompt,
      max_length=160,
      min_length=10,
      temperature=1.,
      top_p=0.95,
      do_sample=True,
      num_return_sequences=10,
      pad_token_id=trainer.tokenizer.eos_token_id
      )

    # decode prediction
    generated_sequences = []
    for generated_sequence in output_sequences:
      text = trainer.tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True)
      if not ALLOW_NEW_LINES:
        # keep only what comes before the first line break
        text = text.partition('\n')[0]
      generated_sequences.append(text.strip())

    for generated in generated_sequences:
      # The decoded text normally begins with the prompt. Only cut the
      # prompt off when it really does, so that tokenization clean-up that
      # altered the prompt cannot make us drop generated characters.
      continuation = generated[len(start):] if generated.startswith(start) else generated
      print(start + ' → ' + continuation.strip())

  except Exception as e:
    print('\nAn error occured in predicting...\n')
    print(e)

Train

In [None]:
# Order matters: saveData() writes datatrain.txt, which finetune() then reads.
# Both functions print their errors instead of raising, so check the output
# of this cell before relying on the global `trainer`.
saveData()
finetune()

['Nice pants. Can I test the zipper?', "You're like my menorah's candles…getting hotter every day.", 'I still exist, in case you have forgotten.', "If a fat man puts you in a bag at night, don't worry I told Santa I wanted you for Christmas.", "You know, if I were you, I'd have sex with me."]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




Token indices sequence length is longer than the specified maximum sequence length for this model (174237 > 1024). Running this sequence through the model will result in indexing errors


Step,Training Loss
5,3.8328
10,3.3756
15,3.3027
20,3.264
25,3.1921
30,3.3297
35,2.933
40,3.075
45,2.8986
50,2.9002


Upload to cloud


In [None]:
%cd /content/drive/My\ Drive/ML/pickup/PickupLineGenerator
# Write the fine-tuned model and the tokenizer files into the current
# directory, i.e. the cloned model repository entered by the %cd above.
trainer.save_model("./")
tokenizer.save_pretrained("./")

# NOTE(review): the git commands below are disabled by wrapping them in a
# string literal, so they are not executed.
'''
!git config --global user.email "hjk.businessemail@gmail.com"
!git config --global user.name "HJK"
!git add --all
!git status
!git commit -m "version 0.1"
!git push
'''
# Pushing from Colab did not work (git could not read a username for
# https://huggingface.co), so download the files and push them with git
# from your own computer instead.

/content/drive/My Drive/ML/pickup/PickupLineGenerator
On branch main
Your branch is ahead of 'origin/main' by 3 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
On branch main
Your branch is ahead of 'origin/main' by 3 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
fatal: could not read Username for 'https://huggingface.co': No such device or address


Predicting Demo



In [None]:
# Requires the train cell to have run: predict() uses the global `trainer`.
predict("I love")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I love → you like a nerd loves socks.
I love → you. You've been my butt ever since, I know you've been a joy to find my heart. Want to fuck?
I love → you as an artist, yet don't know you want to know how beautiful he is?
I love → you like I could ever imagine!
I love → you like Mickey Mouse loves candy.
I love → you like a squirrel loves a duck.
I love → you like a baby loves his tongue.
I love → you because you remind me of a piece of art that was made in my honor.
I love → you. We have a few more hours until my name comes up.
I love → you like my sister loved the Titanic.
