In [2]:
import os
import json
import random
from glob import glob
import re
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import transformers

# Eng

# Base directories: raw inputs sit directly under `data`, preprocessed corpora under `data/prep`.
_raw_dir = 'data'
_prep_dir = os.path.join(_raw_dir, 'prep')

# Raw inputs.
qa_jokes_filepath = os.path.join(_raw_dir, 'qa_jokes.csv')
short_jokes_filepath = os.path.join(_raw_dir, 'short_jokes.csv')
transcripts_path = os.path.join(_raw_dir, 'transcripts')

# Preprocessed text files written by the cells below.
qa_jokes_prep_outpath = os.path.join(_prep_dir, 'qa_jokes_gpt2.txt')
short_jokes_prep_outpath = os.path.join(_prep_dir, 'short_jokes_gpt2.txt')
transcripts_prep_outpath = os.path.join(_prep_dir, 'transcripts_gpt2.txt')

## Preprocess data

In [3]:
def fix_encoding(s):
    """Return `s` with every character that cannot be UTF-8 encoded removed.

    The string is encoded to bytes and decoded back; both steps silently
    drop anything they cannot handle instead of raising.
    """
    raw_bytes = s.encode('utf-8', errors='ignore')
    return raw_bytes.decode('utf8', errors='ignore')

# TODO: Add &amp;nbsp;  &gt;  &lt;
# Ordered (compiled pattern, replacement) pairs; fix_text applies them from top to bottom.
regexps = [ # Regexp for the special chars
    (re.compile('♦'), '*'),
    # Collapse a whole run of empty (or space-only) lines in a single match.
    # A plain two-newline pattern consumes newlines pairwise, so '\n\n\n'
    # would come out as '\n\n' instead of one newline.
    (re.compile(r'\n(?: *\n)+'), '\n'),
    (re.compile(r' {2,}'), ' '), # Replace multiple spaces with one
]

def fix_text(s):
    """Clean `s` for the training corpus.

    Applies every (pattern, replacement) pair from `regexps` in order,
    strips surrounding whitespace, and finally passes the result through
    `fix_encoding`.
    """
    for pattern, replacement in regexps:
        s = pattern.sub(replacement, s)
    stripped = s.strip()
    return fix_encoding(stripped)

def write_to_file(file_path, text, encoding=None):
    """Write `text` to `file_path`, creating missing parent directories first.

    Args:
        file_path: Destination path; an existing file is overwritten.
        text: String content to write.
        encoding: Passed to `open`; `None` uses the platform default encoding.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding=encoding) as out_file:
        out_file.write(text)

# Prefix placed in front of every corpus entry (and of the generation prompt); currently empty.
START_DOC_TOKEN = ""
# Terminator written after every corpus entry and passed as the generation stop token.
END_DOC_TOKEN = "<|endoftext|>"

### QA Jokes
Taken from [here](https://www.kaggle.com/jiriroz/qa-jokes).

Cleaned of noisy/non-representable data (notes, already-inserted "Q"/"A" tags).

In [4]:
# Load the QA jokes CSV from the configured path and print a column/dtype summary.
qa_jokes = pd.read_csv(qa_jokes_filepath)
qa_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38232 entries, 0 to 38231
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        38232 non-null  int64 
 1   Question  38232 non-null  object
 2   Answer    38232 non-null  object
dtypes: int64(1), object(2)
memory usage: 896.2+ KB


In [5]:
# Render every (ID, Question, Answer) row as one tagged document, clean it with
# fix_text, and join the documents with newlines into a single corpus string.
qa_corpus = '\n'.join(
    fix_text(f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}')
    for _, question, answer in qa_jokes.values
)

write_to_file(qa_jokes_prep_outpath, qa_corpus, encoding='utf-8')

### Transcripts
Scraped dataset of stand-up transcripts from [scrapsfromtheloft.com](https://scrapsfromtheloft.com).

### Short Jokes
Dataset taken from [here](https://www.kaggle.com/abhinavmoudgil95/short-jokes).

Also cleaned up (Twitter tags, f@ck/@sshole words, samples with a link to something).

In [6]:
# Read from the project-relative path configured at the top of the notebook rather than
# a machine-specific absolute path, so the notebook runs from any checkout.
short_jokes = pd.read_csv(short_jokes_filepath)
short_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230974 entries, 0 to 230973
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      230974 non-null  int64 
 1   Joke    230974 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [7]:
# Find QA jokes in short jokes
from nltk.tokenize import sent_tokenize


def _is_qa_shaped(joke):
    """True when the joke splits into 2 or 3 sentences and the first ends with '?'."""
    sentences = sent_tokenize(joke.strip())
    return 1 < len(sentences) < 4 and sentences[0][-1] == '?'


# Row positions (from enumerate) of the short jokes that look like question/answer jokes.
qa_jokes_in_short_jokes = [
    position
    for position, (_, joke) in enumerate(short_jokes.values)
    if _is_qa_shaped(joke)
]

In [8]:
# Show random one
# No random seed is set in the visible cells, so a different joke may appear on each run.
ind = np.random.randint(len(qa_jokes_in_short_jokes))
short_jokes.iloc[qa_jokes_in_short_jokes[ind]].Joke

"What's the difference between a job and a wife? After 10 years, a job still sucks!"

In [9]:
# Inspect the randomly selected row: first its Python type, then its fields.
print(type(short_jokes.iloc[qa_jokes_in_short_jokes[ind]]))
print(short_jokes.iloc[qa_jokes_in_short_jokes[ind]])

<class 'pandas.core.series.Series'>
ID                                                 201384
Joke    What's the difference between a job and a wife...
Name: 200799, dtype: object


In [None]:
# Debug summary: show only the size and a short prefix of the index list, and preview
# the frame with head(). Printing the full list and the full frame floods the cell
# output and bloats the saved notebook.
print(len(qa_jokes_in_short_jokes), qa_jokes_in_short_jokes[:10])
print(ind)
short_jokes.head()

In [21]:
# Unpacking a row yields its values in column order: x is the ID, y is the joke text.
x, y = short_jokes.iloc[qa_jokes_in_short_jokes[ind]]
print(x, y)

201384 What's the difference between a job and a wife? After 10 years, a job still sucks!


In [24]:
# Add them to qa jokes
# NOTE: this cell extends `qa_corpus`; re-running it without rebuilding the corpus first
# appends the same entries again.
extra_qa_entries = []
for _joke_id, joke in short_jokes.iloc[qa_jokes_in_short_jokes].values:
    sentences = sent_tokenize(joke.strip())
    # The first sentence is the question; everything after it becomes the answer.
    question, answer = sentences[0], ' '.join(sentences[1:])
    extra_qa_entries.append(
        fix_encoding(f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}\n')
    )

# The existing corpus ends directly after END_DOC_TOKEN; add the missing newline so the
# first appended entry is not glued onto the previous terminator.
if qa_corpus and not qa_corpus.endswith('\n'):
    qa_corpus += '\n'
# Join once instead of growing the string inside the loop.
qa_corpus += ''.join(extra_qa_entries)

# Use the same explicit encoding as the first write of this file; the platform default
# may be a codec that cannot represent every character in the jokes.
write_to_file(qa_jokes_prep_outpath, qa_corpus, encoding='utf-8')

In [25]:
# Delete found qa in short
# The list holds row positions produced by enumerate(); drop() treats them as index
# labels, which matches only because the frame has a default RangeIndex.
# NOTE(review): this rebinds `short_jokes`, so re-running this cell, or the earlier cells
# that index into `short_jokes` by position, after the drop no longer sees the original
# rows -- reload the CSV first.
short_jokes = short_jokes.drop(qa_jokes_in_short_jokes)

In [26]:
# One document per remaining joke: the stripped joke text, a newline, END_DOC_TOKEN and
# a trailing newline. Joined in one pass instead of growing a string inside a loop.
short_jokes_corpus = ''.join(
    fix_encoding(f'{START_DOC_TOKEN}{joke.strip()}\n{END_DOC_TOKEN}\n')
    for _, joke in short_jokes.values
)

# Write with an explicit encoding, as the QA corpus write does; the platform default
# may be a codec that cannot represent every character in the jokes.
write_to_file(short_jokes_prep_outpath, short_jokes_corpus, encoding='utf-8')

## Train

In [27]:
def create_cmd_command(python_path, script, kwargs, flags):
    """Build a command-line string that runs `script` with the given options.

    Args:
        python_path: Interpreter to invoke.
        script: Script path passed to the interpreter.
        kwargs: Mapping rendered as `--key=value`, in iteration order.
        flags: Iterable of names rendered as bare `--name` switches.

    Returns:
        The parts joined by single spaces. Values are inserted verbatim (no
        shell quoting is applied), so callers must embed any quotes they need.
    """
    parts = [str(python_path), str(script)]
    parts.extend(f'--{k}={v}' for k, v in kwargs.items())
    parts.extend(f'--{f}' for f in flags)
    # Joining a single list avoids the stray trailing space (empty `flags`) and the
    # double space (empty `kwargs`) that separate concatenation produced.
    return ' '.join(parts)

In [33]:
# Interpreter and fine-tuning script used to assemble the training command line.
python_path = r'python3'
script = r'run_lm_finetuning.py'
# Options rendered as --key=value by create_cmd_command.
# 'train_data_file', 'model_name_or_path' and 'output_dir' are (re)assigned in later cells.
train_kwargs = {
    'model_type': 'gpt2', # gpt2, ctrl, openai-gpt, xlnet, transfo-xl, xlm
    'model_name_or_path':'gpt2',
    'output_dir':'output',
    'block_size': 512,
    'learning_rate': 1e-6,
    'num_train_epochs': 3,
    'per_gpu_train_batch_size': 2,
    'gradient_accumulation_steps': 8,
    'save_steps': 1000,
#     'max_steps': 20000,
}

# set CUDA_VISIBLE_DEVICES=1

# Model names / output directory names selected by index in a later cell.
# The trailing notes presumably record dataset, learning rate, epoch count and
# gradient-accumulation steps of each past run -- TODO confirm.
train_outputs = [
    'gpt2',
    'output',   # Transcripts 1e-6, 5 grad_acc 4
    'output_1', # Transcripts 1e-5, 2 grad_acc 8
    'output_2', # short_jokes 1e-5, 2 grad_acc 8
    'output_3', # short_jokes 1e-6, 5 grad_acc 8
    'output_4', # short_jokes 1e-7, 2 grad_acc 2
    'output_5', # qa_jokes    1e-5, 3 grad_acc 8 - most funny yet
    'output_6', # qa_jokes    1e-5, 3 grad_acc 4
    'output_7', # qa_jokes    1e-6, 2 grad_acc 2
    'output_8', # qa_jokes    1e-6, 10 grad_acc 8
    
]

# Bare --flag switches appended after the key=value options.
train_flags = [
    'do_train',
    'overwrite_output_dir',
#     'fp16',
]

In [34]:
# Choose the training corpus: leave exactly one of these assignments uncommented.
# train_kwargs['train_data_file'] = transcripts_prep_outpath
# train_kwargs['train_data_file'] = short_jokes_prep_outpath
train_kwargs['train_data_file'] = qa_jokes_prep_outpath

In [35]:
# Pick the starting model and the output directory by index into train_outputs.
# The commented alternative starts from a directory under models/ instead of the bare name.
train_kwargs['model_name_or_path'] = train_outputs[0]
# train_kwargs['model_name_or_path'] = os.path.join('models', train_outputs[0])
train_kwargs['output_dir'] = os.path.join('models', train_outputs[1])
print('From:', train_kwargs['model_name_or_path'], '\nTo:', train_kwargs['output_dir'])

From: gpt2 
To: models/output


In [36]:
# Assemble the training command line and print it; this cell does not execute it.
cmd_command = create_cmd_command(python_path, script, train_kwargs, train_flags)
print(cmd_command)

python3 run_lm_finetuning.py --model_type=gpt2 --model_name_or_path=gpt2 --output_dir=models/output --block_size=512 --learning_rate=1e-06 --num_train_epochs=3 --per_gpu_train_batch_size=2 --gradient_accumulation_steps=8 --save_steps=1000 --train_data_file=data/prep/qa_jokes_gpt2.txt --do_train --overwrite_output_dir


### Generate

In [37]:
# Generation settings, rendered as --key=value by create_cmd_command.
# The model type and the model directory are taken from the training configuration.
gen_script = r'run_generation.py'
generate_kwargs = {
    'model_type': train_kwargs['model_type'],
    'model_name_or_path': train_kwargs['output_dir'],
    # The prompt and stop-token values embed literal double quotes, so they stay quoted
    # in the printed command line.
    'prompt': rf'"{START_DOC_TOKEN}[QUESTION]"',
#     'prompt': '"The reddit enters a bar"',
    'length': 100,
    'stop_token': f'"{END_DOC_TOKEN}"',
    'temperature': 0.9, # temperature of 1.0 has no effect, lower tend toward greedy sampling
    'repetition_penalty': 1.05, # primarily useful for CTRL model; in that case, use 1.2
    'k': 50,
    'p': 0.95,
#     'padding_text': '', # Padding text for Transfo-XL and XLNet.
    'num_return_sequences':40,
}
# No bare switches are passed to the generation script.
gen_flags = []

# Assemble the generation command line and print it; this cell does not execute it.
cmd_command = create_cmd_command(python_path, gen_script, generate_kwargs, gen_flags)
print(cmd_command)

python3 run_generation.py --model_type=gpt2 --model_name_or_path=models/output --prompt="[QUESTION]" --length=100 --stop_token="<|endoftext|>" --temperature=0.9 --repetition_penalty=1.05 --k=50 --p=0.95 --num_return_sequences=40 
