In [1]:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor
from pydispatch import dispatcher
import logging
import json
import re
import os
import pandas as pd
import numpy as np
import gc
import copy

from tqdm.auto import tqdm

from datasets import load_dataset
from datasets.arrow_dataset import Dataset
# from transformers import AutoTokenizer
# from transformers import AutoModelForSequenceClassification
# from transformers import DistilGPT2Model
# from transformers import DistilGPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import pipeline

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader

from bs4 import BeautifulSoup
from lyricsgenius import Genius
import pickle

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('nlp')
logger.setLevel(logging.INFO)

[nltk_data] Downloading package punkt to /home/masdevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Fetch sentences

In [2]:
def get_web_data(function):
    def f(q):
        try:
            json_data = function()
            q.put(json.dumps(json_data))
        except Exception as e:
            q.put(e)
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    gc.collect()
    try:
        json_data = json.loads(result)
    except:
        raise result
    return json_data

## USA government RSS channels parsing

In [3]:
def get_usagov_rss_data():
    storage = {}
    try:
        class UsaGovSpider(scrapy.Spider):
            name = "usagov_checker"
            start_urls = ['https://www.state.gov/rss-feeds/']
            
            def parse(self, response, depth=0):
                if depth == 0:
                    a_selectors = response.xpath("//a")
                    for selector in a_selectors:
                        link = selector.xpath("@href").extract_first()
                        if 'rss' in link:
                            yield response.follow(link, self.parse, cb_kwargs={'depth' : 1})
                else:
                    storage[str(response.url)] = response.body.decode("utf-8")

        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(UsaGovSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Joke RSS channels parsing

In [4]:
def get_joke_rss_data():
    storage = {}
    try:
        class JokeSpider(scrapy.Spider):
            name = "joke_checker"
            #start_urls = ['https://blog.feedspot.com/jokes_rss_feeds/'] # 403 error - bot detected, access denied
            start_urls = [
                'http://www.jokesoftheday.net/jokes-feed/', 
                'http://www.funnyshortjokes.com/feed',
                'https://laffgaff.com/feed/',
                'https://www.keeplaughingforever.com/blog//blog-feed.xml',
                'https://lite92.ca/category/joke-of-the-day/feed/',
                'https://www.thelaughline.com/feed/',
                'https://www.jokesbykids.com/riddle/feed/',
                'https://newbloggycat.com/category/good-clean-jokes/feed/',
                'https://www.super-funny.com/feed/',
                'http://slay.me/feed',
                'https://www.funny-jokes-quotes-sayings.com/funny-jokes.xml',
                'http://modest-jokes.blogspot.com/feeds/posts/default?alt=rss',
                'https://badkidsjokes.tumblr.com/rss',
                'http://chillyjokes.com/chillyjokes/jokes/feed/',
                'https://laughbreak.com/pictures/its-okay-to-feed-the-ducks-bread-now/',
                'https://acornyjokeaday.tumblr.com/rss',
                'https://somejokeshere.blogspot.com/feeds/posts/default?alt=rss'
            ]

            def parse(self, response, depth=0):
                storage[str(response.url)] = response.body.decode("utf-8")
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(JokeSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Rap texts parsing

In [5]:
def read_genius_token():
    with open('genius_token.txt', 'r') as f:
        token = f.readline().strip()
    return token


In [6]:
def get_rap_data():
    genius = Genius(read_genius_token())
    storage = {}
    try:
        class RapSpider(scrapy.Spider):
            name = "rap_checker"
            start_urls = [
                'https://bestlifeonline.com/funniest-rap-lyrics/'
            ]

            def parse(self, response, depth=0):
                resp = response.xpath("//div[@class='content noskimwords']").xpath("//h2/text()")
                for item in resp:
                    match = re.match("[0-9]+\. ([A-Za-z0-9\s\-\'\.]+), \"([A-Za-z0-9\s\-\'\.]+)\"", item.get().strip())
                    if match:
                        try:
                            author = match.group(1)
                            song = match.group(2)
                            name = author+'|'+song
                            print((author, song))
                            song = genius.search_song(song, author)
    #                         print(song.lyrics)
                            print(len(song.lyrics))
                            storage[name] = song.lyrics
                        except Exception as e:
                            print(f'!!! Exception passed on {name}', str(e))
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(RapSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

In [7]:
storage_file = 'storage.pkl'
def is_rawdata_exist(storage_file):
    return os.path.exists(storage_file)
    # return False

if is_rawdata_exist(storage_file):
    print('LOADING')
    with open(storage_file, 'rb') as f:
        storage = pickle.load(f)
else:
    print('FROM WEB')
    storage = {}  
    storage['usagov'] = get_web_data(get_usagov_rss_data)
    storage['joke'] = get_web_data(get_joke_rss_data)
    storage['rap'] = get_web_data(get_rap_data)
    with open(storage_file, 'wb') as f:
        pickle.dump(storage, f)


LOADING


# Preprocess sentences for HuggingFace

In [8]:
storage_to_process = copy.deepcopy(storage)

map_labels = {'usagov' : 0, 'joke' : 1, 'rap' : 2}

words_limit = 32
words_at_least = 4

def remove_html_tags(text):
    html_detector = re.compile(r'<.*?>')
    return html_detector.sub(r'', text)

def remove_links(text):
    links_remover = re.compile(r'https?://\S+|www\.\S+')
    return links_remover.sub(r'', text)

def split_sentence_to_approproate_length(res_proc_item):
    sentences = []
    splitted = res_proc_item.split(' ')
    for idx in range(0, len(splitted), words_limit):
        part_of_words = splitted[idx:idx+words_limit]
        if len(part_of_words) < words_at_least:
            continue
        else:
            sentences.append(' '.join(part_of_words))
    return sentences

def preprocess_html(body, local_processed_data, topic_key):
#     l = 0
    soup = BeautifulSoup(body)
    p_tags = soup.find_all('p')
    for each in p_tags:
        processed = sent_tokenize(each.text.encode('ascii', 'ignore').decode('ascii').lower().strip())
        for processed_item in processed:
            processed_item = re.sub("[\d\.]+", '', processed_item).strip()
            if len(processed_item) > 0:
                if topic_key == 'joke':
                    if processed_item.startswith('a: ') or processed_item.startswith('q: '):
                        res_proc_item = processed_item[3:]
                    else:
                        res_proc_item = processed_item
                elif topic_key == 'usagov':
                    if len(processed_item.split(' ')) > 3:
                        res_proc_item = processed_item
                else:
                    raise Exception(f'Unknown topic_key for preprocess_html(): {topic_key}')
#                 res_proc_item = ' '.join([:words_limit])
                local_processed_data.extend(split_sentence_to_approproate_length(res_proc_item))
#             l += len(processed)
#     print(l)

def get_data_rss(substorage, topic_key):
    local_processed_data = []
    for link, body in substorage.items():
        preprocess_html(body, local_processed_data, topic_key)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

def join_rap_sentences(local_lines, lines_in_group):
    rap_sentences = []
    idx = 0
    while idx < len(local_lines):
        line = local_lines[idx].strip().lower()
        if len(line) == 0:
            del local_lines[idx]
            continue
        if idx + lines_in_group > len(local_lines):
            res_proc_item = line
        else:
            second_line = local_lines[idx + 1].strip().lower()
            if len(second_line) == 0:
                del local_lines[idx + 1]
                continue
            res_proc_item = line + ' ' + second_line
        splitted = res_proc_item.split(' ')[:words_limit]
        if len(splitted) >= words_at_least:
            rap_sentences.append(' '.join(splitted))
        idx += lines_in_group
    return rap_sentences

# TODO how to have a deal with slang?
def get_approx_sentences_from_lyrics(lyrics, local_processed_data):
    couplets = re.split('\[.*\]' ,lyrics)
    for couplet in couplets:
        local_lines = couplet.split('\n')
        
        if len(local_lines) == 1:
            continue
        lines_in_group = 2
        rap_sentences = join_rap_sentences(local_lines, lines_in_group)
        local_processed_data.extend(rap_sentences)

def get_data_lyrics(substorage, topic_key):
    local_processed_data = []
    for lyrics in substorage.values():
        approx_sentences = get_approx_sentences_from_lyrics(lyrics, local_processed_data)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

processed_data = []
processed_labels = []
for topic_key in map_labels.keys():
    if topic_key == 'joke' or topic_key == 'usagov':
        local_processed_data, local_processed_labels = get_data_rss(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    elif topic_key == 'rap':
        local_processed_data, local_processed_labels = get_data_lyrics(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    else:
        raise Exception(f'Unknown topic_key: {topic_key}')

    



In [9]:
df_text_data = pd.DataFrame({'text' : processed_data, 'label' : processed_labels})

In [10]:
df_text_data

Unnamed: 0,text,label
0,"antony j blinken, secretary of state",0
1,"on behalf of the american people, i would like...",0
2,the united states and tanzania enjoy a longsta...,0
3,as tanzania also will soon celebrate years of...,0
4,"in the areas of health, education, governance,...",0
...,...,...
28238,"i arrived in front of the dormitory ""yo, could...",2
28239,they showed me where it was for the moment i d...,2
28240,"so, i came to her room and opened the door oh,...",2
28241,a fella tongue-kissin' my girl in her mouth i ...,2


In [11]:
df_text_data['label'].value_counts()

0    25755
2     1351
1     1137
Name: label, dtype: int64

In [12]:
df_text_data['text'].apply(lambda x: len(x.split(' '))).value_counts()

32    4603
4     1347
8     1341
5     1175
7     1149
9     1137
14    1115
6     1112
11    1069
12    1061
10    1021
17     994
13     943
16     922
15     876
18     874
21     838
19     828
22     685
23     659
20     630
24     626
26     515
28     513
25     492
27     472
30     467
29     405
31     374
Name: text, dtype: int64

## Reduce number of samples of the largest class

In [13]:
index_of_usagov = df_text_data[df_text_data['label'] == 0].index
index_of_usagov
remove_n = 24500
drop_indices = np.random.choice(index_of_usagov, remove_n, replace=False)
df_text_data_red = df_text_data.drop(drop_indices).reset_index()

In [14]:
df_text_data_red['label'].value_counts()

2    1351
0    1255
1    1137
Name: label, dtype: int64

# HuggingFace transformer is coming!

## Prepare dataset

In [15]:
hf_dataset = Dataset.from_pandas(df_text_data_red).remove_columns('index')

In [16]:
hf_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 3743
})

In [17]:
hf_dataset[1000]

{'text': 'additionally, there have been credible reports of targeting of investigative journalists and restrictions on media content',
 'label': 0}

In [18]:
model_name = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=32, return_tensors='pt')

tokenized_datasets = hf_dataset.map(tokenize_function)
gc.collect()
torch.cuda.empty_cache()

  0%|          | 0/3743 [00:00<?, ?ex/s]

In [20]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3743
})

In [21]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
generated = generator("Hello, I'm a language model", max_length=30, num_return_sequences=3)
gc.collect()
torch.cuda.empty_cache()
print(generated)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model by which I create and use programming language functions to program in the language and in the languages, such as Java."}, {'generated_text': "Hello, I'm a language model.\nThis is a language that requires a framework to handle, and that's a very good point. I've"}, {'generated_text': "Hello, I'm a language model and I know a bit about it. I don't get the impression people want to hear of you, I'm"}]


In [22]:
model.device

device(type='cpu')

In [25]:
model.save_pretrained("model_storage")
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
# from transformers import GPT2Config

# cpu = torch.device("cpu")
# model_state_dict = model.state_dict()
device = torch.device("cuda")
trained_model = GPT2LMHeadModel.from_pretrained("model_storage").to(device)

gc.collect()
torch.cuda.empty_cache()

def check_weights_cum_norm(model):
    cum_norm = 0
    for name, W in model.named_parameters():
        cum_norm += W.norm(2)
    return cum_norm

def eval_on_loader(trained_model, loader):
    with torch.no_grad():
    for batch in train_dataloader:
        ids = torch.stack(batch['input_ids'][0]).to(device)
        outputs = trained_model(input_ids=ids)
        logits = outputs['logits']
        sentences = logits.transpose(0, 1)
        labels = ids.transpose(0, 1)
        losses = []
        for sentence, label in zip(sentences, labels):
            sentence_cloned = sentence.clone()
            label_cloned = label.clone()
            with torch.no_grad():
                torch.nn.functional.relu(sentence_cloned, inplace=True)
            losses.append(criterion(sentence_cloned, label_cloned))
        return sum(losses)

batch_size = 64
train_dataloader = DataLoader(tokenized_datasets, shuffle=True, batch_size=batch_size)
# eval_dataloader = DataLoader(tokenized_datasets, shuffle=True, batch_size=batch_size)
optimizer = AdamW(trained_model.parameters(), lr=0.001)
scheduler = ExponentialLR(optimizer, gamma=0.85)

criterion = CrossEntropyLoss()
num_epochs = 10
progress_bar_epoch = tqdm(range(num_epochs))

num_batches = len(tokenized_datasets) // batch_size
if len(tokenized_datasets) % batch_size:
    num_batches += 1
for epoch in range(num_epochs):
    print(f'Epoch {epoch} started. Eval on train: {eval_on_loader(trained_model, train_dataloader)}. Cum norm: {check_weights_cum_norm(trained_model)}')
    progress_bar_batch = tqdm(range(num_batches))
    for batch in train_dataloader:
        optimizer.zero_grad()
#         print(len(batch['input_ids']))
        ids = torch.stack(batch['input_ids'][0]).to(device)
        outputs = trained_model(input_ids=ids)
        logits = outputs['logits']
        sentences = logits.transpose(0, 1)
        labels = ids.transpose(0, 1)
#         print(sentences.shape)
#         print(labels.shape)
        losses = []
        for sentence, label in zip(sentences, labels):
            sentence_cloned = sentence.clone()
            label_cloned = label.clone()
#             print(sentence_cloned.shape, label_cloned.shape)
            with torch.no_grad():
                torch.nn.functional.relu(sentence_cloned, inplace=True)
            losses.append(criterion(sentence_cloned, label_cloned))
        
        res_loss = sum(losses)
        res_loss.backward()
        # clip grads?
        optimizer.step()
        progress_bar_batch.update(1)
        gc.collect()
        torch.cuda.empty_cache()
    progress_bar_epoch.update(1)
    scheduler.step()


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 0 started. Eval on train: 689.0586547851562. Cum norm: 5464.58984375


  0%|          | 0/59 [00:00<?, ?it/s]

Epoch 1 started. Eval on train: 672.4696044921875. Cum norm: 5586.49755859375


  0%|          | 0/59 [00:00<?, ?it/s]

Epoch 2 started. Eval on train: 322.13385009765625. Cum norm: 5747.8359375


  0%|          | 0/59 [00:00<?, ?it/s]

Epoch 3 started. Eval on train: 154.06900024414062. Cum norm: 5842.2724609375


  0%|          | 0/59 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
trained_model_cpu = trained_model.to(cpu)
generator = pipeline('text-generation', model=trained_model, tokenizer=tokenizer)
generated = generator("hello", max_length=30, num_return_sequences=3)
gc.collect()
torch.cuda.empty_cache()
print(generated)