In [1]:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor
from pydispatch import dispatcher
import logging
import json
import re
import os
import pandas as pd
import numpy as np
import gc
import copy

from datasets import load_dataset
from datasets.arrow_dataset import Dataset
# from transformers import AutoTokenizer
# from transformers import AutoModelForSequenceClassification
# from transformers import DistilGPT2Model
# from transformers import DistilGPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from datasets import load_metric

import torch

from bs4 import BeautifulSoup
from lyricsgenius import Genius
import pickle

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('nlp')
logger.setLevel(logging.INFO)

[nltk_data] Downloading package punkt to /home/masdevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Fetch sentences

In [2]:
def get_web_data(function):
    def f(q):
        try:
            json_data = function()
            q.put(json.dumps(json_data))
        except Exception as e:
            q.put(e)
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    gc.collect()
    try:
        json_data = json.loads(result)
    except:
        raise result
    return json_data

## USA government RSS channels parsing

In [3]:
def get_usagov_rss_data():
    storage = {}
    try:
        class UsaGovSpider(scrapy.Spider):
            name = "usagov_checker"
            start_urls = ['https://www.state.gov/rss-feeds/']
            
            def parse(self, response, depth=0):
                if depth == 0:
                    a_selectors = response.xpath("//a")
                    for selector in a_selectors:
                        link = selector.xpath("@href").extract_first()
                        if 'rss' in link:
                            yield response.follow(link, self.parse, cb_kwargs={'depth' : 1})
                else:
                    storage[str(response.url)] = response.body.decode("utf-8")

        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(UsaGovSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Joke RSS channels parsing

In [4]:
def get_joke_rss_data():
    storage = {}
    try:
        class JokeSpider(scrapy.Spider):
            name = "joke_checker"
            #start_urls = ['https://blog.feedspot.com/jokes_rss_feeds/'] # 403 error - bot detected, access denied
            start_urls = [
                'http://www.jokesoftheday.net/jokes-feed/', 
                'http://www.funnyshortjokes.com/feed',
                'https://laffgaff.com/feed/',
                'https://www.keeplaughingforever.com/blog//blog-feed.xml',
                'https://lite92.ca/category/joke-of-the-day/feed/',
                'https://www.thelaughline.com/feed/',
                'https://www.jokesbykids.com/riddle/feed/',
                'https://newbloggycat.com/category/good-clean-jokes/feed/',
                'https://www.super-funny.com/feed/',
                'http://slay.me/feed',
                'https://www.funny-jokes-quotes-sayings.com/funny-jokes.xml',
                'http://modest-jokes.blogspot.com/feeds/posts/default?alt=rss',
                'https://badkidsjokes.tumblr.com/rss',
                'http://chillyjokes.com/chillyjokes/jokes/feed/',
                'https://laughbreak.com/pictures/its-okay-to-feed-the-ducks-bread-now/',
                'https://acornyjokeaday.tumblr.com/rss',
                'https://somejokeshere.blogspot.com/feeds/posts/default?alt=rss'
            ]

            def parse(self, response, depth=0):
                storage[str(response.url)] = response.body.decode("utf-8")
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(JokeSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Rap texts parsing

In [5]:
def read_genius_token():
    with open('genius_token.txt', 'r') as f:
        token = f.readline().strip()
    return token


In [6]:
def get_rap_data():
    genius = Genius(read_genius_token())
    storage = {}
    try:
        class RapSpider(scrapy.Spider):
            name = "rap_checker"
            start_urls = [
                'https://bestlifeonline.com/funniest-rap-lyrics/'
            ]

            def parse(self, response, depth=0):
                resp = response.xpath("//div[@class='content noskimwords']").xpath("//h2/text()")
                for item in resp:
                    match = re.match("[0-9]+\. ([A-Za-z0-9\s\-\'\.]+), \"([A-Za-z0-9\s\-\'\.]+)\"", item.get().strip())
                    if match:
                        try:
                            author = match.group(1)
                            song = match.group(2)
                            name = author+'|'+song
                            print((author, song))
                            song = genius.search_song(song, author)
    #                         print(song.lyrics)
                            print(len(song.lyrics))
                            storage[name] = song.lyrics
                        except Exception as e:
                            print(f'!!! Exception passed on {name}', str(e))
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(RapSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

In [7]:
storage_file = 'storage.pkl'
def is_rawdata_exist(storage_file):
    return os.path.exists(storage_file)
    # return False

if is_rawdata_exist(storage_file):
    print('LOADING')
    with open(storage_file, 'rb') as f:
        storage = pickle.load(f)
else:
    print('FROM WEB')
    storage = {}  
    storage['usagov'] = get_web_data(get_usagov_rss_data)
    storage['joke'] = get_web_data(get_joke_rss_data)
    storage['rap'] = get_web_data(get_rap_data)
    with open(storage_file, 'wb') as f:
        pickle.dump(storage, f)


LOADING


# Preprocess sentences for HuggingFace

In [8]:
storage_to_process = copy.deepcopy(storage)

map_labels = {'usagov' : 0, 'joke' : 1, 'rap' : 2}

words_limit = 128

def remove_html_tags(text):
    html_detector = re.compile(r'<.*?>')
    return html_detector.sub(r'', text)

def remove_links(text):
    links_remover = re.compile(r'https?://\S+|www\.\S+')
    return links_remover.sub(r'', text)

def preprocess_html(body, local_processed_data, topic_key):
#     l = 0
    soup = BeautifulSoup(body)
    p_tags = soup.find_all('p')
    for each in p_tags:
        processed = sent_tokenize(each.text.encode('ascii', 'ignore').decode('ascii').lower().strip())
        for processed_item in processed:
            processed_item = re.sub("[\d\.]+", '', processed_item).strip()
            if len(processed_item) > 0:
                if topic_key == 'joke':
                    if processed_item.startswith('a: ') or processed_item.startswith('q: '):
                        res_proc_item = processed_item[3:]
                    else:
                        res_proc_item = processed_item
                elif topic_key == 'usagov':
                    if len(processed_item.split(' ')) > 3:
                        res_proc_item = processed_item
                else:
                    raise Exception(f'Unknown topic_key for preprocess_html(): {topic_key}')
                res_proc_item = ' '.join(res_proc_item.split(' ')[:words_limit])
                local_processed_data.append(res_proc_item)
#             l += len(processed)
#     print(l)

def get_data_rss(substorage, topic_key):
    local_processed_data = []
    for link, body in substorage.items():
        preprocess_html(body, local_processed_data, topic_key)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

def join_rap_sentences(local_lines, lines_in_group):
    rap_sentences = []
    idx = 0
    while idx < len(local_lines):
        line = local_lines[idx].strip().lower()
        if len(line) == 0:
            del local_lines[idx]
            continue
        if idx + lines_in_group > len(local_lines):
            res_proc_item = line
        else:
            second_line = local_lines[idx + 1].strip().lower()
            if len(second_line) == 0:
                del local_lines[idx + 1]
                continue
            res_proc_item = line + ' ' + second_line
        rap_sentences.append(' '.join(res_proc_item.split(' ')[:words_limit]))
        idx += lines_in_group
    return rap_sentences

# TODO how to have a deal with slang?
def get_approx_sentences_from_lyrics(lyrics, local_processed_data):
    couplets = re.split('\[.*\]' ,lyrics)
    for couplet in couplets:
        local_lines = couplet.split('\n')
        
        if len(local_lines) == 1:
            continue
        lines_in_group = 2
        rap_sentences = join_rap_sentences(local_lines, lines_in_group)
        local_processed_data.extend(rap_sentences)

def get_data_lyrics(substorage, topic_key):
    local_processed_data = []
    for lyrics in substorage.values():
        approx_sentences = get_approx_sentences_from_lyrics(lyrics, local_processed_data)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

processed_data = []
processed_labels = []
for topic_key in map_labels.keys():
    if topic_key == 'joke' or topic_key == 'usagov':
        local_processed_data, local_processed_labels = get_data_rss(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    elif topic_key == 'rap':
        local_processed_data, local_processed_labels = get_data_lyrics(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    else:
        raise Exception(f'Unknown topic_key: {topic_key}')

    



In [9]:
df_text_data = pd.DataFrame({'text' : processed_data, 'label' : processed_labels})

In [10]:
df_text_data

Unnamed: 0,text,label
0,"antony j blinken, secretary of state",0
1,"on behalf of the american people, i would like...",0
2,the united states and tanzania enjoy a longsta...,0
3,as tanzania also will soon celebrate years of...,0
4,i send my best wishes to the tanzanian people ...,0
...,...,...
25165,"i arrived in front of the dormitory ""yo, could...",2
25166,they showed me where it was for the moment i d...,2
25167,"so, i came to her room and opened the door oh,...",2
25168,a fella tongue-kissin' my girl in her mouth i ...,2


In [11]:
df_text_data['label'].value_counts()

0    22494
2     1375
1     1301
Name: label, dtype: int64

In [12]:
f = lambda x: len(x.split(' '))
lens = df_text_data['text'].apply(f)
lens.value_counts()

8      1180
4      1061
14      985
7       961
12      945
       ... 
86        2
106       2
94        2
109       2
97        2
Name: text, Length: 103, dtype: int64

## Reduce number of samples of the largest class

In [13]:
index_of_usagov = df_text_data[df_text_data['label'] == 0].index
index_of_usagov
remove_n = 18000
drop_indices = np.random.choice(index_of_usagov, remove_n, replace=False)
df_text_data_red = df_text_data.drop(drop_indices).reset_index()

In [14]:
df_text_data_red['label'].value_counts()

0    4494
2    1375
1    1301
Name: label, dtype: int64

# HuggingFace transformers is coming!

## Prepare dataset

In [15]:
hf_dataset = Dataset.from_pandas(df_text_data_red).remove_columns('index')

In [16]:
hf_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7170
})

In [17]:
hf_dataset[5000]

{'text': "what is will smith's favourite band?", 'label': 1}

In [18]:
model_name = 'distilgpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=32, return_tensors='pt')

tokenized_datasets = hf_dataset.map(tokenize_function)

  0%|          | 0/7170 [00:00<?, ?ex/s]

In [20]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 7170
})

In [21]:
len(tokenized_datasets['input_ids'][0][0])

32

In [22]:
print(tokenized_datasets['text'][0])

antony j blinken, secretary of state


In [23]:
labels = torch.tensor(tokenizer.encode("hello my name is", add_special_tokens=False)).unsqueeze(0)

In [24]:
labels

tensor([[31373,   616,  1438,   318]])

In [25]:
gc.collect()
torch.cuda.empty_cache()

In [26]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [27]:
generator("Hello, I'm a language model", max_length=200, num_return_sequences=3)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I\'m a language model, it has a lot of complexity.\n\nIt is hard to think of anything beyond the possibility of some kind of "interpreting" of a project. What if my job is to help people understand how a project works, the results you see in their reports and the details in the documentation don\'t always match those of other projects? What if a person is thinking of building on that theory, what if you really want to get into those other projects that are not already in development?\nIn the future I think that I can get into some aspects of project development and how some other ideas, techniques and code can be implemented more efficiently to prevent issues from happening.\nAll in all, I know that a lot of people are frustrated in getting into projects (as with many others) because they are interested in getting started on something that is not a great project, or there is always something they are looking forward to when they go back to work. It'},
 {'gen

In [28]:
gc.collect()
torch.cuda.empty_cache()

# training_args = TrainingArguments(
#     do_train=True,
#     do_eval=True,
#     evaluation_strategy="steps",
#     eval_steps=1000,
#     logging_dir="exp/bart/logs",
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     gradient_accumulation_steps=2,
#     eval_accumulation_steps=1,
#     output_dir="test_trainer",
#     num_train_epochs=50)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset
# )

In [29]:
from torch.utils.data import DataLoader


train_dataloader = DataLoader(tokenized_datasets, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets, batch_size=8)

In [30]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [31]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [32]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [33]:
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [34]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        print(len(batch['input_ids']))
#         print(batch)
#         print(batch.keys())
#         print(len(batch['text']))
        ids = torch.stack(batch['input_ids'][0]).to(device)
#         attention_mask = torch.stack(batch['attention_mask'][0]).to(device)
        print(ids)
        print(ids.shape)
#         print(attention_mask.shape)
#         outputs = model(input_ids=ids, attention_mask=attention_mask)
#         print(outputs)
#         print(outputs.logits.shape)
#         print(outputs)
#         print(outputs.logits)
#         print(outputs.logits.shape)
#         print('#' * 50)

        inputs_for_gener = model.prepare_inputs_for_generation(input_ids=ids)

    #         print(inputs_for_gener['perm_mask'].shape)
#         print(inputs_for_gener['target_mapping'].shape)
        print(inputs_for_gener)
        print(inputs_for_gener['input_ids'].shape)
    
#         outputs = model(input_ids=inputs_for_gener['input_ids'], perm_mask=inputs_for_gener['perm_mask'], 
#                         target_mapping=inputs_for_gener['target_mapping'])
#         print(outputs)
#         print(outputs.logits.shape)
        
    
        sarfw
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

  0%|          | 0/2691 [00:00<?, ?it/s]

1
tensor([[  392,   270,    67,  1050, 43395,   568,    12, 21078],
        [ 1312,   318,   824,    76,  2756,   356,  5769,   560],
        [  892,   257,   290,   635,    25,  2067,    82, 21019],
        [  356, 29210,   262,  7634,   356,  1561,  6513,   268],
        [  303,   284,  3230, 18656,   655,   259,  7252,    25],
        [  477,   262,  2324,  5745,   466,  3256,   828, 43395],
        [ 1775,  1109,  2995,   326,   407,   651,   319,  8900],
        [  262,   326,  1448,   423,   423, 43701,  5153,    11],
        [13530,   612,   357,  2722,   281,     6, 32316, 12759],
        [  351,   389,   786,  4918,  4296,  5385,   739,  2888],
        [  543,  2972,    70,   422,   284,  4341,  3670,  6106],
        [  674,  9619,     8,   778,  2148,   259, 46955,   354],
        [  334,   284,  5597,    76, 50256,     6,   286,    11],
        [   74, 10996,   329,   287, 50256,   257,   262,  5875],
        [ 3201,  1022,  1933,   262, 50256,  1256,   719,   345],
        

NameError: name 'sarfw' is not defined