In [1]:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor
from pydispatch import dispatcher
import logging
import json
import re
import os
import pandas as pd
import numpy as np
import gc
import copy

from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_metric


from bs4 import BeautifulSoup
from lyricsgenius import Genius
import pickle

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('nlp')
logger.setLevel(logging.INFO)

[nltk_data] Downloading package punkt to /home/masdevas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
dataset = load_dataset("yelp_review_full")



  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset['train']

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})

In [4]:
df1 = pd.DataFrame({"a": [1, 2, 3]})
df1_ = Dataset.from_pandas(df1)

In [5]:
df1_

Dataset({
    features: ['a'],
    num_rows: 3
})

# Fetch sentences

In [6]:
def get_web_data(function):
    def f(q):
        try:
            json_data = function()
            q.put(json.dumps(json_data))
        except Exception as e:
            q.put(e)
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    gc.collect()
    try:
        json_data = json.loads(result)
    except:
        raise result
    return json_data

## USA government RSS channels parsing

In [7]:
def get_usagov_rss_data():
    storage = {}
    try:
        class UsaGovSpider(scrapy.Spider):
            name = "usagov_checker"
            start_urls = ['https://www.state.gov/rss-feeds/']
            
            def parse(self, response, depth=0):
                if depth == 0:
                    a_selectors = response.xpath("//a")
                    for selector in a_selectors:
                        link = selector.xpath("@href").extract_first()
                        if 'rss' in link:
                            yield response.follow(link, self.parse, cb_kwargs={'depth' : 1})
                else:
                    storage[str(response.url)] = response.body.decode("utf-8")

        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(UsaGovSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Joke RSS channels parsing

In [8]:
def get_joke_rss_data():
    storage = {}
    try:
        class JokeSpider(scrapy.Spider):
            name = "joke_checker"
            #start_urls = ['https://blog.feedspot.com/jokes_rss_feeds/'] # 403 error - bot detected, access denied
            start_urls = [
                'http://www.jokesoftheday.net/jokes-feed/', 
                'http://www.funnyshortjokes.com/feed',
                'https://laffgaff.com/feed/',
                'https://www.keeplaughingforever.com/blog//blog-feed.xml',
                'https://lite92.ca/category/joke-of-the-day/feed/',
                'https://www.thelaughline.com/feed/',
                'https://www.jokesbykids.com/riddle/feed/',
                'https://newbloggycat.com/category/good-clean-jokes/feed/',
                'https://www.super-funny.com/feed/',
                'http://slay.me/feed',
                'https://www.funny-jokes-quotes-sayings.com/funny-jokes.xml',
                'http://modest-jokes.blogspot.com/feeds/posts/default?alt=rss',
                'https://badkidsjokes.tumblr.com/rss',
                'http://chillyjokes.com/chillyjokes/jokes/feed/',
                'https://laughbreak.com/pictures/its-okay-to-feed-the-ducks-bread-now/',
                'https://acornyjokeaday.tumblr.com/rss',
                'https://somejokeshere.blogspot.com/feeds/posts/default?alt=rss'
            ]

            def parse(self, response, depth=0):
                storage[str(response.url)] = response.body.decode("utf-8")
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(JokeSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Rap texts parsing

In [9]:
def read_genius_token():
    with open('genius_token.txt', 'r') as f:
        token = f.readline().strip()
    return token

genius = Genius(read_genius_token())

In [10]:
def get_rap_data():
    storage = {}
    try:
        class RapSpider(scrapy.Spider):
            name = "rap_checker"
            start_urls = [
                'https://bestlifeonline.com/funniest-rap-lyrics/'
            ]

            def parse(self, response, depth=0):
                resp = response.xpath("//div[@class='content noskimwords']").xpath("//h2/text()")
                for item in resp:
                    match = re.match("[0-9]+\. ([A-Za-z0-9\s\-\'\.]+), \"([A-Za-z0-9\s\-\'\.]+)\"", item.get().strip())
                    if match:
                        try:
                            author = match.group(1)
                            song = match.group(2)
                            name = author+'|'+song
                            print((author, song))
                            song = genius.search_song(song, author)
    #                         print(song.lyrics)
                            print(len(song.lyrics))
                            storage[name] = song.lyrics
                        except Exception as e:
                            print(f'!!! Exception passed on {name}', str(e))
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(RapSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

In [11]:
storage_file = 'storage.pkl'
def is_rawdata_exist(storage_file):
    return os.path.exists(storage_file)
    # return False

if is_rawdata_exist(storage_file):
    print('LOADING')
    with open(storage_file, 'rb') as f:
        storage = pickle.load(f)
else:
    print('FROM WEB')
    storage = {}  
    storage['usagov'] = get_web_data(get_usagov_rss_data)
    storage['joke'] = get_web_data(get_joke_rss_data)
    storage['rap'] = get_web_data(get_rap_data)
    with open(storage_file, 'wb') as f:
        pickle.dump(storage, f)


LOADING


# Preprocess sentences for HuggingFace

In [12]:
storage_to_process = copy.deepcopy(storage)

map_labels = {'usagov' : 0, 'joke' : 1, 'rap' : 2}

words_limit = 120

def remove_html_tags(text):
    html_detector = re.compile(r'<.*?>')
    return html_detector.sub(r'', text)

def remove_links(text):
    links_remover = re.compile(r'https?://\S+|www\.\S+')
    return links_remover.sub(r'', text)

def preprocess_html(body, local_processed_data, topic_key):
#     l = 0
    soup = BeautifulSoup(body)
    p_tags = soup.find_all('p')
    for each in p_tags:
        processed = sent_tokenize(each.text.encode('ascii', 'ignore').decode('ascii').lower().strip())
        for processed_item in processed:
            processed_item = re.sub("[\d\.]+", '', processed_item).strip()
            if len(processed_item) > 0:
                if topic_key == 'joke':
                    if processed_item.startswith('a: ') or processed_item.startswith('q: '):
                        res_proc_item = processed_item[3:]
                    else:
                        res_proc_item = processed_item
                elif topic_key == 'usagov':
                    if len(processed_item.split(' ')) > 3:
                        res_proc_item = processed_item
                else:
                    raise Exception(f'Unknown topic_key for preprocess_html(): {topic_key}')
                res_proc_item = ' '.join(res_proc_item.split(' ')[:words_limit])
                local_processed_data.append(res_proc_item)
#             l += len(processed)
#     print(l)

def get_data_rss(substorage, topic_key):
    local_processed_data = []
    for link, body in substorage.items():
        preprocess_html(body, local_processed_data, topic_key)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

def join_rap_sentences(local_lines, lines_in_group):
    rap_sentences = []
    idx = 0
    while idx < len(local_lines):
        line = local_lines[idx].strip().lower()
        if len(line) == 0:
            del local_lines[idx]
            continue
        if idx + lines_in_group > len(local_lines):
            res_proc_item = line
        else:
            second_line = local_lines[idx + 1].strip().lower()
            if len(second_line) == 0:
                del local_lines[idx + 1]
                continue
            res_proc_item = line + ' ' + second_line
        rap_sentences.append(res_proc_item)
        idx += lines_in_group
    return rap_sentences

# TODO how to have a deal with slang?
def get_approx_sentences_from_lyrics(lyrics, local_processed_data):
    couplets = re.split('\[.*\]' ,lyrics)
    for couplet in couplets:
        local_lines = couplet.split('\n')
        
        if len(local_lines) == 1:
            continue
        lines_in_group = 2
        rap_sentences = join_rap_sentences(local_lines, lines_in_group)
        local_processed_data.extend(rap_sentences)

def get_data_lyrics(substorage, topic_key):
    local_processed_data = []
    for lyrics in substorage.values():
        approx_sentences = get_approx_sentences_from_lyrics(lyrics, local_processed_data)
    local_processed_labels = [map_labels[topic_key]] * len(local_processed_data)
    with open(f'tmp_{topic_key}.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    return local_processed_data, local_processed_labels

processed_data = []
processed_labels = []
for topic_key in map_labels.keys():
    if topic_key == 'joke' or topic_key == 'usagov':
        local_processed_data, local_processed_labels = get_data_rss(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    elif topic_key == 'rap':
        local_processed_data, local_processed_labels = get_data_lyrics(storage_to_process[topic_key], topic_key)
        processed_data.extend(local_processed_data)
        processed_labels.extend(local_processed_labels)
    else:
        raise Exception(f'Unknown topic_key: {topic_key}')

    



In [13]:
df_text_data = pd.DataFrame({'text' : processed_data, 'label' : processed_labels})

In [14]:
df_text_data

Unnamed: 0,text,label
0,"antony j blinken, secretary of state",0
1,"on behalf of the american people, i would like...",0
2,the united states and tanzania enjoy a longsta...,0
3,as tanzania also will soon celebrate years of...,0
4,i send my best wishes to the tanzanian people ...,0
...,...,...
25165,"i arrived in front of the dormitory ""yo, could...",2
25166,they showed me where it was for the moment i d...,2
25167,"so, i came to her room and opened the door oh,...",2
25168,a fella tongue-kissin' my girl in her mouth i ...,2


In [15]:
df_text_data['label'].value_counts()

0    22494
2     1375
1     1301
Name: label, dtype: int64

## Reduce number of samples of the largest class

In [16]:
index_of_usagov = df_text_data[df_text_data['label'] == 0].index
index_of_usagov
remove_n = 18000
drop_indices = np.random.choice(index_of_usagov, remove_n, replace=False)
df_text_data_red = df_text_data.drop(drop_indices).reset_index()

In [17]:
df_text_data_red['label'].value_counts()

0    4494
2    1375
1    1301
Name: label, dtype: int64

# HuggingFace transformers is coming!

## Prepare dataset

In [18]:
hf_dataset = Dataset.from_pandas(df_text_data_red).remove_columns('index')

In [19]:
hf_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 7170
})

In [20]:
hf_dataset[5000]

{'text': "what is will smith's favourite band?", 'label': 1}

In [21]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True).shuffle(seed=42)

  0%|          | 0/650 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
model

In [None]:
tokenized_datasets

In [None]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=50)
metric = load_metric("bleurt")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    compute_metrics=metric,
)

In [None]:
trainer.train()