In [74]:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor
from pydispatch import dispatcher
import logging
import json
import re
import os
import pandas as pd
import gc

from datasets import load_dataset
from datasets.arrow_dataset import Dataset

from bs4 import BeautifulSoup
from lyricsgenius import Genius
import pickle

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('nlp')
logger.setLevel(logging.INFO)

[nltk_data] Downloading package punkt to /home/masdevas/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
dataset = load_dataset("yelp_review_full")



  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset['train']

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})

In [4]:
df1 = pd.DataFrame({"a": [1, 2, 3]})
df1_ = Dataset.from_pandas(df1)

In [5]:
df1_

Dataset({
    features: ['a'],
    num_rows: 3
})

# Fetch sentences

In [6]:
def get_web_data(function):
    def f(q):
        try:
            json_data = function()
            q.put(json.dumps(json_data))
        except Exception as e:
            q.put(e)
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    gc.collect()
    try:
        json_data = json.loads(result)
    except:
        raise result
    return json_data

## USA government RSS channels parsing

In [7]:
def get_usagov_rss_data():
    storage = {}
    try:
        class UsaGovSpider(scrapy.Spider):
            name = "usagov_checker"
            start_urls = ['https://www.state.gov/rss-feeds/']
            
            def parse(self, response, depth=0):
                if depth == 0:
                    a_selectors = response.xpath("//a")
                    for selector in a_selectors:
                        link = selector.xpath("@href").extract_first()
                        if 'rss' in link:
                            yield response.follow(link, self.parse, cb_kwargs={'depth' : 1})
                else:
                    storage[str(response.url)] = response.body.decode("utf-8")

        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(UsaGovSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Joke RSS channels parsing

In [8]:
def get_joke_rss_data():
    storage = {}
    try:
        class JokeSpider(scrapy.Spider):
            name = "joke_checker"
            #start_urls = ['https://blog.feedspot.com/jokes_rss_feeds/'] # 403 error - bot detected, access denied
            start_urls = [
                'http://www.jokesoftheday.net/jokes-feed/', 
                'http://www.funnyshortjokes.com/feed',
                'https://laffgaff.com/feed/',
                'https://www.keeplaughingforever.com/blog//blog-feed.xml',
                'https://lite92.ca/category/joke-of-the-day/feed/',
                'https://www.thelaughline.com/feed/',
                'https://www.jokesbykids.com/riddle/feed/',
                'https://newbloggycat.com/category/good-clean-jokes/feed/',
                'https://www.super-funny.com/feed/',
                'http://slay.me/feed',
                'https://www.funny-jokes-quotes-sayings.com/funny-jokes.xml',
                'http://modest-jokes.blogspot.com/feeds/posts/default?alt=rss',
                'https://badkidsjokes.tumblr.com/rss',
                'http://chillyjokes.com/chillyjokes/jokes/feed/',
                'https://laughbreak.com/pictures/its-okay-to-feed-the-ducks-bread-now/',
                'https://acornyjokeaday.tumblr.com/rss',
                'https://somejokeshere.blogspot.com/feeds/posts/default?alt=rss'
            ]

            def parse(self, response, depth=0):
                storage[str(response.url)] = response.body.decode("utf-8")
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(JokeSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

## Rap texts parsing

In [9]:
def read_genius_token():
    with open('genius_token.txt', 'r') as f:
        token = f.readline().strip()
    return token

genius = Genius(read_genius_token())

In [10]:
def get_rap_data():
    storage = {}
    try:
        class RapSpider(scrapy.Spider):
            name = "rap_checker"
            start_urls = [
                'https://bestlifeonline.com/funniest-rap-lyrics/'
            ]

            def parse(self, response, depth=0):
                resp = response.xpath("//div[@class='content noskimwords']").xpath("//h2/text()")
                for item in resp:
                    match = re.match("[0-9]+\. ([A-Za-z0-9\s\-\'\.]+), \"([A-Za-z0-9\s\-\'\.]+)\"", item.get().strip())
                    if match:
                        try:
                            author = match.group(1)
                            song = match.group(2)
                            name = author+'|'+song
                            print((author, song))
                            song = genius.search_song(song, author)
    #                         print(song.lyrics)
                            print(len(song.lyrics))
                            storage[name] = song.lyrics
                        except Exception as e:
                            print(f'!!! Exception passed on {name}', str(e))
                
        runner = crawler.CrawlerRunner()
        deferred = runner.crawl(RapSpider)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        return storage
    except Exception as e:
        return {'exception' : str(e)}

In [23]:
storage_file = 'storage.pkl'
def is_rawdata_exist(storage_file):
    return os.path.exists(storage_file)
    # return False

if is_rawdata_exist(storage_file):
    print('LOADING')
    with open(storage_file, 'rb') as f:
        storage = pickle.load(f)
else:
    print('FROM WEB')
    storage = {}  
    storage['usagov'] = get_web_data(get_usagov_rss_data)
    storage['joke'] = get_web_data(get_joke_rss_data)
    storage['rap'] = get_web_data(get_rap_data)
    with open(storage_file, 'wb') as f:
        pickle.dump(storage, f)


LOADING


In [24]:
len(storage['joke'].keys())

17

In [25]:
storage.keys()

dict_keys(['usagov', 'joke', 'rap'])

In [26]:
len(storage['usagov'].keys())

27

In [27]:
list(storage['usagov'].values())[0]

'<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"\n\txmlns:content="http://purl.org/rss/1.0/modules/content/"\n\txmlns:wfw="http://wellformedweb.org/CommentAPI/"\n\txmlns:dc="http://purl.org/dc/elements/1.1/"\n\txmlns:atom="http://www.w3.org/2005/Atom"\n\txmlns:sy="http://purl.org/rss/1.0/modules/syndication/"\n\txmlns:slash="http://purl.org/rss/1.0/modules/slash/"\n\t>\n\n<channel>\n\t<title>Africa &#8211; United States Department of State</title>\n\t<atom:link href="https://www.state.gov/rss-feed/africa/feed/" rel="self" type="application/rss+xml" />\n\t<link>https://www.state.gov</link>\n\t<description></description>\n\t<lastBuildDate>Mon, 25 Apr 2022 15:01:24 +0000</lastBuildDate>\n\t<language>en-US</language>\n\t<sy:updatePeriod>\n\thourly\t</sy:updatePeriod>\n\t<sy:updateFrequency>\n\t1\t</sy:updateFrequency>\n\t<generator>https://wordpress.org/?v=5.9.2</generator>\n\n<image>\n\t<url>https://www.state.gov/wp-content/uploads/2022/04/cropped-dos_seal-32x32.png</url>\n\t<tit

In [28]:
len(storage['joke'].keys())

17

In [29]:
list(storage['joke'].values())[0]

'<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"\n\txmlns:content="http://purl.org/rss/1.0/modules/content/"\n\txmlns:wfw="http://wellformedweb.org/CommentAPI/"\n\txmlns:dc="http://purl.org/dc/elements/1.1/"\n\txmlns:atom="http://www.w3.org/2005/Atom"\n\txmlns:sy="http://purl.org/rss/1.0/modules/syndication/"\n\txmlns:slash="http://purl.org/rss/1.0/modules/slash/"\n\t xmlns:media="http://search.yahoo.com/mrss/" >\n\n<channel>\n\t<title>LaffGaff</title>\n\t<atom:link href="https://laffgaff.com/feed/" rel="self" type="application/rss+xml" />\n\t<link>https://laffgaff.com</link>\n\t<description>Really funny short jokes, puns, trivia, quotes and more!</description>\n\t<lastBuildDate>Thu, 14 Apr 2022 12:41:33 +0000</lastBuildDate>\n\t<language>en-US</language>\n\t<sy:updatePeriod>\n\thourly\t</sy:updatePeriod>\n\t<sy:updateFrequency>\n\t1\t</sy:updateFrequency>\n\t<generator>https://wordpress.org/?v=5.9.3</generator>\n\n<image>\n\t<url>https://laffgaff.com/wp-content/uploads/2018/0

In [30]:
len(storage['rap'].keys())

30

In [31]:
list(storage['rap'].values())[0]

"The World Lyrics[Verse 1: Casey Veggies]\nNice denim, nice women\nYeah she know that I’m quite winnin\nThis gon come at the right time, I’m ridin' round and I’m tryna get it\nFirst whip was a Audi coupe\nDrive that shit like a Honda Civic\nOne life, I’m tryna live it\nThat Hennessy is in my liver\nUsually I don’t drink but when I do it’s so exquisite\nI would probably be a bigger rapper if I would do, something different\nBut it’s no hype, it’s on sight\nI’d rather rap about my life\nNo judging, girl do yo thing\nWhy you worried bout what I think?\nI be rockin' them ripped up jeans\nI be movin' round with my team\nIt’s good to have different options, got different kicks from different shopping\nBound to get it, that ain’t no option\nWe gotta do it, put it all in music\nPeas and Carrots, yea we gon stay\nThat’s my brand like Frito Lay\n[Verse 2]\nWorried bout all the places we go, vacation in Rio\nI be in my speedos, everything wavy\nLike a torpedo, baby girl date me\nThen she gotta le

# Preprocess sentences for HuggingFace

In [75]:
map_labels = {'usagov' : 0, 'joke' : 0, 'rap' : 0}

def remove_html_tags(text):
    html_detector = re.compile(r'<.*?>')
    return html_detector.sub(r'', text)

def remove_links(text):
    links_remover = re.compile(r'https?://\S+|www\.\S+')
    return links_remover.sub(r'', text)

def preprocess_html(body, local_processed_data):
#     l = 0
    soup = BeautifulSoup(body)
    p_tags = soup.find_all('p')
    for each in p_tags:
        processed = sent_tokenize(each.text.encode('ascii', 'ignore').decode('ascii').lower().strip())
        for processed_item in processed:
            if len(processed_item) > 0:
                local_processed_data.append(processed_item)
#             l += len(processed)
#     print(l)

def get_data_usagov(substorage, label):
    local_processed_data = []
    for link, body in substorage.items():
        preprocess_html(body, local_processed_data)
    labels = [label] * len(local_processed_data)
    with open('tmp.json', 'w') as f:
        json.dump({'section' : local_processed_data}, f, indent=4, sort_keys=True)
    
processed_data = []
for topic_key in map_labels.keys():
    if topic_key == 'usagov':
        processed_data.extend(get_data_usagov(storage[topic_key]))
    

https://www.state.gov/rss-feed/africa/feed/
https://www.state.gov/rss-feed/treaties-new/feed/
https://www.state.gov/rss-feed/south-and-central-asia/feed/
https://www.state.gov/rss-feed/public-schedule/feed/
https://www.state.gov/rss-feed/western-hemisphere/feed/
https://www.state.gov/rss-feed/population-refugees-and-migration/feed/
https://www.state.gov/rss-feed/womens-issues/feed/
https://www.state.gov/rss-feed/near-east/feed/
https://www.state.gov/rss-feed/law-enforcement-narcotics-anti-corruption/feed/
https://www.state.gov/rss-feed/international-organizations/feed/
https://www.state.gov/rss-feed/trafficking-in-persons/feed/
https://www.state.gov/rss-feed/international-health-issues/feed/
https://www.state.gov/rss-feed/east-asia-and-the-pacific/feed/
https://www.state.gov/rss-feed/economic-energy-agricultural-and-trade-issues/feed/
https://www.state.gov/rss-feed/direct-line-to-american-business/feed/
https://www.state.gov/rss-feed/press-releases/feed/
https://www.state.gov/rss-feed/

TypeError: 'NoneType' object is not iterable