In [1]:
import requests
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup 

## Crawl data from Goodreads

In [7]:
goodread_df = pd.read_csv(r'D:\Online_Learning\Practical_DL\final_project\books.csv', on_bad_lines='skip')

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_data_for_isbn(isbn):
    return get_reviews_from_isbn(url, isbn)


def get_reviews_from_isbn(url, isbn):
    full_url = url+f"{isbn}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Retry mechanism
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(full_url, headers=headers, allow_redirects=True, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                break
        except requests.RequestException as e:
            print(f"Error fetching data for ISBN {isbn}: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff
        else:
            print(f"Failed to fetch data after {attempt + 1} attempts. Status Code: {response.status_code}")
            return pd.DataFrame()  # Return empty DataFrame if unsuccessful

    # Parsing logic as before
    reviews = soup.find_all('article', class_='ReviewCard')
    all_reviews = []

    for review in reviews:
        review_data = parse_review(review, isbn)  # Modularize parsing into a function
        all_reviews.append(review_data)

    return pd.DataFrame(all_reviews)


def parse_review(review, isbn):
    try:
        reviewer_name = review.find('div', {'data-testid': 'name'}).get_text(strip=True)
        rating_section = review.find('div', class_='ShelfStatus')
        rating = rating_section.find('span', {'role': 'img'}).get('aria-label', '').split()[1]
        comment_section = review.find('div', {'data-testid': 'contentContainer'})
        comment = comment_section.get_text(strip=True) if comment_section else "No comment provided"
    except Exception as e:
        print(f"Error parsing review for ISBN {isbn}: {e}")
        return {'isbn': isbn, 'reviewer': '', 'rating': '', 'comment': ''}
    
    return {
        'isbn': isbn,
        'reviewer': reviewer_name,
        'rating': rating,
        'comment': comment
    }


def fetch_reviews_parallel(isbn_list):
    reviews_df = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # Reduced number of workers to lessen the load
        future_to_isbn = {executor.submit(fetch_data_for_isbn, isbn): isbn for isbn in isbn_list}
        for future in as_completed(future_to_isbn):
            isbn = future_to_isbn[future]
            try:
                data = future.result()
                reviews_df.append(data)
                print(f"Data fetched for ISBN {isbn} with shape {data.shape}")
                time.sleep(1)  # Throttle requests
            except Exception as exc:
                print(f"{isbn} generated an exception: {exc}")
    return reviews_df


In [None]:
url = f"https://www.goodreads.com/book/isbn/"
isbn_list = goodread_df['isbn'].unique()

reviews_dataframes = fetch_reviews_parallel(isbn_list)

In [35]:
test = pd.concat(reviews_dataframes)
test = test[test['reviewer']!=""]
test.to_csv('reviews_goodread.csv', index=False)

In [36]:
test = pd.read_csv(r'D:\Online_Learning\Practical_DL\final_project\reviews_goodread.csv')
test.shape

(273442, 4)

In [37]:
test.head()

Unnamed: 0,isbn,reviewer,rating,comment
0,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:An angsty apprehen...
1,439358078,Navessa,5,"Seriously, don't read this review if you haven..."
2,439358078,Diane ϟ [ Lestrange ],5,Interview with JK Rowling...Stephen Fry:Can we...
3,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:It's a transitiona...
4,439358078,Hannah Azerang,5,I had to re read it. I was in such a nostalgic...


## Crawl news from ABC

In [10]:
from bs4 import BeautifulSoup

# Your HTML content
html_content = '''<div class="article__content" data-editable="content" itemprop="articleBody" data-reorderable="content">
                    <div data-uri="cms.cnn.com/_components/source/instances/clyiud13t002igyqi7yuc9krn@published" class="source inline-placeholder" data-article-gutter="true">
    <cite class="source__cite">
      <span class="source__location" data-editable="location">New York</span>
      <span class="source__text" data-editable="source">CNN</span>
        &nbsp;—&nbsp;
    </cite>
</div>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiud13t002jgyqi08cudl1e@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Stop &amp; Shop is closing 32 underperforming grocery stores across the US northeast as part of the company’s efforts to improve its financial performance.
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiw4coi00303b6kcyy5k40t@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Shoppers are also being squeezed by&nbsp;higher prices at the grocery store, with food prices experiencing modest upticks last month, according to the <a href="https://www.cnn.com/2024/07/11/economy/us-cpi-consumer-inflation-june/index.html">newest inflation report.</a>
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiumq0y00033b6k03plsc3o@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            “Stop &amp; Shop has evaluated its overall store portfolio and made the difficult decision to close underperforming stores to create a healthy base for the future growth of our brand,” said the chain’s president Gordon Reid in a release.
    </p><div data-uri="cms.cnn.com/_components/ad-slot-dynamic/instances/sharethrough@published" class="ad-slot-dynamic ad-slot-dynamic--1" data-placement="{&quot;mobile&quot;:{&quot;position&quot;:5},&quot;desktop&quot;:{&quot;position&quot;:3}}" data-unselectable="true" style="display: none !important;">
        <div data-uri="cms.cnn.com/_components/ad-slot/instances/cnn-v1@published" class="ad-slot" data-path="end/ad-slot-dynamic[0]/items" data-desktop-slot-id="ad_nat_btf_01" data-mobile-slot-id="ad_nat_btf_01" data-unselectable="true" style="display: none !important;"><div id="ad_nat_btf_01" class="ad" style="display: none !important;"></div>
        <div class="ad-slot__feedback ad-feedback-link-container">
            <div class="ad-slot__ad-label"></div>
             
  <div data-ad-type="DISPLAY" data-ad-identifier="ad_nat_btf_01" class="ad-feedback-link">
    <div class="ad-feedback-link__label">Ad Feedback</div>
  </div>
            
        </div>
    </div>

    </div>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiuhtjk00013b6k661hhban@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Ahold Delhaize, the chain’s Dutch parent company, revealed the locations Friday adding that they will close in early November. The closures come amid a broader shake up in the US grocery industry including the rise of low-cost grocery brand Aldi, which plans to open <a href="https://www.cnn.com/2024/03/07/business/aldi-expansion/index.html">open 800 locations.</a> In addition, Kroger and Albertson’s announced plans for a <a href="https://www.cnn.com/2024/02/27/investing/takeaways-supermarket-merger-ftc/index.html">$25 billion merger</a> earlier this year. The Federal Trade Commission has sued to block it.
    </p>

  <div data-uri="cms.cnn.com/_components/related-content/instances/clyiwbti100373b6kdnduinfg@published" class="related-content related-content--article" data-article-gutter="true">
      <a class="related-content__link" href="/2024/03/07/business/aldi-expansion">
            <div class="related-content__image image__related-content">
        <div data-uri="cms.cnn.com/_components/image/instances/clthf5fkw002053qgc1gw35hf@published" class="image image__hide-placeholder image--eq-extra-small" data-image-variation="image" data-name="USATSI_21739161.jpg" data-component-name="image" data-observe-resizes="" data-breakpoints="{&quot;image--eq-extra-small&quot;: 115, &quot;image--eq-small&quot;: 300}" data-original-ratio="0.648" data-original-height="1944" data-original-width="3000" data-url="https://media.cnn.com/api/v1/images/stellar/prod/usatsi-21739161.jpg?c=original" data-editable="settings">
      
    <div class="image__container " data-image-variation="image" data-breakpoints="{&quot;image--eq-extra-small&quot;: 115, &quot;image--eq-small&quot;: 300, &quot;image--show-credits&quot;: 525}">
       <picture class="image__picture"><img src="https://media.cnn.com/api/v1/images/stellar/prod/usatsi-21739161.jpg?c=16x9&amp;q=h_144,w_256,c_fill" alt="Aldi in Loxahatchee Groves, Fla." class="image__dam-img" onload="this.classList.remove('image__dam-img--loading')" onerror="imageLoadError(this)" height="1944" width="3000" loading="lazy"></picture>
    </div>
    
    
      <div class="image__metadata">
        <div itemprop="caption" class="image__caption attribution">
  
  <span data-editable="metaCaption" class="inline-placeholder">Aldi in Loxahatchee Groves, Fla.</span>
  
</div>
        <figcaption class="image__credit">Lannis Waters/USA Today Network</figcaption>
      </div>
    
</div>

    </div>
    <p class="related-content__headline">
      
      <span class="related-content__title-text" data-editable="content.title">Related article</span>
      <span class="related-content__headline-text" data-editable="content.headline">Aldi plans to open 800 new locations in the US</span>
    </p>
      </a>
</div>


    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiupl3z00053b6kty38kapv@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Ahold Delhaize announced in May at its investor day that store reductions were planned, but didn’t specify locations. The affected stores span all five states it has locations in, including 10 in New Jersey, eight in Massachusetts, seven in New York, five in Connecticut plus two locations in Rhode Island.
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiux5t400073b6kkxw4hkgw@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Stop &amp; Shop said it “remains committed to serving its communities through other store locations, online shopping and home delivery services.” Employees impacted by the closures will be offered other opportunities in the company.
    </p><div data-uri="cms.cnn.com/_components/ad-slot-dynamic/instances/outstream@published" class="ad-slot-dynamic ad-slot-dynamic--1" data-placement="{&quot;mobile&quot;:{&quot;position&quot;:7},&quot;desktop&quot;:{&quot;position&quot;:6}}" data-unselectable="true" style="display: none !important;">
        <div class="ad-slot-dynamic__close"></div>
        <div data-uri="cms.cnn.com/_components/ad-slot/instances/cnn-v1@published" class="ad-slot" data-path="end/ad-slot-dynamic[1]/items" data-desktop-slot-id="ad_out_vid_01" data-mobile-slot-id="ad_out_vid_01" data-unselectable="true" style="display: none !important;"><div id="ad_out_vid_01" class="ad" style="display: none !important;"></div>
        <div class="ad-slot__feedback ad-feedback-link-container">
            <div class="ad-slot__ad-label"></div>
             
  <div data-ad-type="DISPLAY" data-ad-identifier="ad_out_vid_01" class="ad-feedback-link">
    <div class="ad-feedback-link__label">Ad Feedback</div>
  </div>
            
        </div>
    </div>

    </div>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyiv97hm00093b6kekjcgodd@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Following the closures, the 110-year-old chain said it will “continue to have a strong presence across its five-state footprint with more than 350 stores.” Stop &amp; Shop has remodeled about half of its locations since 2018, with those refreshed stores “outperforming” the ones that haven’t been updated.
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivmyc8001d3b6kj4h74cf5@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            In addition to operating Stop &amp; Shop, Ahold Delhaize owns Food Lion and Giant grocery stores in the US.
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivex2e000e3b6kacavyu5o@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            Here’s where the locations are closing:
    </p>

  <h2 class="subheader" data-editable="text" data-uri="cms.cnn.com/_components/subheader/instances/clyivfhl2000l3b6k6tmly0lf@published" data-component-name="subheader" id="connecticut" data-article-gutter="true">
        Connecticut
</h2>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivf6rf000i3b6kiijmtvfq@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            100 Division St., Ansonia
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivfv9k000n3b6kflwt2ed2@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            72 Newtown Road, Danbury
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivfwfs000p3b6k4jvmzy7n@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            855 Bridgeport Ave., Milford
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivgab5000r3b6kc7zsrexo@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            1937 West Main St., Stamford
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivgk9i000t3b6kqdxrb58b@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            211 High St., Torrington
    </p>

  <h2 class="subheader" data-editable="text" data-uri="cms.cnn.com/_components/subheader/instances/clyivh0jo000x3b6ke0vml5i8@published" data-component-name="subheader" id="massachusetts" data-article-gutter="true">
        Massachusetts
</h2>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivgolc000v3b6kt1hr6dbl@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            932 North Montello St., Brockton
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivj7tx000z3b6ko8pk9dx6@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            341 Plymouth St., Halifax
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivjoin00113b6kkt5wu7ng@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            165 Needham St., Newton
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivlabz00133b6k9inbmz66@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            24 Mattakeesett St., Pembroke
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivlk0v00153b6k0vf4lx6k@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            36 New State Highway, Raynham
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivlwa400173b6kem6ooort@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            539-571 Boston Turnpike, Shrewsbury
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivm5tv00193b6k4fdy6ttf@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            415 Cooley St., Springfield
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivmkg8001b3b6k6qfxmhq4@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            545 Lincoln St. Worcester
    </p>

  <h2 class="subheader" data-editable="text" data-uri="cms.cnn.com/_components/subheader/instances/clyivplaa001k3b6krggrmhwb@published" data-component-name="subheader" id="new-jersey" data-article-gutter="true">
        New Jersey
</h2>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivpfwp001i3b6k4ne7w4n0@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            625 Paterson Ave., Carlstadt
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivpygn001m3b6ke6qmyjtv@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            1083 Inman Ave., Edison
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivq1li001o3b6kkyt78ze9@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            1049 US Highway 1 South, Edison
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivq8ec001q3b6kxusnln3g@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            1221 State Route 27, Franklin Township
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivqfby001s3b6k9jlno8g9@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            4861 US Highway 9, Howell
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivqnvy001u3b6ke1d3oefo@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            2275 West County Line Rd., Jackson
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivqtbt001w3b6kuvgzp5e3@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            1278 US Highway 22, Phillipsburg
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivr4aa001y3b6k0snlm1om@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            581 Stelton Rd., Piscataway
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivrcvw00203b6kd5dglux2@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            505 Richmond Ave, Point Pleasant Beach
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivrk6z00223b6k7gept1pb@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            130 Skyline Dr., Ringwood
    </p>

  <h2 class="subheader" data-editable="text" data-uri="cms.cnn.com/_components/subheader/instances/clyivsyee002c3b6ksw8tt70u@published" data-component-name="subheader" id="new-york" data-article-gutter="true">
        New York
</h2>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivrqms00243b6khrtkbsnk@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            2965 Cropsey Ave., Brooklyn
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivt1sm002e3b6koba38nn4@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            294 Middle Country Road, Coram
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivu26x002g3b6kpy22skkc@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            2525 Hempstead Turnpike, East Meadow
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivu995002i3b6kqwg55j5v@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            130 Wheatley Plaza, Greenvale
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivufx9002k3b6kyovq99pb@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            132 Fulton Ave., Hempstead
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivul6w002m3b6klx6pks15@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            240 East Sanford Blvd., Mt. Vernon
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivuso8002o3b6kaa3zopzg@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            7 Samsondale Plaza, West Haverstraw
    </p>

  <h2 class="subheader" data-editable="text" data-uri="cms.cnn.com/_components/subheader/instances/clyivw8uq002u3b6kbo9wbgas@published" data-component-name="subheader" id="rhode-island" data-article-gutter="true">
        Rhode Island
</h2>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivvvxp002q3b6kfnj9cn8p@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            11 Commerce Way, Johnston
    </p>

    <p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-uri="cms.cnn.com/_components/paragraph/instances/clyivvzp5002s3b6k7qmned8l@published" data-editable="text" data-component-name="paragraph" data-article-gutter="true">
            176 Pittman St., Providence (Eastside Marketplace)
    </p>

                </div>
'''

# Use Beautiful Soup to parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Find all paragraph tags with the specified class
paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph-primary-core-light')

# Extract and print the text from each paragraph
for paragraph in paragraphs:
    print(paragraph.get_text(strip=True))


Stop & Shop is closing 32 underperforming grocery stores across the US northeast as part of the company’s efforts to improve its financial performance.
Shoppers are also being squeezed by higher prices at the grocery store, with food prices experiencing modest upticks last month, according to thenewest inflation report.
“Stop & Shop has evaluated its overall store portfolio and made the difficult decision to close underperforming stores to create a healthy base for the future growth of our brand,” said the chain’s president Gordon Reid in a release.
Ahold Delhaize, the chain’s Dutch parent company, revealed the locations Friday adding that they will close in early November. The closures come amid a broader shake up in the US grocery industry including the rise of low-cost grocery brand Aldi, which plans to openopen 800 locations.In addition, Kroger and Albertson’s announced plans for a$25 billion mergerearlier this year. The Federal Trade Commission has sued to block it.
Ahold Delhaize

In [85]:
# Provided HTML snippet
# html_snippet = '''
# <a href="/2024/07/11/business/uaw-biden-fain/index.html" class="container__link container__link--type-article container_lead-plus-headlines__link" data-link-type="article" data-zjs="click" data-zjs-cms_id="cms.cnn.com/_pages/cl9iplp6y00002vnyxpeejbzw@published" data-zjs-canonical_url="https://www.cnn.com/business" data-zjs-zone_id="cms.cnn.com/_components/zone/instances/cl9iplp9y00142vny1uj75le3@published" data-zjs-zone_name="undefined" data-zjs-zone_type="zone_layout--wide-left-balanced-2" data-zjs-zone_position_number="1" data-zjs-zone_total_number="9" data-zjs-container_id="cms.cnn.com/_components/container/instances/clh6oomd2001l3b6dj5xigaia@published" data-zjs-container_name="undefined" data-zjs-container_type="container_lead-plus-headlines" data-zjs-container_position_number="2" data-zjs-container_total_number="2" data-zjs-card_id="cms.cnn.com/_components/card/instances/clh6oomd2001l3b6dj5xigaia_fill_3@published" data-zjs-card_name="UAW chief and other board members have concerns about Biden’s ability to beat Trump, source says" data-zjs-card_type="card" data-zjs-card_position_number="3" data-zjs-card_total_number="14">
#     <div class="container__text container_lead-plus-headlines__text">
#         <div class="container__headline container_lead-plus-headlines__headline">
#             <span class="container__headline-text" data-editable="headline">UAW chief and other board members have concerns about Biden’s ability to beat Trump, source says</span>
#         </div>
#     </div>
# </a>
# '''

# # Use BeautifulSoup to parse the HTML
# soup = BeautifulSoup(html_snippet, 'html.parser')

# # Find the <a> tag
# a_tag = soup.find('a')

# # Extract the href attribute
# href = a_tag['href']
# print("Extracted href:", href)

full_url = "https://edition.cnn.com/business"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

def get_links_from_cnn(full_url, headers, num_limit):
    response = requests.get(full_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for item in soup.find_all('a', class_='container__link', limit=num_limit):
        links.append(item['href'])

    return ["https://edition.cnn.com" + i for i in links]


def get_texts_from_links(links, headers):
    txt_lists = []
    for url in links:
        print(url)
        txt_tmp = []
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all paragraph tags with the specified class
        paragraphs = soup.find_all('p', class_='paragraph inline-placeholder vossi-paragraph-primary-core-light')

        # Extract and print the text from each paragraph
        for paragraph in paragraphs:
            txt_tmp.append(paragraph.get_text(strip=True))
        
        txt_tmp = ' '.join(txt_tmp)
        txt_lists.append(txt_tmp)

    return txt_lists

In [7]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed


def get_text_from_link(url, headers):
    print(f"Fetching from: {url}")
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text from paragraphs
    paragraphs = soup.find_all('p')
    return ' '.join(paragraph.get_text(strip=True) for paragraph in paragraphs)


def get_links_from_sgtimes(category_sgt, headers, num_limit):
    full_url = "https://thesaigontimes.vn/" + category_sgt
    
    response = requests.get(full_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = soup.find_all('h3', class_='entry-title td-module-title', limit=num_limit)
    href_link = []
    for link in links:
        try:
            href_link.append(link.find('a')['href'])
            time.sleep(1)
        except:
            print("Not Found")

    return href_link

def get_links_from_cnn(category, headers, num_limit):
    full_url = "https://edition.cnn.com" + category
    response = requests.get(full_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for item in soup.find_all('a', class_='container__link', limit=num_limit):
        links.append(item['href'])
        time.sleep(1)

    return [full_url + i for i in links]


def get_texts_from_links_parallel(links, headers):
    # Use ThreadPoolExecutor to run get_text_from_link in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_text_from_link, url, headers): url for url in links}
        results = []
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                results.append(result)
                print(f"Completed fetching from: {url}")
                time.sleep(1)
            except Exception as exc:
                print(f"{url} generated an exception: {exc}")
        return results


def run_parallel_processing(links, headers):
    # Get link and texts
    texts = get_texts_from_links_parallel(links, headers)  # This processes links in parallel
    
    # Convert to dataframe
    df = pd.DataFrame.from_dict(dict(zip(links,texts)), orient='index').reset_index()
    df.columns = ['url','txt']
    
    return df


# TEST WITH CNN
category = 'business' # ['business','economy','investing','tech']
category_sgt = 'tai-chinh-ngan-hang' # ['tai-chinh-ngan-hang','kinh-doanh']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
num_limit = 10
links_cnn = get_links_from_cnn(category, headers, num_limit)
links_sgt = get_links_from_sgtimes(category_sgt, headers, num_limit)
df_txt = run_parallel_processing(links_cnn+links_sgt, headers)

ConnectionError: HTTPSConnectionPool(host='edition.cnn.combusiness', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000285BFF0CF40>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [3]:
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
full_url = "https://www.cnn.com" + 'tech'
response = requests.get(full_url, headers=headers, timeout=10)

ConnectionError: HTTPSConnectionPool(host='www.cnn.comtech', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001AF2DBCA7F0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.robotparser import RobotFileParser
import random
import logging

# Define a custom logging handler to capture log records in a list
class ListHandler(logging.Handler):
    def __init__(self):
        super().__init__()
        self.log_records = []

    def emit(self, record):
        self.log_records.append(record)

# Set up basic logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create an instance of the custom handler and add it to the root logger
list_handler = ListHandler()
logging.getLogger().addHandler(list_handler)

# User-agent rotation
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0',
]

def check_robots_txt(url):
    rp = RobotFileParser()
    rp.set_url(url + "/robots.txt")
    rp.read()
    return rp

def get_links_from_sgtimes(category_sgt, headers, num_limit, rp):
    full_url = "https://thesaigontimes.vn/" + category_sgt

    if not rp.can_fetch(headers['User-Agent'], full_url):
        logging.warning(f"Fetching from {full_url} is disallowed by robots.txt")
        return []

    response = requests.get(full_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = soup.find_all('h3', class_='entry-title td-module-title', limit=num_limit)
    href_link = []
    for link in links:
        try:
            href_link.append(link.find('a')['href'])
            time.sleep(1)
        except:
            logging.error("Not Found")

    return list(set(href_link))

def get_links_from_cnn(category, headers, num_limit, rp):
    full_url = "https://edition.cnn.com" + category

    if not rp.can_fetch(headers['User-Agent'], full_url):
        logging.warning(f"Fetching from {full_url} is disallowed by robots.txt")
        print(f"Fetching from {full_url} is disallowed by robots.txt")
        return []

    response = requests.get(full_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    links = set()
    page_number = 1

    while len(links) < num_limit:
        response = requests.get(full_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        for item in soup.find_all('a', class_='container__link'):
            links.add("https://edition.cnn.com" + item['href'])
            if len(links) >= num_limit:
                break

        page_number += 1
        time.sleep(1)  # Throttle requests

    return list(links)[:num_limit]

def get_text_from_link(url, headers, failed_links):
    logging.info(f"Fetching from: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            logging.error(f"Failed to fetch data from {url}, status code: {response.status_code}")
            failed_links.append(url)
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        return ' '.join(paragraph.get_text(strip=True) for paragraph in paragraphs)
    except Exception as e:
        logging.error(f"Exception occurred while fetching from {url}: {e}")
        failed_links.append(url)
        return ""

def get_texts_from_links_parallel(links, headers):
    failed_links = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_text_from_link, url, headers, failed_links): url for url in links}
        results = []
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                results.append(result)
                logging.info(f"Completed fetching from: {url}")
                time.sleep(random.uniform(1, 3))  # Random delay to throttle requests
            except Exception as exc:
                logging.error(f"{url} generated an exception: {exc}")
                failed_links.append(url)
        return results, failed_links

def run_parallel_processing(links, headers):
    texts, failed_links = get_texts_from_links_parallel(links, headers)
    df = pd.DataFrame.from_dict(dict(zip(links, texts)), orient='index').reset_index()
    df.columns = ['url', 'txt']
    return df, failed_links

if __name__ == "__main__":
    category = '/business'  # Example category for CNN
    category_sgt = 'tai-chinh-ngan-hang'  # Example category for Saigon Times

    headers = {
        'User-Agent': random.choice(user_agents)
    }
    num_limit = 10

    # Check robots.txt
    cnn_rp = check_robots_txt("https://edition.cnn.com")
    sgt_rp = check_robots_txt("https://thesaigontimes.vn")

    links_cnn = get_links_from_cnn(category, headers, num_limit, cnn_rp)
    links_sgt = get_links_from_sgtimes(category_sgt, headers, num_limit, sgt_rp)

    df_txt, failed_links = run_parallel_processing(links_sgt+links_cnn, headers)
    df_txt.to_csv('scraped_texts.csv', index=False)

    # Log failed links
    if failed_links:
        logging.info(f"Failed to fetch texts from the following links: {failed_links}")
    else:
        logging.info("Successfully fetched texts from all links.")

    # Retrieve error log content
    error_logs = [record.getMessage() for record in list_handler.log_records if record.levelno == logging.ERROR]
    print("Error Logs:\n", "\n".join(error_logs))


2024-07-28 13:24:02,028 - INFO - Fetching from: https://thesaigontimes.vn/gia-do-la-my-giam-trong-sang-26-7/
2024-07-28 13:24:02,034 - INFO - Fetching from: https://thesaigontimes.vn/mo-loi-huy-dong-von-cho-doanh-nghiep-nho-va-vua/
2024-07-28 13:24:02,038 - INFO - Fetching from: https://thesaigontimes.vn/de-doanh-nghiep-khong-bo-ngo-truoc-doi-hoi-nang-hang-thi-truong/
2024-07-28 13:24:02,041 - INFO - Fetching from: https://thesaigontimes.vn/co-phieu-hbc-va-hng-bi-huy-niem-yet-bat-buoc/
2024-07-28 13:24:02,045 - INFO - Fetching from: https://thesaigontimes.vn/cho-vay-tra-no-ngan-hang-khac-canh-tranh-ngay-cang-quyet-liet/
2024-07-28 13:24:02,048 - INFO - Fetching from: https://thesaigontimes.vn/vi-sao-viec-giu-tran-lai-suat-do-la-my-0-lai-quan-trong/
2024-07-28 13:24:02,049 - INFO - Fetching from: https://edition.cnn.com/2024/07/27/business/apple-union-labor-agreement/index.html
2024-07-28 13:24:02,053 - INFO - Fetching from: https://edition.cnn.com/2024/07/27/tech/tiktok-response-to-us-

Error Logs:
 


In [2]:
test = pd.read_csv('scraped_texts.csv')

In [9]:
test['txt'][7]

'Markets Hot Stocks Fear & Greed Index Latest Market News Hot Stocks Apple and the union representing retail workers at its store in Towson, Maryland, agreed to a tentative labor deal late Friday in the first US labor agreement not only for an Apple store but for any US workers of the tech giant. Workers at the Apple store in Towson hadvoted to join the International Association of Machinists unionin June 2022 and have since been seeking their first contract. In May, theyvoted to authorize a strikewithout providing a deadline. The labor deal, which needs to be ratified by a vote of the 85 rank-and-file members at the store before it can take effect, is a significant milestone. Other high-profile union organizing efforts, such as those atStarbucksandAmazon, have yet to produce deals for those workers, even though workers at those companies voted to join unions well before the workers at the Apple store in Maryland. There are not many legal requirements to force a company to reach a labo