In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor

# Function to scrape a single page
def scrape_page(page, timeout=10):
    """Collect article links from one listing page.

    Requests ``f"{base_url}{page}/"`` using the module-level ``base_url`` and
    ``headers``, then gathers the ``href`` of every ``<a>`` inside the
    ``<section class="article-lists">`` element. Links nested in a
    ``<span class="author">`` and links carrying the ``page-link`` class are
    skipped, as are anchors without an ``href``.

    Args:
        page: Page number appended to ``base_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker thread, and
            with it the whole batch loop, indefinitely.

    Returns:
        A list of ``{"page": page, "url": href}`` dicts in document order.
        The list is empty when the section is missing or the request fails
        (failures are printed, not raised).
    """
    url = f"{base_url}{page}/"
    print(f"Fetching: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    section = soup.find("section", class_="article-lists")
    if section is None:
        return []

    page_data = []
    for link in section.find_all("a"):
        if link.find_parent("span", class_="author") or "page-link" in link.get("class", []):
            continue  # Skip this link

        href = link.get("href")
        if href:
            page_data.append({"page": page, "url": href})
    return page_data

# Function to save data incrementally
def save_data(data, file_name="portfolio_links.csv"):
    """Write *data* to *file_name* as a pipe-separated CSV, replacing any old file.

    The CSV is first written to ``<file_name>.tmp`` and then moved over the
    target with ``os.replace``. The progress file is rewritten after every
    batch, so writing it in place would leave a truncated file (and lose all
    earlier progress) if the run were interrupted mid-write.

    Args:
        data: Records accepted by ``pandas.DataFrame`` (here: a list of dicts).
        file_name: Path of the CSV file to (over)write.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
    """
    import os  # local import: the surrounding cell does not import os

    df = pd.DataFrame(data)
    temp_file = f"{file_name}.tmp"
    try:
        df.to_csv(temp_file, sep="|", index=False)  # Using '|' as the separator
        os.replace(temp_file, file_name)
    finally:
        # After a successful replace the temp file no longer exists; if
        # anything failed, remove the partial file instead of leaving it.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    print(f"Saved {len(data)} records to {file_name}")

# Scraper configuration
base_url = "https://www.portfolio.hu/gazdasag?page="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
output_file = "portfolio_links.csv"

# Resume support: reuse whatever an earlier run already wrote to the output file
try:
    existing_df = pd.read_csv(output_file, sep="|")  # '|' matches the separator used when saving
except FileNotFoundError:
    processed_pages = set()
    data = []
else:
    processed_pages = set(existing_df["page"])
    data = existing_df.to_dict(orient="records")

# Walk the page range batch by batch; end_page itself is excluded by range()
start_page = 2
end_page = 2900
batch_size = 1  # pages fetched in parallel per batch (also the pool size)
with ThreadPoolExecutor(max_workers=batch_size) as executor:
    for batch_start in range(start_page, end_page, batch_size):
        batch_end = min(batch_start + batch_size, end_page)
        pending_pages = [p for p in range(batch_start, batch_end) if p not in processed_pages]

        if pending_pages:
            # Submit the whole batch first, then collect results in page order
            jobs = [executor.submit(scrape_page, page) for page in pending_pages]
            for job in jobs:
                data.extend(job.result())

            # Persist everything gathered so far, then pause before the next batch
            save_data(data, output_file)
            time.sleep(5)


In [None]:
import pandas as pd

# Load the pipe-separated link list (portfolio_links.csv) into a DataFrame.
df = pd.read_csv("portfolio_links.csv", sep="|")

In [None]:
# Bare expression: shows the DataFrame as the notebook cell's output.
df

In [None]:
# Remove rows whose 'url' value repeats an earlier row (pandas keeps the first by default).
df = df.drop_duplicates(subset=['url'])

In [None]:
# Renumber the rows from 0 and discard the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [None]:
# Keep only rows whose URL contains the substring 'portfolio' (case-insensitive match).
df = df[df['url'].str.contains('portfolio', case=False)]

In [None]:
# Overwrite portfolio_links.csv with the cleaned table: pipe-separated, no index column.
df.to_csv("portfolio_links.csv", sep="|", index=False)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

# Function to fetch content from a URL
def fetch_content(record):
    """Download one article and extract its title, text and date.

    The page is requested as ``record['url'] + '?amp'`` with a 10-second
    timeout. The title is the text of the first ``<h1>``; the content is the
    text of ``<section class="pfarticle-section-lead">`` followed by the text
    of ``<div class="pfarticle-section-content">`` (whichever of the two
    exist). The date is parsed from the 8 characters that follow
    ``gazdasag/`` in the URL and formatted as ``YYYY/MM/DD``.

    Args:
        record: Mapping with a ``'url'`` key.

    Returns:
        ``{"url", "title", "content", "date"}`` on success, where missing
        parts are replaced by the strings ``"Title not found"``,
        ``"Content not found"`` and ``"Date not found"``; or
        ``{"url", "error"}`` when the request fails.
    """
    url = record['url']
    print(f"Fetching content from: {url}")
    try:
        response = requests.get(url + '?amp', timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Title: text of the first <h1>, if the page has one
        heading = soup.find("h1")
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = "Title not found"

        # Body text: lead section first, then the main content div
        body = soup.find("div", class_="pfarticle-section-content")
        lead = soup.find("section", class_="pfarticle-section-lead")
        pieces = [
            node.get_text(separator=" ", strip=True)
            for node in (lead, body)
            if node
        ]
        content = " ".join(pieces) if pieces else "Content not found"

        # Date: the YYYYMMDD path segment right after 'gazdasag/'
        try:
            stamp = url.split("gazdasag/")[1][:8]
            date = datetime.strptime(stamp, "%Y%m%d").strftime("%Y/%m/%d")
        except (IndexError, ValueError):
            date = "Date not found"

        return {"url": url, "title": title, "content": content, "date": date}

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {"url": url, "error": str(e)}

import os

def save_fetched_data(data, file_name="portfolio_articles.csv"):
    """Store *data* in *file_name* as a pipe-separated CSV via a temporary file.

    The CSV is written to ``file_name + ".tmp"`` and then moved onto
    *file_name* with ``os.replace``. Any exception raised along the way is
    not propagated: the failure message is printed and appended, with a
    timestamp, to ``error_log.txt``.

    Args:
        data: Records accepted by ``pandas.DataFrame``.
        file_name: Path of the CSV file to (over)write.
    """
    try:
        frame = pd.DataFrame(data)
        staging_path = file_name + ".tmp"  # written first so the real file is swapped in one step
        frame.to_csv(staging_path, sep="|", index=False)

        # Sanity check before swapping the staged file into place
        if not os.path.exists(staging_path):
            raise IOError("Temporary file was not created.")

        os.replace(staging_path, file_name)
        print(f"Saved {len(data)} fetched records to {file_name}")

    except Exception as exc:
        error_message = f"Failed to save data to {file_name}: {exc}"
        print(error_message)
        with open("error_log.txt", "a") as log_file:
            log_file.write(f"{datetime.now()} - {error_message}\n")

# Configuration for the article-fetching step
input_file = "portfolio_links.csv"  # pipe-separated CSV listing the article URLs
output_file = "portfolio_articles.csv"  # pipe-separated CSV receiving the fetched articles
# NOTE(review): fetch_content as written does not pass these headers to requests.get.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Resume support: start from the records of a previous run if the output file exists
try:
    existing_df = pd.read_csv(output_file, sep="|")
except FileNotFoundError:
    fetched_urls, fetched_data = set(), []
else:
    fetched_urls = set(existing_df["url"])
    fetched_data = existing_df.to_dict(orient="records")

# Function to load links from input CSV
def load_links(file_name=input_file):
    """Read the pipe-separated CSV *file_name* and return its rows as a list of dicts."""
    return pd.read_csv(file_name, sep="|").to_dict(orient="records")

# Work list: every link from the input CSV whose URL is not already in the output
links_to_scrape = [
    link for link in load_links() if link['url'] not in fetched_urls
]

# One request at a time; the full result set is re-saved after every article
for record in links_to_scrape:
    result = fetch_content(record)
    fetched_data.append(result)
    save_fetched_data(fetched_data, output_file)
    time.sleep(5)  # pause before the next request


Fetching content from: https://www.portfolio.hu/gazdasag/20180226/megint-matolcsyekon-a-sor-ha-meglepetest-akarnak-okozni-277683
Saved 15012 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180226/kuszobon-europa-idei-legfontosabb-valasztasa-egymasra-licitalnak-a-partok-penzszoro-igeretekkel-277645
Saved 15013 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180225/oriasi-meglepetes-a-hodmezovasarhelyi-valasztason-itt-van-az-elso-elemzoi-reakcio-277805
Saved 15014 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180225/osszeall-merkel-kormanya-277803
Saved 15015 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180225/romaniaban-ezrek-tuntetnek-a-kormany-ellen-277799
Saved 15016 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/201802

Fetching content from: https://www.portfolio.hu/gazdasag/20180221/hatalmas-meglepetest-jelentett-be-trump-a-fegyvertartassal-kapcsolatban-277433
Saved 15056 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180221/bizakodo-a-fed-tenyleg-nagyon-jo-az-amerikai-gazdasag-277429
Saved 15057 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180221/elesitettek-a-bombat-amerikaban-ez-meg-a-dollart-is-kiutheti-277375
Saved 15058 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180221/brexit-uj-javaslattal-alltak-elo-a-britek-277377
Saved 15059 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180221/kiabrandito-grafikonok-a-magyar-felsooktatasrol-277281
Saved 15060 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180221/sokkolo-figyelmeztetes

Fetching content from: https://www.portfolio.hu/gazdasag/20180216/egyelore-nem-ajandekozta-meg-magyarorszagot-az-sp-276987
Saved 15101 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180216/tiz-ev-alatt-otvenezer-uj-munkahelyet-iger-a-miniszter-276979
Saved 15102 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180216/elozonlik-a-vilagot-a-kinaiak-a-jovo-heten-276975
Saved 15103 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180216/merkel-nem-buntetne-a-brexitet-276973
Saved 15104 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180216/megjelent-a-pallas-athene-konyvkiado-elso-kotete-276955
Saved 15105 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180216/nagy-az-esely-hogy-husvetig-vegre-felall-az-uj-nemet-kormany-276947
Sav

Fetching content from: https://www.portfolio.hu/gazdasag/20180213/megvan-az-uj-budapesti-amerikai-nagykovet-276521
Saved 15146 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180213/ingyenes-tomegkozlekedest-terveznek-merkelek-276513
Saved 15147 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180213/ovatossagot-iger-az-uj-fed-elnok-276495
Saved 15148 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180213/holnap-visszaterhet-a-fejetlenseg-a-tozsdekre-egy-szamra-figyel-az-egesz-vilag-276477
Saved 15149 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180213/a-munkaero-megmaradas-torvenye-sokkal-tobb-kell-a-beremelesnel-275773
Saved 15150 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180213/holnap-kiderul-az-ngm-nek-vagy-a-piac

Saved 15189 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180209/ket-evre-megmenekult-az-usa-trump-alairta-a-torvenyt-276233
Saved 15190 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180209/vagott-egyet-a-kamaton-az-orosz-jegybank-276225
Saved 15191 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180209/eldurvult-az-eu-brit-vita-megfenyegette-a-fotargylo-a-briteket-276217
Saved 15192 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180209/meglepetes-megsem-a-magyar-kormany-nagy-kritikusa-lesz-a-nemet-kulugyminiszter-276213
Saved 15193 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180209/a-szakadek-szelere-rantotta-europat-az-orszag-amely-most-tortenelmi-siker-kapujaban-all-276033
Saved 15194 fetched records to portfolio_

Fetching content from: https://www.portfolio.hu/gazdasag/20180206/husz-evet-is-kaphatnak-a-most-elkapott-afacsalok-275745
Saved 15235 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180206/panikol-a-vilag-elindult-a-toke-a-feltorekvo-piacokrol-275739
Saved 15236 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180206/merkel-is-megszolalt-a-piaci-zuhanasrol-275737
Saved 15237 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180206/szakad-minden-csak-semmi-panik-de-az-mnb-nek-mar-fajhat-egy-kicsit-a-feje-2-275729
Saved 15238 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180206/na-es-erre-most-mit-lepnek-matolcsyek-275727
Saved 15239 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180206/egy-hir-vargatol-ami-ma-mar-nem-is-olyan-

Saved 15279 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180131/sulyosbodik-a-gigasztrajk-nemetorszagban-oriasi-karokkal-riogatnak-a-cegvezetok-275275
Saved 15280 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180131/januartol-konnyebb-a-jogvitakat-per-nelkul-rendezni-275063
Saved 15281 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180131/nagyon-erosen-kezdte-az-evet-az-amerikai-munkaeropiac-275247
Saved 15282 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180131/nem-fosztogatnak-hanem-osztogatnak-magyarorszag-kizsakmanyolasarol-274917
Saved 15283 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180131/megy-a-harc-a-brexit-elol-menekulo-intezmenyert-275231
Saved 15284 fetched records to portfolio_articles.csv
Fetching c

Fetching content from: https://www.portfolio.hu/gazdasag/20180128/5-milliot-vagy-15-milliot-keresel-nagyon-nem-mindegy-hol-laksz-274825
Saved 15324 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180128/felminositette-a-fitch-feheroroszorszagot-274899
Saved 15325 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180128/szabadon-engedtek-a-szaudi-kiralyi-csalad-korrupcioval-vadolt-tagjait-274897
Saved 15326 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180128/egykor-magyarorszag-buszkesegei-voltak-mara-viszont-sulyos-valsaggal-kuzdenek-274833
Saved 15327 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180128/lemondott-a-zaklatassal-vadolt-republikanus-penzugyi-vezeto-274889
Saved 15328 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazd

Fetching content from: https://www.portfolio.hu/gazdasag/20180124/elorehozzak-az-egeszsegugyi-szakdolgozok-novemberi-beremeleset-274563
Saved 15369 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180124/melyben-a-munkanelkuliseg-sok-allas-all-uresen-274547
Saved 15370 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180124/uj-uzenetet-kuldott-a-magyar-kormany-brusszelnek-274541
Saved 15371 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180124/elkepeszto-formaban-van-europa-12-eves-csucson-egy-fontos-gazdasagi-mutato-274527
Saved 15372 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180124/szuper-eve-lesz-iden-a-vilagnak-274517
Saved 15373 fetched records to portfolio_articles.csv
Fetching content from: https://www.portfolio.hu/gazdasag/20180124/talaltak-meg-egy-kis-kiskereskede