In [2]:
from newspaper import Article
from datasets import Dataset
import pandas as pd
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv before the token is looked up.
load_dotenv()

# Hugging Face access token; this is None when HF_KEY is not set.
HF_KEY = os.environ.get("HF_KEY")

# Authenticate this session against the Hugging Face Hub.
login(token=HF_KEY)

def scrape_articles_to_dataset(article_urls, language='ar'):
    """Download and parse each URL and collect the results in a Dataset.

    Args:
        article_urls: Iterable of article URLs to fetch.
        language: Language code passed to newspaper's ``Article``. Defaults
            to ``'ar'``, the value that used to be hard-coded here, so
            existing callers behave exactly as before.

    Returns:
        A Hugging Face ``Dataset`` with ``title`` and ``text`` columns,
        holding one row per article that was downloaded and parsed without
        raising. URLs that fail are reported on stdout and skipped.
    """
    titles = []
    texts = []

    for url in article_urls:
        try:
            article = Article(url, language=language)
            article.download()
            article.parse()
            # Read both fields before touching either column so a failure
            # can never leave the two lists with different lengths.
            title, text = article.title, article.text
        except Exception as e:
            # Best effort: one bad URL must not abort the whole batch.
            print(f"Failed to scrape {url}: {e}")
            continue

        titles.append(title)
        texts.append(text)

    # Create a Hugging Face dataset
    return Dataset.from_dict({"title": titles, "text": texts})

def append_and_push_dataset(article_urls, hub_repo):
    """Scrape new articles, append them to a hub dataset, and push it back.

    Args:
        article_urls: Iterable of article URLs to scrape.
        hub_repo: Hugging Face Hub dataset repository id. Its ``train``
            split is read and the combined dataset is pushed back to it.

    Returns:
        None. When no article could be scraped, nothing is pushed.
    """
    # Imported outside the try block so a missing import fails loudly
    # instead of being mistaken for "the dataset does not exist yet".
    from datasets import load_dataset

    # Step 1: Load the rows that are already on the hub
    try:
        existing_dataset = load_dataset(hub_repo)
        existing_df = pd.DataFrame(existing_dataset['train'])
    except Exception as e:
        # NOTE(review): every load failure (including a transient network
        # error) falls back to an empty frame, so the push below would then
        # publish only the newly scraped rows. Confirm this is intended.
        print(f"Failed to load dataset from {hub_repo}: {e}")
        existing_df = pd.DataFrame(columns=["title", "text"])

    # Step 2: Scrape new articles
    new_dataset = scrape_articles_to_dataset(article_urls)

    # Nothing new to add: skip the upload rather than re-pushing (or, after
    # a failed load, pushing an empty dataset).
    if new_dataset.num_rows == 0:
        print(f"No new articles scraped; {hub_repo} left unchanged.")
        return

    # Convert the new dataset to a DataFrame for easier appending
    new_df = pd.DataFrame(new_dataset)

    # Step 3: Append the new rows to the existing dataset
    updated_df = pd.concat([existing_df, new_df], ignore_index=True)

    # Step 4: Convert the updated DataFrame back to a Hugging Face Dataset.
    # preserve_index=False keeps the pandas index out of the dataset columns.
    updated_dataset = Dataset.from_pandas(updated_df, preserve_index=False)

    # Step 5: Push the updated dataset back to Hugging Face Hub
    updated_dataset.push_to_hub(hub_repo)

# URLs of the articles to scrape in this run.
article_urls = [
    # Articles hosted on dawnnews.tv (including one on the beta subdomain)
    "https://www.dawnnews.tv/news/1211983/",
    "https://www.dawnnews.tv/news/1210750/",
    "https://www.dawnnews.tv/news/1229650/",
    "https://www.dawnnews.tv/news/1231088/",
    "https://www.dawnnews.tv/news/1232240/",
    "https://www.dawnnews.tv/news/1233196/",
    "https://www.dawnnews.tv/news/1235444/",
    "https://www.dawnnews.tv/news/1237275/",
    "https://www.dawnnews.tv/news/1238520/",
    "https://www.dawnnews.tv/news/1240097/",
    "https://www.dawnnews.tv/news/1241122/",
    "https://www.dawnnews.tv/news/1242449/",
    "https://www.dawnnews.tv/news/1213887/",
    "https://www.dawnnews.tv/news/1216002/",
    "https://www.dawnnews.tv/news/1219234/",
    "https://www.dawnnews.tv/news/1220682/",
    "https://www.dawnnews.tv/news/1223739/",
    "https://www.dawnnews.tv/news/1226105/",
    "https://www.dawnnews.tv/news/1242916/",
    "https://www.dawnnews.tv/news/1239589/",
    "https://beta.dawnnews.tv/news/1243259/",
    # Articles hosted on jang.com.pk
    "https://jang.com.pk/news/838178",
    "https://jang.com.pk/news/838179",
    "https://jang.com.pk/news/835958",
    "https://jang.com.pk/news/748613",
    "https://jang.com.pk/news/747791",
    "https://jang.com.pk/news/747789",
    "https://jang.com.pk/news/746618",
    "https://jang.com.pk/news/746617",
    "https://jang.com.pk/news/745566",
    "https://jang.com.pk/news/745565",
    "https://jang.com.pk/news/744725",
    "https://jang.com.pk/news/744724",
    "https://jang.com.pk/news/744308",
    "https://jang.com.pk/news/742741",
    "https://jang.com.pk/news/742740",
    "https://jang.com.pk/news/741004",
    "https://jang.com.pk/news/741002",
    "https://jang.com.pk/news/739919",
    "https://jang.com.pk/news/739918",
    "https://jang.com.pk/news/738992",
    "https://jang.com.pk/news/738555",
    "https://jang.com.pk/news/736486",
    "https://jang.com.pk/news/736485",
    "https://jang.com.pk/news/733866",
    "https://jang.com.pk/news/734174",
    "https://jang.com.pk/news/734172",
    "https://jang.com.pk/news/733800",
    "https://jang.com.pk/news/733799",
    "https://jang.com.pk/news/733798",
    "https://jang.com.pk/news/733797",
    "https://jang.com.pk/news/732565",
    "https://jang.com.pk/news/732558",
    "https://jang.com.pk/news/731085",
    "https://jang.com.pk/news/731083",
    "https://jang.com.pk/news/730271",
    "https://jang.com.pk/news/730270",
    "https://jang.com.pk/news/729896",
    "https://jang.com.pk/news/729895",
    "https://jang.com.pk/news/728457",
 
]

# Hub dataset repository that is both read from and pushed back to.
hub_repo = "KANZOO/scrapped_articles"  # Replace with your Hugging Face repository

# Scrape the URLs above, append them to the dataset in hub_repo, and push.
append_and_push_dataset(article_urls, hub_repo)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Kanza Nasim\.cache\huggingface\token
Login successful


Downloading readme: 100%|██████████| 305/305 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 353k/353k [00:01<00:00, 298kB/s]
Generating train split: 100%|██████████| 85/85 [00:00<00:00, 3083.22 examples/s]


Failed to scrape https://jang.com.pk/news/733797: Article `download()` failed with HTTPSConnectionPool(host='jang.com.pk', port=443): Read timed out. on URL https://jang.com.pk/news/733797


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 68.98ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.35s/it]
