1. import every client and parse examples
2. collect examples in dataframes
3. combine dataframes
4. save to csv

In [22]:
import pandas as pd

from services.news_fetcher import fetcher

# Yahoo Finance News Parsing

### Example: Fetching News for a Single Ticker

In [30]:
# Request GOOGL news with article content (with_content=True).
# NOTE(review): the second argument is presumably the maximum article count — confirm against the fetcher.
res = fetcher.get_ticker_news("GOOGL", 5, with_content=True)

if res:
    example_article = res[0]
    print("Article title:", example_article["title"])
    # Preview only the first 100 characters of the article body.
    print("Article content:", example_article['content'][:100], "...")
else:
    # An empty result would otherwise fail with an IndexError on res[0].
    print("No articles returned for GOOGL.")

Article title: Explainer-Will the EU delay enforcing its AI Act?
Article content: By Supantha MukherjeeSTOCKHOLM (Reuters) -With less than a month to go before parts of the European  ...


# Fetching News for a Random Sample of S&P 500 Tickers
We shuffle the S&P 500 constituents with a fixed seed and keep the first 200 tickers to avoid rate-limiting issues.

In [31]:
# Read the tables on the Wikipedia S&P 500 page; the first one is used as the constituents list.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url)
sp500_table = tables[0]

# Shuffle the (Symbol, Security) rows with a fixed seed so the selection is reproducible.
# NOTE(review): some symbols in this table may contain a dot (e.g. share classes) — confirm
# that the fetcher accepts them in that form.
shuffled = sp500_table[['Symbol', 'Security']].sample(frac=1, random_state=42)
sp500_tickers = shuffled.reset_index(drop=True)
print(sp500_tickers.head(10))

# Keep the first 200 symbols of the shuffled order.
tickers = sp500_tickers['Symbol'].iloc[:200].tolist()


  Symbol              Security
0      K             Kellanova
1    BRO         Brown & Brown
2    LIN             Linde plc
3    DTE            DTE Energy
4   CINF  Cincinnati Financial
5    LHX              L3Harris
6    RTX       RTX Corporation
7    GLW          Corning Inc.
8   BKNG      Booking Holdings
9   IDXX    Idexx Laboratories


### Create a DataFrame to store the news articles

Note: Not all requests return articles with content; in some cases, errors are returned instead.

In [38]:
# Column-wise accumulator for fetched articles: one (initially empty) list per column.
# The key order is preserved by the dict and therefore by anything built from it.
data = {column: [] for column in ("ticker", "title", "link", "content", "summary")}

In [None]:
from tqdm import tqdm


from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_news_for_ticker(ticker):
    """Fetch news for one ticker and return it as column-oriented lists.

    Calls ``fetcher.get_ticker_news(ticker, 10, with_content=True)`` and
    reshapes the returned articles into parallel lists.

    Args:
        ticker: Symbol passed through to the fetcher.

    Returns:
        A dict with the keys ``ticker``, ``title``, ``link``, ``summary`` and
        ``content``, each mapping to a list with one entry per article, or
        ``None`` when the fetcher returns nothing or any exception is raised
        (the problem is printed rather than propagated).
    """
    try:
        articles = fetcher.get_ticker_news(ticker, 10, with_content=True)
        if not articles:
            print(f"Ticker: {ticker}, No articles found.")
            return None

        # Repeat the ticker once per article so every column has the same length.
        columns = {"ticker": [ticker] * len(articles)}
        for field in ("title", "link", "summary", "content"):
            columns[field] = [item[field] for item in articles]
        return columns
    except Exception as e:
        # Best effort: one failing ticker must not abort the whole batch.
        print(f"Error fetching news for {ticker}: {e}")
        return None


# Fetch all tickers concurrently on an 8-worker thread pool and merge each
# result into `data` as soon as its future completes.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(fetch_news_for_ticker, ticker): ticker for ticker in tickers}
    for future in tqdm(as_completed(futures), total=len(tickers), desc="Tickers"):
        result = future.result()
        if not result:
            # Nothing usable came back for this ticker.
            continue
        # Append every returned column to the matching accumulator list.
        for column in ("ticker", "title", "link", "summary", "content"):
            data[column].extend(result[column])


In [17]:
import pandas as pd

# Build the article table, dropping repeated (title, link) pairs and rows without content.
yf_df = (
    pd.DataFrame(data)
    .drop_duplicates(subset=["title", "link"])
    .loc[lambda frame: frame["content"].notna()]
)

# Clean article text of non-English (non-ASCII) characters and line breaks.
def clean_text(text):
    """Return ``text`` with non-ASCII characters and line breaks replaced by spaces.

    Every non-ASCII character, newline and carriage return becomes a single
    space (runs of spaces are not collapsed); leading and trailing whitespace
    is then stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Non-string input (e.g. ``None`` or a NaN from a
        missing cell) is returned unchanged instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        # Missing values stay missing so they can still be filtered out later.
        return text
    ascii_only = ''.join(char if char.isascii() else ' ' for char in text)
    return ascii_only.replace("\n", " ").replace("\r", " ").strip()

# Clean the article bodies. `assign` builds a new frame instead of writing a column
# into a frame produced by boolean filtering, which avoids pandas'
# SettingWithCopyWarning; the index is then renumbered from 0 without `inplace`.
yf_df = yf_df.assign(content=yf_df["content"].apply(clean_text)).reset_index(drop=True)

yf_df.head()

Unnamed: 0,ticker,title,link,content,summary
0,LIN,Citi upgrades Linde on project wins and produc...,https://finance.yahoo.com/news/citi-upgrades-l...,Investing.com -- Citigroup raised its rating o...,Investing.com -- Citigroup raised its rating o...
1,LIN,"Circle initiated, Disney upgraded: Wall Street...",https://finance.yahoo.com/news/circle-initiate...,The most talked about and market moving resear...,"Circle initiated, Disney upgraded: Wall Street..."
2,LIN,Linde Signs Long-Term Agreement to Supply Indu...,https://finance.yahoo.com/news/linde-signs-lon...,"WOKING, England, June 23, 2025--(BUSINESS WIRE...","WOKING, England, June 23, 2025--Linde (Nasdaq:..."
3,LIN,Linde Publishes 2024 Sustainable Development R...,https://finance.yahoo.com/news/linde-publishes...,"WOKING, England, June 20, 2025--(BUSINESS WIRE...","WOKING, England, June 20, 2025--Linde (Nasdaq:..."
4,LIN,Linde plc's (NASDAQ:LIN) Recent Stock Performa...,https://finance.yahoo.com/news/linde-plcs-nasd...,Most readers would already know that Linde's (...,Most readers would already know that Linde's (...


In [18]:
# Persist this batch of articles to CSV, omitting the index column.
yf_df.to_csv("yahoo_finance_news_5.csv", index=False)

In [19]:
# Reload the batch from the same CSV file name that the previous cell writes.
yf_df = pd.read_csv("yahoo_finance_news_5.csv")

In [20]:
# Number of rows in the reloaded batch.
len(yf_df)

1395

# Indian Financial News Parsing

In [35]:
# Load the Indian financial news training CSV through its hf:// dataset path.
df = pd.read_csv("hf://datasets/kdave/Indian_Financial_News/training_data_26000.csv")
# Row count of the raw dataset.
len(df)

26961

In [36]:
# Keep only the rows whose content mentions "stock" (case-insensitive);
# rows with missing content do not match (na=False) and are dropped.
df = df[df["Content"].str.contains("stock", case=False, na=False)]
len(df)

8825

In [37]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,URL,Content,Summary,Sentiment
5,https://www.moneycontrol.com/news/business/mar...,Wall Street's main indexes fell on Wednesday a...,main indexes fall after jerome p. o'connell wa...,Negative
8,https://www.moneycontrol.com/news/business/mar...,Foreign institutional investors (FIIs) have tu...,foreign institutional investors (FIIs) have tu...,Negative
9,https://economictimes.indiatimes.com/news/econ...,"Three meetings, two hotels, one city. These we...",competition commission of india investigated a...,Negative
13,https://www.moneycontrol.com/news/business/mar...,"Rahul Jain\n\nThe Indian economy, despite all ...",the economic slowdown is taking the center-sta...,Negative
16,http://www.financialexpress.com/market/shares-...,Indian stock markets are likely to open lower ...,"early indicator of NSE Nifty, SGX Nifty Future...",Negative


In [None]:
# Keep only the article text, summary and URL, renamed to match the Yahoo Finance
# column names; then keep one row per link and only rows that have content.
column_map = {"Content": "content", "Summary": "summary", "URL": "link"}
df = (
    df[list(column_map)]
    .rename(columns=column_map)
    .drop_duplicates(subset=["link"])
    .loc[lambda frame: frame["content"].notna()]
)

## Combine Yahoo Finance and Indian Financial News Dataframes

In [33]:
import glob

# Load every saved Yahoo Finance batch. glob returns paths in arbitrary order, so the
# list is sorted: drop_duplicates keeps the first occurrence, and without a fixed file
# order the surviving row for a repeated title could differ between runs.
csv_files = sorted(glob.glob("yahoo_finance_news*.csv"))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No files matching 'yahoo_finance_news*.csv' were found.")
dfs = [pd.read_csv(f) for f in csv_files]

# One row per title, only rows with content, reduced to the three shared columns,
# with the index renumbered from 0.
yf_df = (
    pd.concat(dfs, ignore_index=True)
    .drop_duplicates(subset=["title"])
    .loc[lambda frame: frame["content"].notna(), ["content", "summary", "link"]]
    .reset_index(drop=True)
)

yf_df.head()

Unnamed: 0,content,summary,link
0,"Amazon.com, Inc. (NASDAQ:AMZN) is one of theJi...","Amazon.com, Inc. (NASDAQ:AMZN) is one of the J...",https://finance.yahoo.com/news/amazon-com-inc-...
1,Investing.com -- Citi placed Trade Desk (NASDA...,Investing.com -- Citi placed Trade Desk (NASDA...,https://finance.yahoo.com/news/citi-sees-upsid...
2,"WASHINGTON,July 1, 2025/PRNewswire/ -- TheNati...",The National Safety Council estimates 437 peop...,https://finance.yahoo.com/news/nsc-warns-437-p...
3,Momentum investing revolves around the idea of...,Does Walt Disney (DIS) have what it takes to b...,https://finance.yahoo.com/news/looking-top-mom...
4,Taking full advantage of the stock market and ...,The Zacks Style Scores offers investors a way ...,https://finance.yahoo.com/news/heres-why-walt-...


In [34]:
# Report the number of rows currently in yf_df.
print("Parsed Yahoo Finance News Articles:", len(yf_df))

Parsed Yahoo Finance News Articles: 3027


In [31]:
# Stack the two sources into one frame with a fresh 0..n-1 index; the Yahoo Finance
# side is restricted to the content, summary and link columns.
df_combined = pd.concat([yf_df[["content", "summary","link"]], df], ignore_index=True)

In [32]:
# Total number of rows in the combined frame.
len(df_combined)

10605

In [33]:
# Write the combined frame to dataset.csv, omitting the index column.
df_combined.to_csv("dataset.csv", index=False)