# Data Extraction from Yahoo Finance (`yfinance`)

In [1]:
import datetime
import yfinance as yf
import pandas as pd
import requests
from io import StringIO

In [2]:
# !pip install --upgrade yfinance

In [3]:
def SAndP500_Wikipedia_Scrape():
    """
    Fetch S&P500 tickers and corresponding data from Wikipedia.

    Downloads the "List of S&P 500 companies" page, parses every HTML table
    on it with pandas and uses the first table as the constituents list.

    Returns:
        pandas.DataFrame with the columns 'Ticker', 'Company', 'Sector' and
        'Industry', one row per row of the Wikipedia table.

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or Wikipedia answers with an error status.
        KeyError: if the first table lacks one of the expected columns.
    """
    print("Fetching from Wikipedia...")
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being parsed as if it
    # were the constituents page.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Parse the HTML into one DataFrame per table found on the page
    tables = pd.read_html(StringIO(response.text))
    sp500_table = tables[0]

    # Create DataFrame with relevant info
    df = pd.DataFrame({
        'Ticker': sp500_table['Symbol'].tolist(),
        'Company': sp500_table['Security'].tolist(),
        'Sector': sp500_table['GICS Sector'].tolist(),
        'Industry': sp500_table['GICS Sub-Industry'].tolist()
    })

    return df

In [4]:
def NASDAQ100_Wikipedia_Scrape():
    """
    Fetch NASDAQ100 tickers and corresponding data from Wikipedia.

    Downloads the "Nasdaq-100" page, parses every HTML table on it with
    pandas and picks the constituents table: a table that has a 'Ticker'
    column and more than 90 rows.

    Returns:
        pandas.DataFrame with the columns 'Ticker', 'Company', 'Sector' and
        'Industry', one row per row of the constituents table.

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or Wikipedia answers with an error status.
        ValueError: if no table on the page looks like the constituents list.
        KeyError: if the selected table lacks one of the expected columns.
    """
    print("Fetching from Wikipedia...")
    url = 'https://en.wikipedia.org/wiki/Nasdaq-100'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being parsed as if it
    # were the constituents page.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Parse the HTML into one DataFrame per table found on the page
    tables = pd.read_html(StringIO(response.text))

    # Find NASDAQ table
    candidates = [
        table for table in tables
        if 'Ticker' in table.columns and len(table) > 90
    ]
    if not candidates:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "No table with a 'Ticker' column and more than 90 rows was found "
            f"at {url}; the page layout may have changed"
        )
    # If several tables match, the last one on the page is used
    ndaq100_table = candidates[-1]

    # Create DataFrame with relevant info
    df = pd.DataFrame({
        'Ticker': ndaq100_table['Ticker'].tolist(),
        'Company': ndaq100_table['Company'].tolist(),
        'Sector': ndaq100_table['ICB Sector'].tolist(),
        'Industry': ndaq100_table['ICB Industry'].tolist()
    })

    return df

In [5]:
# Fetch the NASDAQ-100 constituents and print a short summary.
# `ticker_list` and `ndaq100df` are used by later cells.
try:
    ndaq100df = NASDAQ100_Wikipedia_Scrape()
    ticker_list = ndaq100df['Ticker'].tolist()
    print(f" Successfully fetched {len(ndaq100df)} NASDAQ-100 tickers!")

    # Display summary
    print("\n" + "="*60)
    print(f"Total tickers: {len(ndaq100df)}")
    print("\nFirst 10 tickers:")
    print(ndaq100df.head(10).to_string(index=False))

    print("\n" + "="*60)
    print("Sector Distribution:")
    print("="*60)
    print(ndaq100df['Sector'].value_counts())

except Exception as e:
    print(f"Python scrape failed: {e}")
    # Re-raise so execution stops here: the following cells need
    # `ticker_list` and `ndaq100df` and would otherwise fail with a
    # NameError that hides the real cause.
    raise

Fetching from Wikipedia...
 Successfully fetched 102 S&P 500 tickers!

Total tickers: 102

First 10 tickers:
Ticker                 Company                 Sector                        Industry
  ADBE              Adobe Inc.             Technology               Computer Software
   AMD  Advanced Micro Devices             Technology                  Semiconductors
  ABNB                  Airbnb Consumer Discretionary Diversified Commercial Services
 GOOGL Alphabet Inc. (Class A) Communication Services               Computer Software
  GOOG Alphabet Inc. (Class C) Communication Services               Computer Software
  AMZN                  Amazon Consumer Discretionary  Catalog/Specialty Distribution
   AEP American Electric Power              Utilities              Electric Utilities
  AMGN                   Amgen            Health Care                   Biotechnology
   ADI          Analog Devices             Technology                  Semiconductors
  AAPL              Apple Inc. 

In [6]:
# ticker_list = ["AAPL", "META", "NDAQ", "SPY",]
# company_list = []

# for ticker_symbol in ticker_list:
#     try: 
#         stock = yf.Ticker(ticker_symbol)
#         company_name = stock.info.get('longName', 'N/A')
#         company_list.append(company_name)
#     except Exception as e:
#         print(f"Error fetching {ticker_symbol}: {e}")
#         company_list.append("Error")

# tick_comp_df = pd.DataFrame({
#     'Ticker': ticker_list,
#     'Company': company_list
# })

In [7]:
# Download daily price history for every ticker in `ticker_list`
start_date = datetime.datetime(2022, 10, 29)
end_date = datetime.datetime(2025, 10, 29)

# auto_adjust is passed explicitly so the price columns do not depend on the
# library default, which recent yfinance releases changed (and warn about
# when the argument is omitted).
data = yf.download(
    ticker_list,
    start=start_date,
    end=end_date,
    auto_adjust=True,
    progress=False,
)

# Move the 'Ticker' column level into the rows: one row per (Date, Ticker)
data = data.stack(level='Ticker', future_stack=True).reset_index()
data.columns.name = None
print(data.head(10))

  data = yf.download(ticker_list, start=start_date, end=end_date, progress=False)


        Date Ticker       Close        High         Low        Open  \
0 2022-10-31   AAPL  150.957031  151.843054  149.559100  150.779836   
1 2022-10-31   ABNB  106.910004  113.800003  106.669998  113.059998   
2 2022-10-31   ADBE  318.500000  325.579987  317.420013  323.489990   
3 2022-10-31    ADI  135.263489  136.458506  133.347693  136.392109   
4 2022-10-31    ADP  226.601578  227.248477  224.501514  225.560916   
5 2022-10-31   ADSK  214.300003  216.289993  214.000000  214.759995   
6 2022-10-31    AEP   78.322495   79.632028   77.823627   79.596393   
7 2022-10-31   AMAT   85.900383   86.980341   85.287437   86.454955   
8 2022-10-31    AMD   60.060001   61.860001   59.529999   60.750000   
9 2022-10-31   AMGN  245.694199  247.021039  243.558512  244.649084   

       Volume  
0  97943200.0  
1  10733800.0  
2   3253200.0  
3   3078300.0  
4   1711400.0  
5    965000.0  
6   4104000.0  
7   6875400.0  
8  73274100.0  
9   3033600.0  


In [8]:
# Function to calculate RSI
def calculate_rsi(series, window=14):
    """
    Compute a Relative Strength Index from simple rolling means.

    The average gain and average loss are plain rolling means of the
    positive and negative parts of the one-step price change.

    Args:
        series: pandas Series of prices in chronological order.
        window: number of consecutive price changes averaged (default 14).

    Returns:
        pandas Series aligned with `series`. The first `window` entries are
        NaN because `window` price changes are required. A window without
        losses yields 100; a window with neither gains nor losses yields NaN
        (0 / 0).
    """
    delta = series.diff()
    # clip() leaves NaN untouched, so the undefined first change (and any gap
    # in the prices) is not counted as a zero move; the rolling mean then
    # stays NaN until `window` real changes are available.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [9]:
# Calculate all indicators using groupby.
# `data` is only ever extended with new columns in this cell (never rebound),
# so `grouped` keeps referring to the live frame for the whole cell.
grouped = data.groupby('Ticker')
close_by_ticker = grouped['Close']
volume_by_ticker = grouped['Volume']

# RSI and change
data['RSI'] = close_by_ticker.transform(calculate_rsi)
data['RSI_Chg'] = grouped['RSI'].diff()

# MACD: the two EMAs are kept as local Series instead of temporary columns,
# so nothing has to be dropped from `data` afterwards
ema_12 = close_by_ticker.transform(lambda x: x.ewm(span=12, adjust=False).mean())
ema_26 = close_by_ticker.transform(lambda x: x.ewm(span=26, adjust=False).mean())
data['MACD'] = ema_12 - ema_26
data['MACD_Signal'] = grouped['MACD'].transform(lambda x: x.ewm(span=9, adjust=False).mean())
data['MACD_Histogram'] = data['MACD'] - data['MACD_Signal']

# Rate of Change (10-day)
data['ROC'] = close_by_ticker.transform(lambda x: x.pct_change(periods=10) * 100)

# Simple Moving Averages by X days
for days in [10, 20, 50, 100, 150, 200, 250]:
    data[f'SMA_{days}'] = close_by_ticker.transform(lambda x: x.rolling(window=days).mean())

# Close X days ago
for days in [1, 2, 3, 4, 5]:
    data[f'Close_{days}days_ago'] = close_by_ticker.shift(days)

# Close change since yesterday
data['Close_Chg'] = close_by_ticker.diff()
data['Close_ChgPct'] = close_by_ticker.transform(lambda x: x.pct_change() * 100)

# Volume X days ago
for days in [1]:
    data[f'Volume_{days}d_ago'] = volume_by_ticker.shift(days)

# Volume change since yesterday
data['Volume_Chg'] = volume_by_ticker.diff()
data['Volume_ChgPct'] = volume_by_ticker.transform(lambda x: x.pct_change() * 100)

In [10]:
# Preview ten AAPL rows with a handful of the indicator columns,
# then list every column now present in `data`
preview_columns = [
    'Date', 'Ticker', 'Close', 'RSI', 'MACD', 'SMA_20', 'Close_ChgPct', 'Volume_ChgPct',
]
aapl_rows = data[data['Ticker'] == 'AAPL']
print("Sample data with indicators:")
print(aapl_rows.iloc[30:40][preview_columns])

print("\n\nAll columns:")
print(data.columns.tolist())

Sample data with indicators:
           Date Ticker       Close        RSI      MACD      SMA_20  \
3060 2022-12-13   AAPL  143.446960  42.356454 -1.263576  144.610538   
3162 2022-12-14   AAPL  141.218384  37.787421 -1.297651  144.273788   
3264 2022-12-15   AAPL  134.601700  33.843549 -1.837387  143.667836   
3366 2022-12-16   AAPL  132.639359  35.733127 -2.395857  142.868608   
3468 2022-12-19   AAPL  130.529129  36.714953 -2.974440  141.935764   
3570 2022-12-20   AAPL  130.460129  20.129046 -3.399353  141.161190   
3672 2022-12-21   AAPL  133.566284  27.979373 -3.445738  140.434933   
3774 2022-12-22   AAPL  130.391098  25.595196 -3.696104  139.506034   
3876 2022-12-23   AAPL  130.026199  26.261547 -3.879247  138.704831   
3978 2022-12-27   AAPL  128.221664  27.960229 -4.122478  138.005196   

      Close_ChgPct  Volume_ChgPct  
3060      0.678257      33.242411  
3162     -1.553589     -12.350058  
3264     -4.685427      20.221725  
3366     -1.457887      61.886004  
3468     

In [11]:
# For each ticker, keep the row carrying its latest date
last_row_labels = data.groupby('Ticker')['Date'].idxmax()
lastday = data.loc[last_row_labels]

In [12]:
# Write the ticker list, the full indicator table and the latest-day
# snapshot to one workbook, one sheet each
excel_file = 'TADASI_yhfinance.xlsx'
sheets = {
    'Tickers': ndaq100df,
    'OHLC': data,
    'Last_Day': lastday,
}
with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
print(f"\n Saved to {excel_file}")


 Saved to TADASI_yhfinance.xlsx


In [13]:
# https://github.com/ranaroussi/yfinance/issues/2469
# import curl_cffi
# session = curl_cffi.Session(impersonate="chrome", timeout=5)
# ticker = yf.Ticker('GBPEUR=X', session=session)
# data = ticker.history(start='2025-05-05', end='2025-05-07')