# Stock Data Scraping

This code is to show how to scrap stock data through yfinance, merge and clean them

## List of companies

In [None]:
import pandas as pd

# Example with 2 sectors
data = [
    # Technology sector
    {"Sector": "Technology", "Ticker": "AAPL", "Company Name": "Apple Inc."},
    {"Sector": "Technology", "Ticker": "MSFT", "Company Name": "Microsoft Corporation"},
    {"Sector": "Technology", "Ticker": "GOOGL", "Company Name": "Alphabet Inc."},
    {"Sector": "Technology", "Ticker": "NVDA", "Company Name": "NVIDIA Corporation"},
    {"Sector": "Technology", "Ticker": "ADBE", "Company Name": "Adobe Inc."},

    # Healthcare sector
    {"Sector": "Healthcare", "Ticker": "PFE", "Company Name": "Pfizer Inc."},
    {"Sector": "Healthcare", "Ticker": "JNJ", "Company Name": "Johnson & Johnson"},
    {"Sector": "Healthcare", "Ticker": "LLY", "Company Name": "Eli Lilly and Company"},
    {"Sector": "Healthcare", "Ticker": "MRK", "Company Name": "Merck & Co., Inc."},
    {"Sector": "Healthcare", "Ticker": "ABT", "Company Name": "Abbott Laboratories"}
    
    # Add more names if you'd like
]

# Convert to DataFrame and save
df = pd.DataFrame(data)
output_path = "/mnt/data/stock_sector_list_example.csv"
df.to_csv(output_path, index=False)

output_path


## List of Tickers

In [None]:

# List of all my tickers
tickers = [
    #Technology
    'AAPL', 'MSFT', 'AMD', 'ADBE', 'AVGO', 'INTC', 'CRM', 'CSCO', 'NVDA', 'ORCL', 
    'QCOM', 'PLTR', 'IBM', 'NOW', 'TXN', 'CDNS', 'ASML', 'GFS', 'APP', 'ANSS', 
    
    #Healthcare
    'ABBV', 'ABT', 'AMGN', 'JNJ', 'PFE', 'LLY', 'MRK', 'GEHC', 'ISRG', 'MDT', 
    'DHR', 'GILD', 'BIIB', 'DXCM', 'IDXX', 'BMY', 'TMO', 'AZN', 'HCA', 'CVS', 
    
    #Financials
    'AIG', 'AXP', 'BAC', 'BK', 'C', 'GS', 'JPM', 'MA', 'MET', 'MS', 
    'PYPL', 'SCHW', 'USB', 'V', 'BRK.B', 'COF', 'BLK', 'TROW', 'PGR', 'FITB', 
    
    #Consumer Discretionary
    'AMZN', 'TSLA', 'HD', 'MCD', 'BKNG', 'GM', 'LOW', 'NKE', 'SBUX', 'TGT', 
    'DIS', 'ABNB', 'YUM', 'ROST', 'TJX', 'ULTA', 'F', 'EBAY', 'ETSY', 'BBY', 
    
    #Consumer Staples
    'CL', 'COST', 'KO', 'MDLZ', 'MO', 'PG', 'PEP', 'WMT', 'KDP', 'CLX', 
    'KR', 'UNFI', 'GIS', 'TSN', 'SYY', 'ADM', 'KMB', 'EL', 'HSY', 'CHD',
    
    #Energy
    'COP', 'CVX', 'XOM', 'FANG', 'BKR', 'EOG', 'OXY', 'PSX', 'HAL', 'SLB', 
    'VLO', 'HES', 'DVN', 'MPC', 'APA', 'CPE', 'SM', 'MTDR', 'PDCE',  'PXD',
    
    #Industrials
    'BA', 'CAT', 'DE', 'EMR', 'FDX', 'GD', 'HON', 'LMT', 'UNP', 'UPS', 
    'CSX', 'ADP', 'MMM', 'RTX', 'GE', 'SWK', 'ITW', 'ETN', 'IR', 'PCAR', 
    
    #Communication Services
    'DIS','GOOG', 'GOOGL', 'META', 'CMCSA', 'T', 'TMUS', 'CHTR', 'NFLX', 'VZ', 
    'SIRI', 'PARA', 'FOXA', 'WBD', 'TTWO', 'ATVI', 'LYV', 'BIDU', 'NTES', 'SPOT', 
    
    #Utilities
    'DUK', 'NEE', 'SO', 'EXC', 'CEG', 'XEL', 'AEP', 'ES', 'D', 'NRG', 
    'PPL', 'PEG', 'ED', 'EVRG', 'EIX', 'WEC', 'AWK', 'ATO', 'SRE', 'CMS', 
    
    #Real Estate
    'AMT', 'CSGP', 'SPG', 'WELL', 'O', 'PLD', 'BXP', 'EQIX', 'PSA', 'EQR', 
    'VNO', 'SLG', 'AVB', 'FRT', 'OHI', 'DLR', 'HST', 'WY', 'IRM', 'ARE', 
    
    #Materials
    'LIN', 'DD', 'ECL', 'FCX', 'NEM', 'APD', 'MOS', 'PPG', 'RPM', 'CE', 
    'EMN', 'ALB', 'VMC', 'CF', 'TREX', 'MLM', 'IFF', 'NUE', 'AVY', 'BALL']


### Fetch Meta info of all companies (sector by sector)

This script allows you to fetch companies' meta info, such as sector, industry, marketcap, dividend yield, etc... you can fetch all the tickers at the same time, or sector by sector and save them individually to your local disk so that you can merge the data with the day trading data (see next script) to do analysis from there. 

In [1]:
import yfinance as yf
import pandas as pd
import os
    
tickers = ['AMZN', 'TSLA', 'HD', 'MCD', 'BKNG', 'GM', 'LOW', 'NKE', 'SBUX', 'TGT', 
    'DIS', 'ABNB', 'YUM', 'ROST', 'TJX', 'ULTA', 'F', 'EBAY', 'ETSY', 'BBY']
metadata_list = []

for ticker in tickers:
    print(f"Fetching metadata for {ticker}...")
    try:
        t = yf.Ticker(ticker)
        info = t.info
        selected_info = {
            "ticker": ticker,
            "sector": info.get("sector"),
            "industry": info.get("industry"),
            "marketCap": info.get("marketCap"),
            "beta": info.get("beta"),
            "dividendYield": info.get("dividendYield"),
            "trailingPE": info.get("trailingPE"),
            "forwardPE": info.get("forwardPE"),
            "earningsQuarterlyGrowth": info.get("earningsQuarterlyGrowth"),
            "fullTimeEmployees": info.get("fullTimeEmployees"),
            "country": info.get("country"),
            "website": info.get("website")
        }
        metadata_list.append(selected_info)
    except Exception as e:
        print(f"Error retrieving data for {ticker}: {e}")

df = pd.DataFrame(metadata_list)

# Save to specified path
output_path = r"/Users/xiejing/Desktop/Codeoptest/Personal_Project/Consumer_Discretionary_metadata.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Fetching metadata for AMZN...
Fetching metadata for TSLA...
Fetching metadata for HD...
Fetching metadata for MCD...
Fetching metadata for BKNG...
Fetching metadata for GM...
Fetching metadata for LOW...
Fetching metadata for NKE...
Fetching metadata for SBUX...
Fetching metadata for TGT...
Fetching metadata for DIS...
Fetching metadata for ABNB...
Fetching metadata for YUM...
Fetching metadata for ROST...
Fetching metadata for TJX...
Fetching metadata for ULTA...
Fetching metadata for F...
Fetching metadata for EBAY...
Fetching metadata for ETSY...
Fetching metadata for BBY...
Saved to /Users/xiejing/Desktop/Codeoptest/Personal_Project/Consumer_Discretionary_metadata.csv


### Day Trading Info

In [27]:
import yfinance as yf
import os

tickers = ['AIG', 'AXP', 'BAC', 'BK', 'C', 'GS', 'JPM', 'MA', 'MET', 'MS',
           'PYPL', 'SCHW', 'USB', 'V', 'BRK.B', 'COF', 'BLK', 'TROW', 'PGR', 'FITB']
save_path = r"/Users/xiejing/Desktop/Codeoptest/Personal_Project"

os.makedirs(save_path, exist_ok=True)

for ticker in tickers:
    print(f"Downloading {ticker}...")
    try:
        df = yf.download(ticker, start="2020-01-01", end="2025-05-20")
        if not df.empty:
            df.to_csv(os.path.join(save_path, f"{ticker}.csv"))
            print(f"Saved: {ticker}.csv")
        else:
            print(f"No data for {ticker}")
    except Exception as e:
        print(f"Error downloading {ticker}: {e}")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloading AIG...
Saved: AIG.csv
Downloading AXP...
Saved: AXP.csv
Downloading BAC...
Saved: BAC.csv
Downloading BK...
Saved: BK.csv
Downloading C...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Saved: C.csv
Downloading GS...
Saved: GS.csv
Downloading JPM...
Saved: JPM.csv
Downloading MA...
Saved: MA.csv
Downloading MET...
Saved: MET.csv
Downloading MS...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Saved: MS.csv
Downloading PYPL...
Saved: PYPL.csv
Downloading SCHW...
Saved: SCHW.csv
Downloading USB...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Saved: USB.csv
Downloading V...
Saved: V.csv
Downloading BRK.B...
No data for BRK.B
Downloading COF...
Saved: COF.csv
Downloading BLK...



[*********************100%***********************]  1 of 1 completed


Saved: BLK.csv
Downloading TROW...
Saved: TROW.csv
Downloading PGR...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Saved: PGR.csv
Downloading FITB...
Saved: FITB.csv





### Merge

In [28]:
import pandas as pd
import os

tickers = ['AIG', 'AXP', 'BAC', 'BK', 'C', 'GS', 'JPM', 'MA', 'MET', 'MS',
           'PYPL', 'SCHW', 'USB', 'V', 'BRK.B', 'COF', 'BLK', 'TROW', 'PGR', 'FITB']
folder_path = r"/Users/xiejing/Desktop/Codeoptest/Personal_Project"
combined = []

for ticker in tickers:
    file_path = os.path.join(folder_path, f"{ticker}.csv")
    try:
        df = pd.read_csv(file_path)
        df['Ticker'] = ticker
        combined.append(df)
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")

if combined:
    merged_df = pd.concat(combined, ignore_index=True)
    output_path = os.path.join(folder_path, "20_Financials_companies_nasdq_trading_data_2020.01-2025.05.csv")
    merged_df.to_csv(output_path, index=False)
    print(f"Saved merged CSV to {output_path}")
else:
    print("No data files found or loaded.")


Failed to read /Users/xiejing/Desktop/Codeoptest/Personal_Project/BRK.B.csv: [Errno 2] No such file or directory: '/Users/xiejing/Desktop/Codeoptest/Personal_Project/BRK.B.csv'
Saved merged CSV to /Users/xiejing/Desktop/Codeoptest/Personal_Project/20_Financials_companies_nasdq_trading_data_2020.01-2025.05.csv
