In [1]:
import concurrent
import concurrent.futures
import os
from datetime import datetime
from os.path import join
from pathlib import Path

import pandas as pd
import requests
import tqdm
import yfinance as yf
from bs4 import BeautifulSoup

from market_growth_analysis.web_scraping import scrape_company_data, process_sector, process_prices_company, get_sector_data, get_industry_data, parser

# Extract Financial Data


In [None]:
# Scrape every financial sheet for every sector listed on the macrotrends
# research page and write one CSV per sheet under ../../data/raw_01/.
URL = "https://www.macrotrends.net/stocks/research"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the run before an HTTP error page is parsed into an
# empty sector list (which would overwrite existing CSVs with empty files).
page = requests.get(URL, timeout=30)
page.raise_for_status()

sheets = ("income-statement",
          "cash-flow-statement",
          "balance-sheet",
          "financial-ratios")

soup = BeautifulSoup(page.content, "html.parser")

# BeautifulSoup calls the filter with None for <a> tags that have no href,
# so guard against None before calling .lower().
sector_links = soup.find_all(
    "a",
    href=lambda href: href is not None and "/stocks/sector/" in href.lower(),
)

# Map each sector's link text to its absolute URL.
sector_dict = {
    link.text: "".join(("https://www.macrotrends.net", link["href"])).strip()
    for link in sector_links
}

# Make sure the output folder exists before the first (slow) scrape finishes.
Path('../../data/raw_01').mkdir(parents=True, exist_ok=True)

for sheet in sheets:
    print(f"Retrieving {sheet}")
    financial_sheet = sheet
    sector_frames = []

    # One pool per sheet; the context manager joins the workers on exit.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for sector in tqdm.tqdm(sector_dict, desc="Sectors"):
            sector_frame = process_sector(sector, sector_dict, financial_sheet, executor)
            # NOTE(review): process_sector presumably returns a DataFrame;
            # None is skipped, matching how pd.concat silently drops None.
            if sector_frame is not None:
                sector_frames.append(sector_frame)

    # Concatenate once instead of re-copying the growing frame per sector.
    if sector_frames:
        financial_sheet_df = pd.concat(sector_frames, ignore_index=True)
    else:
        financial_sheet_df = pd.DataFrame()

    financial_sheet_df.to_csv(f'../../data/raw_01/{financial_sheet}.csv')
    


In [None]:
# financial_sheet_df is reassigned for every sheet in the scraping loop, so at
# this point it holds only the last sheet processed; the other sheets are
# available from their CSV files.
financial_sheet_df

# Extract Prices

In [8]:
# Build the ticker universe from the scraped income statement: order the rows
# by ticker, then keep each non-null symbol once.
path = '../../data/raw_01/income-statement.csv'
income_statement = pd.read_csv(path)
tickers_sorted = income_statement.sort_values(by='ticker')['ticker']
companys = tickers_sorted.dropna().unique()

In [9]:
# Preview the sorted, de-duplicated ticker symbols read from the income-statement CSV.
companys

array(['A', 'AA', 'AAC', ..., 'ZYME', 'ZYNE', 'ZYXI'], dtype=object)

In [10]:
import concurrent.futures
from tqdm import tqdm
import logging

# Download price data for every ticker concurrently and stack the per-company
# frames into a single `prices` DataFrame.
results = []

# The context manager guarantees the pool is shut down (worker threads joined)
# even if one of the downloads raises.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit one task per company; the futures list drives the progress bar.
    futures = [
        executor.submit(process_prices_company, company)
        for company in companys
    ]

    # Use tqdm to track completion as tasks finish (not in submission order).
    with tqdm(total=len(futures)) as pbar:
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            # NOTE(review): None presumably marks a ticker without price data
            # (see the "No data found" messages) — confirm in process_prices_company.
            if result is not None:
                results.append(result)
            pbar.update(1)

# Combine the results; pd.concat raises ValueError on an empty list, so fall
# back to an empty frame when no ticker returned data.
prices = pd.concat(results, ignore_index=True) if results else pd.DataFrame()


  0%|          | 8/5554 [00:01<10:31,  8.78it/s]  

- AAQC: No data found, symbol may be delisted


  1%|          | 53/5554 [00:05<10:19,  8.88it/s]

- ACH: No data found, symbol may be delisted


  7%|▋         | 380/5554 [00:35<09:16,  9.30it/s]

- ARCK: No data found, symbol may be delisted


  9%|▉         | 511/5554 [00:47<08:42,  9.65it/s]

- AVAN: No data found, symbol may be delisted


 18%|█▊        | 985/5554 [01:31<05:28, 13.93it/s]

- CEA: No data found, symbol may be delisted


 19%|█▊        | 1029/5554 [01:35<07:50,  9.62it/s]

- CHA: No data found, symbol may be delisted


 19%|█▉        | 1045/5554 [01:36<07:32,  9.97it/s]

- CHL: No data found, symbol may be delisted


 19%|█▉        | 1062/5554 [01:38<06:10, 12.13it/s]

- CIAN: No data found, symbol may be delisted
- CHU: No data found, symbol may be delisted


 23%|██▎       | 1293/5554 [01:58<05:09, 13.77it/s]

- CSLT: No data found, symbol may be delisted


 24%|██▎       | 1313/5554 [02:00<05:56, 11.89it/s]

- CTK: No data found, symbol may be delisted


 24%|██▍       | 1351/5554 [02:04<05:12, 13.43it/s]

- CVIA: No data found, symbol may be delisted


 26%|██▌       | 1425/5554 [02:11<05:19, 12.94it/s]

- DCMYY: No data found, symbol may be delisted


 26%|██▌       | 1456/5554 [02:14<06:06, 11.18it/s]

- DGI: No data found for this date range, symbol may be delisted


 27%|██▋       | 1503/5554 [02:18<05:39, 11.95it/s]

- DNK: No data found, symbol may be delisted


 28%|██▊       | 1568/5554 [02:24<06:11, 10.72it/s]

- DVD: No data found, symbol may be delisted


 30%|███       | 1675/5554 [02:34<05:37, 11.49it/s]

- EMWP: No data found, symbol may be delisted


 31%|███       | 1717/5554 [02:38<06:32,  9.78it/s]

- EQOS: No data found, symbol may be delisted


 33%|███▎      | 1860/5554 [02:51<06:56,  8.87it/s]

- FELP: No data found, symbol may be delisted


 34%|███▍      | 1906/5554 [02:55<07:44,  7.85it/s]

- FLLC: No data found, symbol may be delisted


 35%|███▌      | 1957/5554 [03:00<06:58,  8.60it/s]

- FRANQ: No data found, symbol may be delisted


 36%|███▌      | 1997/5554 [03:03<04:31, 13.09it/s]

- FTDCQ: No data found, symbol may be delisted


 37%|███▋      | 2068/5554 [03:10<06:53,  8.44it/s]

- GEENQ: No data found, symbol may be delisted


 39%|███▉      | 2158/5554 [03:18<04:10, 13.56it/s]

- GNC: No data found, symbol may be delisted


 41%|████▏     | 2293/5554 [03:30<05:53,  9.22it/s]

- HCR: No data found, symbol may be delisted


 45%|████▌     | 2511/5554 [03:50<03:56, 12.89it/s]

- IFIT: No data found, symbol may be delisted


 46%|████▌     | 2556/5554 [03:54<04:21, 11.48it/s]

- INAPQ: No data found, symbol may be delisted


 47%|████▋     | 2592/5554 [03:58<04:08, 11.94it/s]

- INTS: No data found, symbol may be delisted


 49%|████▉     | 2715/5554 [04:09<03:44, 12.65it/s]

- JP: No data found, symbol may be delisted
- JRJC: No data found, symbol may be delisted
- JONE: No data found, symbol may be delisted


 51%|█████▏    | 2856/5554 [04:22<05:21,  8.40it/s]

- LAIX: No data found, symbol may be delisted


 52%|█████▏    | 2876/5554 [04:24<03:39, 12.20it/s]

- LBPS: No data found, symbol may be delisted


 53%|█████▎    | 2918/5554 [04:27<04:06, 10.71it/s]

- LGV: No data found, symbol may be delisted


 56%|█████▌    | 3124/5554 [04:46<02:37, 15.48it/s]

- MDLY: No data found, symbol may be delisted


 58%|█████▊    | 3195/5554 [04:53<04:15,  9.25it/s]

- MKD: No data found, symbol may be delisted


 60%|█████▉    | 3321/5554 [05:04<03:46,  9.87it/s]

- MTFB: No data found, symbol may be delisted


 60%|█████▉    | 3329/5554 [05:05<03:28, 10.65it/s]

- MTL: No data found, symbol may be delisted


 60%|██████    | 3337/5554 [05:06<03:29, 10.57it/s]

- MUDS: No data found, symbol may be delisted


 62%|██████▏   | 3421/5554 [05:13<03:32, 10.02it/s]

- NEW: No data found, symbol may be delisted


 64%|██████▎   | 3535/5554 [05:24<03:20, 10.09it/s]

- NTP: No data found, symbol may be delisted


 64%|██████▍   | 3580/5554 [05:28<03:18,  9.95it/s]

- NWHM: No data found, symbol may be delisted


 65%|██████▍   | 3610/5554 [05:30<02:15, 14.33it/s]

- NYX: No data found, symbol may be delisted


 66%|██████▌   | 3640/5554 [05:33<02:32, 12.58it/s]

- ODT: No data found, symbol may be delisted


 68%|██████▊   | 3766/5554 [05:45<03:05,  9.65it/s]

- PACD: No data found, symbol may be delisted


 69%|██████▉   | 3830/5554 [05:51<02:54,  9.88it/s]

- PDLI: No data found, symbol may be delisted


 73%|███████▎  | 4040/5554 [06:10<02:46,  9.07it/s]

- PTHN: No data found for this date range, symbol may be delisted


 73%|███████▎  | 4072/5554 [06:13<02:35,  9.55it/s]

- PYX: No data found, symbol may be delisted


 80%|███████▉  | 4423/5554 [06:45<02:27,  7.67it/s]

- SFUN: No data found, symbol may be delisted


 80%|████████  | 4456/5554 [06:47<02:03,  8.92it/s]

- SHLDQ: No data found, symbol may be delisted
- SHI: No data found, symbol may be delisted


 82%|████████▏ | 4569/5554 [06:58<01:49,  9.00it/s]

- SNDEQ: No data found, symbol may be delisted


 83%|████████▎ | 4618/5554 [07:02<01:42,  9.17it/s]

- SPEL: No data found, symbol may be delisted


 88%|████████▊ | 4911/5554 [07:28<00:43, 14.68it/s]

- TLGT: No data found, symbol may be delisted


 91%|█████████▏| 5072/5554 [07:43<00:36, 13.12it/s]

- UDFI: No data found, symbol may be delisted


 96%|█████████▌| 5317/5554 [08:05<00:22, 10.68it/s]

- WARR: No data found, symbol may be delisted


 96%|█████████▌| 5344/5554 [08:08<00:16, 12.54it/s]

- WEI: No data found, symbol may be delisted


 97%|█████████▋| 5409/5554 [08:14<00:12, 11.86it/s]

- WPGGQ: No data found, symbol may be delisted


100%|█████████▉| 5538/5554 [08:26<00:01, 11.82it/s]

- ZME: No data found, symbol may be delisted
- ZNH: No data found, symbol may be delisted


100%|██████████| 5554/5554 [08:27<00:00, 10.95it/s]


In [11]:
# Export the dataframe
filepath = Path(f'../../data/raw_01/prices.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
prices.to_csv(filepath)