# Scraping the most active stocks from Yahoo Finance

* Visit the target URL: https://finance.yahoo.com/
* Hover over the Markets menu
* Click on Trending Tickers
* Click on Most Active
* Make sure to visit every page of the stocks table
* Scrape the necessary data

In [24]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# ----- SCRAPING THE DATA -----

# Start a Chrome session and build the explicit wait (5-second timeout)
# that every later lookup in this cell reuses.
driver = webdriver.Chrome()
wait = WebDriverWait(driver, timeout=5)

# Maximise the window before any page is opened.
driver.maximize_window()

# function to check if webpage is fully loaded
def wait_for_page_to_load(driver, wait):
	"""Wait until the current document reports ``readyState == "complete"``.

	Prints a message naming the page and saying whether it finished loading.

	Args:
		driver: object exposing ``title`` and ``execute_script`` (a Selenium
			WebDriver in this notebook).
		wait: object exposing ``until(condition)`` (a ``WebDriverWait`` in
			this notebook); its timeout bounds how long this call blocks.

	Returns:
		bool: True when the ready-state condition was met, False when the
		wait raised (for example because it timed out).
	"""
	try:
		wait.until(
			lambda d: d.execute_script("return document.readyState") == "complete"
		)
	except Exception:
		# Best-effort probe: report the failure and let the caller carry on.
		# ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
		loaded = False
	else:
		loaded = True

	# Read the title only AFTER waiting: right after a click the driver may
	# still expose the previous page's title, which made the message name
	# the wrong page.
	page_title = driver.title
	if loaded:
		print(f"The page \"{page_title}\" is fully loaded.\n")
	else:
		print(f"The page \"{page_title}\" did not get fully loaded within the given duration.\n")
	return loaded


from selenium.common.exceptions import TimeoutException

url = "https://finance.yahoo.com/"

# The cell indexes used below (up to values[9]) need at least 10 <td> cells.
MIN_CELLS_PER_ROW = 10

# Created outside the try block so rows scraped before a failure are kept.
data = []

try:
	driver.get(url)
	wait_for_page_to_load(driver, wait)

	# hover over the Markets menu so its dropdown becomes visible
	# NOTE(review): the absolute XPaths below depend on the exact page layout
	# and will need updating whenever the site markup changes.
	actions = ActionChains(driver)
	markets_menu = wait.until(
		EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'))
	)
	actions.move_to_element(markets_menu).perform()

	# click on Trending Tickers
	trending_tickers = wait.until(
		EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'))
	)
	trending_tickers.click()
	wait_for_page_to_load(driver, wait)

	# click on Most Active
	most_active = wait.until(
		EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'))
	)
	most_active.click()
	wait_for_page_to_load(driver, wait)

	# scrape every page of the table
	while True:
		wait.until(
			EC.presence_of_element_located((By.TAG_NAME, "table"))
		)
		rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
		for row in rows:
			values = row.find_elements(By.TAG_NAME, "td")
			# Skip rows that do not carry the full set of cells instead of
			# aborting the whole run with an IndexError.
			if len(values) < MIN_CELLS_PER_ROW:
				continue
			stock = {
				"name": values[1].text,
				"symbol": values[0].text,
				"price": values[3].text,
				"change": values[4].text,
				"volume": values[6].text,
				"market_cap": values[8].text,
				"pe_ratio": values[9].text,
			}
			data.append(stock)

		# click next; a timeout here is the expected "last page" signal,
		# any other error is a real failure and is allowed to propagate
		try:
			next_button = wait.until(
				EC.element_to_be_clickable((By.XPATH, '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'))
			)
		except TimeoutException:
			print("The \"next\" button is not clickable. We have navigated through all the pages.")
			break
		else:
			next_button.click()
			# give the table a moment to show the next page
			time.sleep(1)
finally:
	# Always close the browser, even when a locator times out mid-run.
	driver.quit()


The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Most Active Stocks: US stocks with the highest trading volume today - Yahoo Finance" is fully loaded.

The "next" button is not clickable. We have navigated through all the pages.


In [25]:
data

[{'name': 'NVIDIA Corporation',
  'symbol': 'NVDA',
  'price': '110.93',
  'change': '+3.36',
  'volume': '313.417M',
  'market_cap': '2.707T',
  'pe_ratio': '37.73'},
 {'name': 'Lucid Group, Inc.',
  'symbol': 'LCID',
  'price': '2.5000',
  'change': '-0.0200',
  'volume': '162.01M',
  'market_cap': '7.579B',
  'pe_ratio': '-'},
 {'name': 'Ford Motor Company',
  'symbol': 'F',
  'price': '9.33',
  'change': '+0.19',
  'volume': '139.924M',
  'market_cap': '37.101B',
  'pe_ratio': '6.39'},
 {'name': 'Tesla, Inc.',
  'symbol': 'TSLA',
  'price': '252.24',
  'change': '-0.16',
  'volume': '128.948M',
  'market_cap': '811.333B',
  'pe_ratio': '123.65'},
 {'name': 'Intel Corporation',
  'symbol': 'INTC',
  'price': '19.74',
  'change': '-0.14',
  'volume': '128.426M',
  'market_cap': '86.078B',
  'pe_ratio': '-'},
 {'name': 'Palantir Technologies Inc.',
  'symbol': 'PLTR',
  'price': '88.55',
  'change': '-0.04',
  'volume': '95.131M',
  'market_cap': '207.682B',
  'pe_ratio': '466.05'},
 

In [26]:
len(data)

351

In [27]:
# Data Cleaning

# Scale factors for the magnitude suffixes that can follow a number.
_UNIT_MULTIPLIERS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

# Text treated as "no value". Only "-" appears in the scraped sample;
# the other markers are defensive.
_MISSING_MARKERS = {"", "-", "--"}


def _to_float(text):
	"""Convert text such as '+3.36' or '1,234.5' to float (NaN if missing)."""
	cleaned = str(text).strip().replace(",", "").replace("+", "")
	if cleaned in _MISSING_MARKERS:
		return float("nan")
	return float(cleaned)


def _to_scaled_float(text, unit):
	"""Convert text such as '2.707T' to a float expressed in ``unit``.

	``unit`` is one of the keys of ``_UNIT_MULTIPLIERS``. A value without a
	suffix is taken as a plain count and divided by the unit's multiplier.
	"""
	cleaned = str(text).strip().replace(",", "")
	if cleaned in _MISSING_MARKERS:
		return float("nan")
	suffix = cleaned[-1].upper()
	if suffix in _UNIT_MULTIPLIERS:
		# Divide the multipliers first so that same-unit input ("M" -> "M")
		# is multiplied by exactly 1.0 and "T" -> "B" by exactly 1000.0.
		return float(cleaned[:-1]) * (_UNIT_MULTIPLIERS[suffix] / _UNIT_MULTIPLIERS[unit])
	return float(cleaned) / _UNIT_MULTIPLIERS[unit]


stocks_df = (
	pd
	# Naming the columns keeps the pipeline working when `data` is empty.
	.DataFrame(
		data,
		columns=["name", "symbol", "price", "change", "volume", "market_cap", "pe_ratio"],
	)
	.assign(
		name=lambda df_: df_["name"].str.strip(),
		symbol=lambda df_: df_["symbol"].str.strip(),
		price=lambda df_: df_["price"].map(_to_float).astype(float),
		change=lambda df_: df_["change"].map(_to_float).astype(float),
		volume=lambda df_: df_["volume"].map(lambda val: _to_scaled_float(val, "M")).astype(float),
		market_cap=lambda df_: df_["market_cap"].map(lambda val: _to_scaled_float(val, "B")).astype(float),
		pe_ratio=lambda df_: df_["pe_ratio"].map(_to_float).astype(float),
	)
	.rename(columns={
		"price": "price_usd",
		"volume": "volume_M",
		"market_cap": "market_cap_B"
	})
)

stocks_df.to_excel("yahoo-stocks-data.xlsx", index=False)

In [None]:
# Check whether the `change` column contains any character other than the
# digits 0-9 or a dot: extract the first such character from every value and
# list the distinct results.
# NOTE(review): `.str` only works on text values, so this presumably has to
# run on the raw scraped strings, before `change` is converted to numbers.
(
	stocks_df.change
	.str.extract(r"([^0-9.])", expand=False)
	.unique()
)

In [29]:
# Restructuring the code above into a reusable class:

import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

class StocksScraper:
	"""Scrape the "Most Active" stocks table from Yahoo Finance with Selenium.

	Intended call order: ``access_url`` -> ``access_most_active_stocks`` ->
	``extract_stocks_data`` -> ``clean_and_save_data``.
	"""

	# Locators used for navigation and pagination.
	# NOTE(review): these absolute XPaths depend on the exact page layout and
	# will need updating whenever the site markup changes.
	MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
	TRENDING_TICKERS_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
	MOST_ACTIVE_XPATH = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'
	NEXT_BUTTON_XPATH = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'

	# Scraped field -> index of the <td> cell that holds it in a table row.
	COLUMN_INDEXES = {
		"name": 1,
		"symbol": 0,
		"price": 3,
		"change": 4,
		"volume": 6,
		"market_cap": 8,
		"pe_ratio": 9,
	}

	# Scale factors for the magnitude suffixes that can follow a number.
	_UNIT_MULTIPLIERS = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}

	# Text treated as "no value". Only "-" appears in the scraped sample;
	# the other markers are defensive.
	_MISSING_MARKERS = frozenset({"", "-", "--"})

	def __init__(self, driver, timeout=10):
		"""Store the driver and build the explicit wait shared by all steps.

		Args:
			driver: Selenium WebDriver used for every browser interaction.
			timeout: timeout passed to ``WebDriverWait``.
		"""
		self.driver = driver
		self.wait = WebDriverWait(self.driver, timeout=timeout)
		# Raw scraped rows: one dict of strings per stock.
		self.data = []

	def wait_for_page_to_load(self):
		"""Wait until the document reports ``readyState == "complete"``.

		Prints a message naming the page and saying whether it loaded.

		Returns:
			bool: True when the condition was met, False when the wait
			raised (for example because it timed out).
		"""
		try:
			self.wait.until(
				lambda d: d.execute_script("return document.readyState") == "complete"
			)
		except Exception:
			# Best-effort probe: report and continue. ``Exception`` (not a
			# bare ``except``) keeps Ctrl-C working.
			loaded = False
		else:
			loaded = True

		# Read the title only AFTER waiting; reading it first made the
		# message name the previous page right after a navigation click.
		page_title = self.driver.title
		if loaded:
			print(f"The page \"{page_title}\" is fully loaded.\n")
		else:
			print(f"The page \"{page_title}\" did not get fully loaded within the given duration.\n")
		return loaded

	def access_url(self, url):
		"""Open ``url`` and wait for the page to finish loading."""
		self.driver.get(url)
		self.wait_for_page_to_load()

	def access_most_active_stocks(self):
		"""Navigate Markets -> Trending Tickers -> Most Active.

		Raises:
			selenium.common.exceptions.TimeoutException: when one of the
			menu elements cannot be located within the wait timeout.
		"""
		# hover over the Markets menu so its dropdown becomes visible
		markets_menu = self.wait.until(
			EC.presence_of_element_located((By.XPATH, self.MARKETS_MENU_XPATH))
		)
		ActionChains(self.driver).move_to_element(markets_menu).perform()

		# click on Trending Tickers
		trending_tickers = self.wait.until(
			EC.element_to_be_clickable((By.XPATH, self.TRENDING_TICKERS_XPATH))
		)
		trending_tickers.click()
		self.wait_for_page_to_load()

		# click on Most Active
		most_active = self.wait.until(
			EC.element_to_be_clickable((By.XPATH, self.MOST_ACTIVE_XPATH))
		)
		most_active.click()
		self.wait_for_page_to_load()

	def extract_stocks_data(self, page_delay=1):
		"""Append every table row from every page to ``self.data``.

		Args:
			page_delay: seconds to sleep after clicking "next" so the table
				can show the following page.
		"""
		# Local import keeps the class usable with the imports already
		# present at the top of the file.
		from selenium.common.exceptions import TimeoutException

		min_cells = max(self.COLUMN_INDEXES.values()) + 1
		while True:
			self.wait.until(
				EC.presence_of_element_located((By.TAG_NAME, "table"))
			)
			for row in self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr"):
				cells = row.find_elements(By.TAG_NAME, "td")
				# Skip rows without the full set of cells instead of
				# aborting the whole run with an IndexError.
				if len(cells) < min_cells:
					continue
				self.data.append(
					{field: cells[index].text for field, index in self.COLUMN_INDEXES.items()}
				)

			# A timeout here is the expected "last page" signal; any other
			# error is a real failure and is allowed to propagate.
			try:
				next_button = self.wait.until(
					EC.element_to_be_clickable((By.XPATH, self.NEXT_BUTTON_XPATH))
				)
			except TimeoutException:
				print("The \"next\" button is not clickable. We have navigated through all the pages.")
				break
			next_button.click()
			time.sleep(page_delay)

	@classmethod
	def _to_float(cls, text):
		"""Convert text such as '+3.36' or '1,234.5' to float (NaN if missing)."""
		cleaned = str(text).strip().replace(",", "").replace("+", "")
		if cleaned in cls._MISSING_MARKERS:
			return float("nan")
		return float(cleaned)

	@classmethod
	def _to_scaled_float(cls, text, unit):
		"""Convert text such as '2.707T' to a float expressed in ``unit``.

		``unit`` is one of the keys of ``_UNIT_MULTIPLIERS``. A value without
		a suffix is taken as a plain count and divided by the unit's
		multiplier.
		"""
		cleaned = str(text).strip().replace(",", "")
		if cleaned in cls._MISSING_MARKERS:
			return float("nan")
		suffix = cleaned[-1].upper()
		multipliers = cls._UNIT_MULTIPLIERS
		if suffix in multipliers:
			# Divide the multipliers first so that same-unit input is
			# multiplied by exactly 1.0 and "T" -> "B" by exactly 1000.0.
			return float(cleaned[:-1]) * (multipliers[suffix] / multipliers[unit])
		return float(cleaned) / multipliers[unit]

	def clean_data(self):
		"""Return ``self.data`` as a typed DataFrame.

		Columns: ``name``, ``symbol``, ``price_usd``, ``change``,
		``volume_M`` (millions), ``market_cap_B`` (billions), ``pe_ratio``.
		Missing values become NaN. Works on empty ``self.data`` too.
		"""
		# Naming the columns keeps this working when nothing was scraped.
		raw = pd.DataFrame(self.data, columns=list(self.COLUMN_INDEXES))
		return pd.DataFrame({
			"name": raw["name"].str.strip(),
			"symbol": raw["symbol"].str.strip(),
			"price_usd": raw["price"].map(self._to_float).astype(float),
			"change": raw["change"].map(self._to_float).astype(float),
			"volume_M": raw["volume"].map(lambda val: self._to_scaled_float(val, "M")).astype(float),
			"market_cap_B": raw["market_cap"].map(lambda val: self._to_scaled_float(val, "B")).astype(float),
			"pe_ratio": raw["pe_ratio"].map(self._to_float).astype(float),
		})

	def clean_and_save_data(self, filename="temp"):
		"""Clean the scraped rows and write them to ``<filename>.xlsx``.

		Args:
			filename: output path without the ``.xlsx`` extension.

		Returns:
			The cleaned DataFrame that was written.
		"""
		stocks_df = self.clean_data()
		stocks_df.to_excel(f"{filename}.xlsx", index=False)
		return stocks_df


if __name__ == "__main__":
	# Run the full pipeline: open Yahoo Finance, navigate to "Most Active",
	# scrape every page, then clean the rows and save them to Excel.
	url = "https://finance.yahoo.com/"
	driver = webdriver.Chrome()

	# try/finally guarantees the browser is closed even when a step raises
	# (for example a TimeoutException while locating a menu element).
	try:
		driver.maximize_window()
		scraper = StocksScraper(driver, 5)

		scraper.access_url(url)
		scraper.access_most_active_stocks()
		scraper.extract_stocks_data()
		scraper.clean_and_save_data("yahoo_finance-stocks")
	finally:
		driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.



TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF786801F55+78133]
	GetHandleVerifier [0x00007FF786801FB0+78224]
	(No symbol) [0x00007FF7865C91BA]
	(No symbol) [0x00007FF78661F19D]
	(No symbol) [0x00007FF78661F44C]
	(No symbol) [0x00007FF7866723D7]
	(No symbol) [0x00007FF78664719F]
	(No symbol) [0x00007FF78666F21F]
	(No symbol) [0x00007FF786646F33]
	(No symbol) [0x00007FF786610358]
	(No symbol) [0x00007FF7866110C3]
	GetHandleVerifier [0x00007FF786ACBA8D+3001453]
	GetHandleVerifier [0x00007FF786AC5E72+2977874]
	GetHandleVerifier [0x00007FF786AE497D+3103581]
	GetHandleVerifier [0x00007FF78681C7EA+186826]
	GetHandleVerifier [0x00007FF7868243FF+218591]
	GetHandleVerifier [0x00007FF786809D94+110452]
	GetHandleVerifier [0x00007FF786809F42+110882]
	GetHandleVerifier [0x00007FF7867F0379+5465]
	BaseThreadInitThunk [0x00007FFC6B347614+20]
	RtlUserThreadStart [0x00007FFC6D1426A1+33]
