In [48]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np

In [3]:
driver = webdriver.Chrome()
driver.maximize_window()

# Explicit Wait

wait = WebDriverWait(driver,3)

url = 'https://finance.yahoo.com/'
driver.get(url)

def wait_for_page_to_load(driver,wait):
    
    page_title = driver.title
    
    try :
        wait.until(lambda d:d.execute_script("return document.readyState")=='complete')
    except:
        print(f"Page {page_title} not loaded withhin given time")
    else:
        print(f"Page {page_title} loaded successfully")
        
wait_for_page_to_load(driver,wait)


"""     Step 1: wait.until(...)

wait is an instance of WebDriverWait(driver, timeout), which waits for a condition to be met before proceeding.

until(...) keeps checking a condition at regular intervals until it becomes True or the timeout is reached. 


        Step 2: lambda d: d.execute_script(...)

This is an anonymous function (lambda function) that takes d (the WebDriver instance) as an argument.

It executes JavaScript code in the browser using d.execute_script(...).

        Step 3: d.execute_script("return document.readyState")

execute_script runs JavaScript code in the browser.

        Step 4: Checking if document.readyState == "complete"

The condition inside until(...) evaluates:

"""

# Hovering the Market Menu

actions = ActionChains(driver)

# We first need to check whether market menu is available or loaded within wait time or not so we use exected condition

market_menu = wait.until(EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]')))

# Once the Selenium able to locate the market menu on yahoo finance web site we want to hover to the market menu

actions.move_to_element(market_menu).perform()

# Clicking on Trending Tickers

# Before clicking on Trending Tickers, we need make sure that trending tickers element is clickable

trending_ticker = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/trending-tickers')]")))
trending_ticker.click()

wait_for_page_to_load(driver, wait)

# We will be redirected to new page that contains trending tickers(stocks)
# Now, we need to see the most acvtive stocks so let's check if most active element is clickable

most_active = wait.until(EC.element_to_be_clickable((By.XPATH,"//span[text()='Most Active']")))
most_active.click()


# Extracting the Page Section Title (e.g., "Most Active Stocks")

try:
    section_title = wait.until(EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'))).text 
    print(f"Page {section_title} Stocks loaded successfully")

except:
    print("Could not find the Most Active Stocks page title.")
    
# We need to Extract the stocks data from the Most Active Page
# There are 25 stocks in each page and we have total 253 stocks as of now
# So we need to click on next page until we are done with all the pages

# Scraping the data

# 1. Scrape the data
    # 1A. Check if the table is present
    # 1B. If available then scrap the data
# 2. Check if the arrow is clickable that navigate to the next page
# 3. If clickable then click

data = []

counter = 0

while True:

    wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    
    for row in rows:

        stock_details = row.find_elements(By.TAG_NAME,"td")
        
        most_active_stocks = [data.text for data in stock_details]

        # print(most_active_stocks)
        
        stocks = {"Company_Name":most_active_stocks[1],
                "Symbol": most_active_stocks[0],
			    "Price": most_active_stocks[3],
			    "Change": most_active_stocks[4],
                "Change_in_pct": most_active_stocks[5],
			    "Volume": most_active_stocks[6],
                "Avg_Vol_Per_3M":most_active_stocks[7],
			    "Market_Cap": most_active_stocks[8],
			    "PE_Ratio": most_active_stocks[9],
		}

        data.append(stocks)

        # Check for the Next Button

    try :
        next_button = wait.until(EC.element_to_be_clickable((By.XPATH,'//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]')))
    
    except:
        print(f"The next_button is not clickable anymore. We have Scarped the data from the all the pages!")
        break
    
    # Click onto the Next Button
    
    else:
        next_button.click()
        time.sleep(2)
        counter +=1
        print(f"Successfully Scarped the Most Active Stock data from the page: {counter} ")

driver.close()

Page Yahoo Finance - Stock Market Live, Quotes, Business & Finance News loaded successfully
Page Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance loaded successfully
Page Most Active Stocks loaded successfully
Successfully Scarped the Most Active Stock data from the page: 1 
Successfully Scarped the Most Active Stock data from the page: 2 
Successfully Scarped the Most Active Stock data from the page: 3 
Successfully Scarped the Most Active Stock data from the page: 4 
Successfully Scarped the Most Active Stock data from the page: 5 
Successfully Scarped the Most Active Stock data from the page: 6 
Successfully Scarped the Most Active Stock data from the page: 7 
Successfully Scarped the Most Active Stock data from the page: 8 
Successfully Scarped the Most Active Stock data from the page: 9 
Successfully Scarped the Most Active Stock data from the page: 10 
The next_button is not clickable anymore. We have Scarped the data from the all the pages!


In [9]:
data

[{'Company_Name': 'Walmart Inc.',
  'Symbol': 'WMT',
  'Price': '109.67',
  'Change': '-1.76',
  'Change_in_pct': '-1.58%',
  'Volume': '227.836M',
  'Avg_Vol_Per_3M': '278.992M',
  'Market_Cap': '2.676T',
  'PE_Ratio': '37.30'},
 {'Company_Name': 'Ford Motor Company',
  'Symbol': 'F',
  'Price': '9.72',
  'Change': '-0.18',
  'Change_in_pct': '-1.82%',
  'Volume': '131.42M',
  'Avg_Vol_Per_3M': '98.88M',
  'Market_Cap': '38.652B',
  'PE_Ratio': '6.66'},
 {'Company_Name': 'Tesla, Inc.',
  'Symbol': 'TSLA',
  'Price': '263.55',
  'Change': '-9.58',
  'Change_in_pct': '-3.51%',
  'Volume': '122.382M',
  'Avg_Vol_Per_3M': '95.897M',
  'Market_Cap': '847.714B',
  'PE_Ratio': '129.83'},
 {'Company_Name': 'Palantir Technologies Inc.',
  'Symbol': 'PLTR',
  'Price': '85.85',
  'Change': '-4.24',
  'Change_in_pct': '-4.71%',
  'Volume': '90.381M',
  'Avg_Vol_Per_3M': '98.076M',
  'Market_Cap': '201.349B',
  'PE_Ratio': '451.84'},
 {'Company_Name': 'NIO Inc.',
  'Symbol': 'NIO',
  'Price': '3.7

In [10]:
len(data)

242

In [11]:
stocks_df = pd.DataFrame(
    data= data
)

stocks_df

Unnamed: 0,Company_Name,Symbol,Price,Change,Change_in_pct,Volume,Avg_Vol_Per_3M,Market_Cap,PE_Ratio
0,Walmart Inc.,WMT,109.67,-1.76,-1.58%,227.836M,278.992M,2.676T,37.30
1,Ford Motor Company,F,9.72,-0.18,-1.82%,131.42M,98.88M,38.652B,6.66
2,"Tesla, Inc.",TSLA,263.55,-9.58,-3.51%,122.382M,95.897M,847.714B,129.83
3,Palantir Technologies Inc.,PLTR,85.85,-4.24,-4.71%,90.381M,98.076M,201.349B,451.84
4,NIO Inc.,NIO,3.7500,-0.2300,-5.78%,78.4M,57.294M,8.132B,-
...,...,...,...,...,...,...,...,...,...
237,"The Interpublic Group of Companies, Inc.",IPG,26.29,-0.11,-0.42%,5.08M,6.747M,9.797B,14.37
238,Full Truck Alliance Co. Ltd.,YMM,12.90,-0.38,-2.86%,5.049M,7.728M,13.49B,31.46
239,"Moderna, Inc.",MRNA,31.12,-1.00,-3.11%,5.679M,11.432M,12.032B,-
240,"Conagra Brands, Inc.",CAG,26.55,+0.09,+0.34%,5.048M,5.917M,12.673B,25.78


In [12]:
stocks_df.dtypes

Company_Name      object
Symbol            object
Price             object
Change            object
Change_in_pct     object
Volume            object
Avg_Vol_Per_3M    object
Market_Cap        object
PE_Ratio          object
dtype: object

##### 1. Removing the Extra Spaces from the column name if available

##### 2. Rename the Price Column to Price_USD

##### 3. Checking the Data Quality of the Price_USD column. If there are any inconsistence value then remove it

In [55]:
stocks_df = (pd
        .DataFrame(data= data)
        .apply(lambda col:col.str.strip() if col.dtype =='object' else col)
        .assign(
            Price_USD       =  lambda df_:pd.to_numeric(df_.Price) ,
            Change          =  lambda df_:pd.to_numeric(df_.Change.str.replace("+","")) ,        # lets remove + sign from the change column as it is redundant
            Change_in_pct   =  lambda df_:pd.to_numeric(df_.Change_in_pct.str.replace("+","").str.replace("%","")),
            Volume          =  lambda df_:pd.to_numeric(df_.Volume.str.replace("M","")),
            Avg_Vol_Per_3M  =  lambda df_:pd.to_numeric(df_.Avg_Vol_Per_3M.str.replace("M","").str.replace(",","")),
            Market_Cap      =  lambda df_:df_.Market_Cap.apply(lambda value: float(value.replace("B","")) if "B" in value else float(value.replace("T","")) * 1000),
            PE_Ratio        =  lambda df_:pd.to_numeric(df_.PE_Ratio.replace("-",np.nan).str.replace(",",""))                                  
            )
        .rename(columns={'Price':'Price_USD',"Volume":"Volume_in_Millions","Market_Cap":"Market_Cap_in_Billions"})
        
        
       

)
#stocks_df.Market_Cap         #.Change.str.extract(r"([^0-9.])",expand= False).unique()
#stocks_df          .Price_USD.str.contains("[^0-9.]",regex= True).sum()
#stocks_df.Market_Cap.str.extract(r"([^0-9.])",expand= False).unique()
#stocks_df.PE_Ratio.str.extract(r"([^0-9.])",expand= False).unique()
stocks_df


  Change          =  lambda df_:pd.to_numeric(df_.Change.str.replace("+","")) ,        # lets remove + sign from the change column as it is redundant
  Change_in_pct   =  lambda df_:pd.to_numeric(df_.Change_in_pct.str.replace("+","").str.replace("%","")),


Unnamed: 0,Company_Name,Symbol,Price_USD,Change,Change_in_pct,Volume_in_Millions,Avg_Vol_Per_3M,Market_Cap_in_Billions,PE_Ratio,Price_USD.1
0,Walmart Inc.,WMT,109.67,-1.76,-1.58,227.836,278.992,2676.000,37.30,109.67
1,Ford Motor Company,F,9.72,-0.18,-1.82,131.420,98.880,38.652,6.66,9.72
2,"Tesla, Inc.",TSLA,263.55,-9.58,-3.51,122.382,95.897,847.714,129.83,263.55
3,Palantir Technologies Inc.,PLTR,85.85,-4.24,-4.71,90.381,98.076,201.349,451.84,85.85
4,NIO Inc.,NIO,3.7500,-0.23,-5.78,78.400,57.294,8.132,,3.75
...,...,...,...,...,...,...,...,...,...,...
237,"The Interpublic Group of Companies, Inc.",IPG,26.29,-0.11,-0.42,5.080,6.747,9.797,14.37,26.29
238,Full Truck Alliance Co. Ltd.,YMM,12.90,-0.38,-2.86,5.049,7.728,13.490,31.46,12.90
239,"Moderna, Inc.",MRNA,31.12,-1.00,-3.11,5.679,11.432,12.032,,31.12
240,"Conagra Brands, Inc.",CAG,26.55,0.09,0.34,5.048,5.917,12.673,25.78,26.55
