<a href="https://colab.research.google.com/github/Luccarodriguezk/Webscraping-project/blob/main/Web_scraping_project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Yahoo Finance quote page for Apple and parse it.
url = 'https://finance.yahoo.com/quote/AAPL/'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')  # parse tree built from the raw HTML bytes
# find('Volume') looks for a tag *named* <Volume>, which an HTML page does not
# contain, so it always gave None. Search the page text for "Volume" instead.
results = soup.find(string=re.compile('Volume'))


In [3]:
# Read the text inside the page's <title> tag and show it.
page_title = soup.find('title').string
print(f"Page Title: {page_title}")

Page Title: Apple Inc. (AAPL) Stock Price, News, Quote & History - Yahoo Finance


In [4]:
# Collect every <h2 class="headline"> element and print its text, one per line.
headlines = soup.find_all(name='h2', class_='headline')
print("\nHeadlines:")
for headline in headlines:
    print(headline.get_text().strip())


Headlines:


In [5]:
def scrape_stock_volume(ticker):
    """Scrape the trading volume shown on a ticker's Yahoo Finance quote page.

    Args:
        ticker: Stock symbol inserted into the quote URL, e.g. 'AAPL'.

    Returns:
        The volume as an int when the volume element exists and its text
        contains digits; otherwise the string "Volume data not found".
    """
    # Step 1: Set up the URL and send a GET request
    url = f"https://finance.yahoo.com/quote/{ticker}/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)

    # Step 2: Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Step 3: Find the volume data
    volume_element = soup.find('fin-streamer', {'data-test': 'TD_VOLUME-value'})
    if not volume_element:
        return "Volume data not found"

    # Step 4: Keep only the digits (drops thousands separators and whitespace)
    digits = re.sub(r'[^\d]', '', volume_element.text.strip())
    # A placeholder such as "N/A" leaves no digits, and int('') would raise
    # ValueError, so report it the same way as a missing element.
    if not digits:
        return "Volume data not found"
    return int(digits)

# Step 5: Look up the volume for Apple and display whatever came back
# (either a number or a "not found" message).
aapl_volume = scrape_stock_volume(ticker='AAPL')
print(f"AAPL Stock Volume: {aapl_volume}")

AAPL Stock Volume: Volume data not found


In [6]:
def scrape_stock_volume(ticker):
    """Read a ticker's volume from the JSON embedded in its Yahoo Finance page.

    The page is searched for a <script> whose text mentions ``root.App.main``;
    the JSON object assigned to that name is decoded and the volume is read
    from ``QuoteSummaryStore -> summaryDetail -> volume -> raw``.

    Args:
        ticker: Stock symbol inserted into the quote URL, e.g. 'AAPL'.

    Returns:
        The volume as an int on success. On any failure a human-readable
        message string is returned instead (no exception escapes).
    """
    url = f"https://finance.yahoo.com/quote/{ticker}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for the script tag containing the stock data.
        # `string=` is the current name of the keyword; `text=` is deprecated
        # and triggers a warning.
        script = soup.find('script', string=re.compile(r'root\.App\.main'))
        if not script:
            return "Script containing stock data not found"

        script_text = script.string
        # Locate where the JSON object starts, then let the JSON decoder find
        # where it ends. A regex such as `({.*?});` stops at the first "};"
        # it meets, which may sit inside the payload and truncate it.
        assignment = re.search(r'root\.App\.main\s*=\s*(?=\{)', script_text)
        if assignment is None:
            return "Error parsing JSON data"
        data, _end = json.JSONDecoder().raw_decode(script_text[assignment.end():])

        # Navigate through the JSON structure to find the volume
        try:
            volume = data['context']['dispatcher']['stores']['QuoteSummaryStore']['summaryDetail']['volume']['raw']
            return int(volume)
        except (KeyError, TypeError):
            # TypeError covers an intermediate level that is not a dict (e.g. null).
            return "Volume data not found in JSON structure"

    except requests.RequestException as e:
        return f"Error fetching data: {str(e)}"
    except json.JSONDecodeError:
        return "Error parsing JSON data"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"

# Ask for Apple's volume and print the outcome (a number or an error message).
aapl_volume = scrape_stock_volume(ticker='AAPL')
print(f"AAPL Stock Volume: {aapl_volume}")

AAPL Stock Volume: Script containing stock data not found


  script = soup.find('script', text=re.compile('root.App.main'))


In [7]:
# Let's try an easier website.
url1 = "https://forecast.weather.gov/MapClick.php?CityName=Cheney&state=WA&site=OTX&textField1=47.4875&textField2=-117.575&e=0"
response1 = requests.get(url1)
soup1 = BeautifulSoup(response1.content, features='html.parser')
# Grab the text of the page's <title> tag.
Title = soup1.find('title').string
print(f"Title: {Title}")


Title: National Weather Service


In [8]:
def scrape_humidity(url):
    """Return the humidity value shown on a weather page.

    Looks for a <td> whose text is exactly 'Humidity' and reads the text of
    the next <td> sibling.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of the value cell on success;
        "Humidity information not found" when the label cell or the cell
        after it is missing; or a "Failed to retrieve the webpage..." message
        when the HTTP status is not 200.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)

    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    soup = BeautifulSoup(response.content, 'html.parser')  # parse data
    Humidity_element = soup.find('td', string='Humidity')  # find the label cell
    # The label may exist without a following <td>; find_next_sibling then
    # gives None, and reading .text on it would raise AttributeError.
    value_cell = Humidity_element.find_next_sibling('td') if Humidity_element else None
    if value_cell is None:
        return "Humidity information not found"
    return value_cell.text.strip()

# Forecast page for Cheney, WA; fetch its humidity reading and show it.
url = "https://forecast.weather.gov/MapClick.php?CityName=Cheney&state=WA&site=OTX&textField1=47.4875&textField2=-117.575&e=0"
Humidity = scrape_humidity(url=url)
print(f'current Humidity {Humidity}')

current Humidity 44%


In [9]:

def scrape_dewpoint(url):
    """Return the dewpoint value shown on a weather page.

    Looks for a <td> whose text is exactly 'Dewpoint' and reads the text of
    the next <td> sibling.

    Args:
        url: Address of the page to download.

    Returns:
        The stripped text of the value cell on success;
        "Dewpoint information not found" when the label cell or the cell
        after it is missing; or a "Failed to retrieve the webpage..." message
        when the HTTP status is not 200.
    """
    # Send a GET request to the webpage. Without a timeout, requests waits
    # indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)

    # Check if the request was successful
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the dewpoint information
    dewpoint_element = soup.find('td', string='Dewpoint')
    # The label may exist without a following <td>; find_next_sibling then
    # gives None, and reading .text on it would raise AttributeError.
    value_cell = dewpoint_element.find_next_sibling('td') if dewpoint_element else None
    if value_cell is None:
        return "Dewpoint information not found"
    return value_cell.text.strip()

# Forecast page for Cheney, WA
url = 'https://forecast.weather.gov/MapClick.php?CityName=Cheney&state=WA&site=OTX&textField1=47.4875&textField2=-117.575&e=0'

# Fetch the dewpoint reading from that page and show it
dewpoint = scrape_dewpoint(url=url)
print(f"Current Dewpoint: {dewpoint}")

Current Dewpoint: 32°F (0°C)


In [10]:
# Disabled draft of get_first_search_result, kept as a triple-quoted string
# rather than live code. Nothing in it runs; because the string is the cell's
# only expression, the notebook simply echoes it back as the cell output.
'''
def get_first_search_result(query):
    # Format the query string for a search
    query = query.replace(' ', '+')
    a_url = f"https://www.google.com/search?q={query}"

    # Send a GET request to DuckDuckGo
    response = requests.get(a_url)
    if response.status_code == 200:
        # Parse the search result page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the first search result link
        result = soup.find('a', {'class': 'result__a'})
        if result:
            return result['href']
        else:
            return "No search results found."
    else:
        return f"Failed to retrieve search results. Status code: {response.status_code}"

# Example usage
query = "National+Weather+Service+cheney"
url = get_first_search_result(query)
print(f"First search result: {url}")'''

'\ndef get_first_search_result(query):\n    # Format the query string for a search\n    query = query.replace(\' \', \'+\')\n    a_url = f"https://www.google.com/search?q={query}"\n\n    # Send a GET request to DuckDuckGo\n    response = requests.get(a_url)\n    if response.status_code == 200:\n        # Parse the search result page\n        soup = BeautifulSoup(response.content, \'html.parser\')\n\n        # Find the first search result link\n        result = soup.find(\'a\', {\'class\': \'result__a\'})\n        if result:\n            return result[\'href\']\n        else:\n            return "No search results found."\n    else:\n        return f"Failed to retrieve search results. Status code: {response.status_code}"\n\n# Example usage\nquery = "National+Weather+Service+cheney"\nurl = get_first_search_result(query)\nprint(f"First search result: {url}")'

In [11]:
import requests
from bs4 import BeautifulSoup

def get_first_search_result(query):
    """Return the link of the first Google result for a search query.

    Args:
        query: Search terms. Spaces (or '+' signs, which mean the same thing
            in a query string) separate words.

    Returns:
        The ``href`` of the first <a> inside a ``div.yuRUbf`` element;
        "No search results found." when that element, its link, or the
        link's href is missing; or a "Failed to retrieve search results..."
        message when the HTTP status is not 200.
    """
    # Format the query string for a search: spaces become '+', and every
    # other reserved character is percent-encoded so that, for example, an
    # '&' or '#' in the query cannot cut the q= parameter short. '+' is kept
    # as-is so already plus-joined queries behave as before.
    encoded_query = quote(query.replace(' ', '+'), safe='+')
    url = f"https://www.google.com/search?q={encoded_query}"

    # Send a GET request to Google. Without a timeout, requests waits
    # indefinitely on an unresponsive server.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        return f"Failed to retrieve search results. Status code: {response.status_code}"

    # Parse the search result page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the first search result link
    result = soup.find('div', class_='yuRUbf')
    link = result.find('a') if result else None
    # .get() avoids a KeyError for an anchor that carries no href attribute.
    href = link.get('href') if link is not None else None
    if not href:
        return "No search results found."
    return href

# Example usage: search for the Cheney weather page and show the first hit.
# Note that this rebinds the notebook-level name `url` to the result.
query = 'National Weather Service cheney'
url = get_first_search_result(query=query)
print(f"First search result: {url}")


First search result: https://forecast.weather.gov/zipcity.php?inputstring=Cheney,WA


In [14]:
# Searching for several values in a table of data.
Elements = ['Humidity', 'Dewpoint', 'Wind Speed', 'Visibility']


def Data_in_cheney(url):
    """Look up every label listed in ``Elements`` on the page at ``url``.

    For each name, finds the <td> whose text is exactly that name and reads
    the text of the next <td> sibling.

    Args:
        url: Address of the page to download.

    Returns:
        A list with one entry per name in ``Elements``, in the same order:
        the stripped text of the value cell, or "<name> information not found"
        when the label cell or the cell after it is missing. When the HTTP
        status is not 200, a single "Failed to retrieve the webpage..." string
        is returned instead of a list.
    """
    # The function performs its own request so that it really uses `url` and
    # exists regardless of whether an earlier request succeeded. Without a
    # timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:  # Check if the request was successful
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    output = []
    for name in Elements:
        label_cell = soup.find('td', string=name)
        value_cell = label_cell.find_next_sibling('td') if label_cell else None
        if value_cell is None:
            # Keep the list aligned with Elements instead of bailing out with
            # a bare string, which the printing loop would index per character.
            output.append(f"{name} information not found")
        else:
            output.append(value_cell.text.strip())
    return output


# NOTE(review): `url` is not assigned in this cell; it is whatever an earlier
# cell left behind (the search cell assigns its result to `url`) - confirm
# this is the intended weather page.
charcacteristc = Data_in_cheney(url)
if isinstance(charcacteristc, str):
    # The request failed; the function returned the failure message.
    print(charcacteristc)
else:
    for name, value in zip(Elements, charcacteristc):
        print(f"{name}: {value}")




Humidity: 44%
Dewpoint: 32°F (0°C)
Wind Speed: E 3 MPH
Visibility: NA
