### Scraping Political News on investing.com
##### Proceed in several rounds (indicated by R) for optimal use and cleaning

In [None]:
import pandas as pd

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

%load_ext autoreload
%autoreload 2

from selenium.webdriver.chrome.options import Options
# Build a ChromeOptions object and register each command-line switch on it.
chrome_options = webdriver.ChromeOptions()
for switch in ('--disable-dev-shm-usage', '--no-sandbox', 'disable-infobars'):
    chrome_options.add_argument(switch)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import pandas as pd

# Scrape the politics news listing: starting at url1, read headlines, dates,
# sources and URLs from each page and follow the 'Next' link.

# Chrome options for headless mode.
# The flag is passed as a command-line argument because the
# `options.headless` setter was deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless=new')

# The driver path goes through a Service object; the `executable_path`
# keyword of webdriver.Chrome was deprecated and later removed in Selenium 4.
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

# Data storage
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally guarantees the browser is closed even when navigation or
# scraping raises outside the per-page handler below.
try:
    url1 = 'https://www.investing.com/news/politics'
    browser.get(url1)

    # Initialize wait (each condition is polled for at most 1 second)
    wait = WebDriverWait(browser, 1)

    # Pages 1-1730
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # Result unused: this wait only confirms the article links are
            # present (URLs are read from the headline elements instead).
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; zip stops at the shortest input
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Locate 'Next'; a timeout here (presumably the last page) ends the loop
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            # find_elements returns an empty list when the overlay is absent
            # (find_element would raise NoSuchElementException and abort the loop).
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript instead of a native click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# NOTE(review): each column is filtered independently, so rows can fall out of
# alignment when only some fields of an entry are empty.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') returns None when the element has no href
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings used to drop unwanted entries (repeated ads) from the scraped lists.
# An entry is removed when any of these strings occurs inside it.
# Each substring is listed once; repeating one does not change the matching.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(marker not in text for marker in unwanted_headlines_substrings)
]

In [None]:
# Number of headlines currently held in data (shown as the cell result)
len(data['Headlines'])

In [None]:
# Display the current list of headlines for manual inspection
data['Headlines']

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(marker not in stamp for marker in unwanted_dates_substrings)
]

In [None]:
# Number of date strings currently held in data (shown as the cell result)
len(data['Dates'])

In [None]:
# Keep a source string only if none of the unwanted substrings occurs in it
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(marker not in byline for marker in unwanted_sources_substrings)
]

In [None]:
# Number of source strings currently held in data (shown as the cell result)
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it
data['URLs'] = [
    link
    for link in data['URLs']
    if all(marker not in link for marker in unwanted_urls_substrings)
]

In [None]:
# Discard every URL whose text ends in '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that start with the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Remove duplicates while preserving order.
# dict.fromkeys keeps the first occurrence of every key in insertion order,
# which does the job of a manual loop with a "seen" set.
unique_urls = list(dict.fromkeys(data['URLs']))

# Update the data['URLs'] with the list of unique URLs
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a table with one column per key of data.
# NOTE(review): pandas requires all column lists to have the same length.
df = pd.DataFrame.from_dict(data)

In [None]:
# Write the table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data1.xlsx', index=False)

In [None]:
# Scrape the politics news listing: starting at url1, read headlines, dates,
# sources and URLs from each page and follow the 'Next' link.

# Chrome options for headless mode.
# The flag is passed as a command-line argument because the
# `options.headless` setter was deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless=new')

# The driver path goes through a Service object; the `executable_path`
# keyword of webdriver.Chrome was deprecated and later removed in Selenium 4.
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

# Data storage
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally guarantees the browser is closed even when navigation or
# scraping raises outside the per-page handler below.
try:
    url1 = 'https://www.investing.com/news/politics/100'
    browser.get(url1)

    # Initialize wait (each condition is polled for at most 1 second)
    wait = WebDriverWait(browser, 1)

    # Pages 1-1730
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # Result unused: this wait only confirms the article links are
            # present (URLs are read from the headline elements instead).
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; zip stops at the shortest input
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Locate 'Next'; a timeout here (presumably the last page) ends the loop
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            # find_elements returns an empty list when the overlay is absent
            # (find_element would raise NoSuchElementException and abort the loop).
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript instead of a native click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# NOTE(review): each column is filtered independently, so rows can fall out of
# alignment when only some fields of an entry are empty.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') returns None when the element has no href
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings used to drop unwanted entries from the scraped lists.
# An entry is removed when any of these strings occurs inside it.
# Each substring is listed once; repeating one does not change the matching.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(marker not in text for marker in unwanted_headlines_substrings)
]

In [None]:
# Number of headlines currently held in data (shown as the cell result)
len(data['Headlines'])

In [None]:
# Display the current list of headlines for manual inspection
data['Headlines']

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(marker not in stamp for marker in unwanted_dates_substrings)
]

In [None]:
# Number of date strings currently held in data (shown as the cell result)
len(data['Dates'])

In [None]:
# Keep a source string only if none of the unwanted substrings occurs in it
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(marker not in byline for marker in unwanted_sources_substrings)
]

In [None]:
# Number of source strings currently held in data (shown as the cell result)
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it
data['URLs'] = [
    link
    for link in data['URLs']
    if all(marker not in link for marker in unwanted_urls_substrings)
]

In [None]:
# Discard every URL whose text ends in '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that start with the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Remove duplicates while preserving order.
# dict.fromkeys keeps the first occurrence of every key in insertion order,
# which does the job of a manual loop with a "seen" set.
unique_urls = list(dict.fromkeys(data['URLs']))

# Update the data['URLs'] with the list of unique URLs
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a table with one column per key of data.
# NOTE(review): pandas requires all column lists to have the same length.
df = pd.DataFrame.from_dict(data)

In [None]:
# Write the table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data2.xlsx', index=False)

##### R3

In [None]:
# Scrape the politics news listing: starting at url1, read headlines, dates,
# sources and URLs from each page and follow the 'Next' link.

# Chrome options for headless mode.
# The flag is passed as a command-line argument because the
# `options.headless` setter was deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless=new')

# The driver path goes through a Service object; the `executable_path`
# keyword of webdriver.Chrome was deprecated and later removed in Selenium 4.
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

# Data storage
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally guarantees the browser is closed even when navigation or
# scraping raises outside the per-page handler below.
try:
    url1 = 'https://www.investing.com/news/politics/200'
    browser.get(url1)

    # Initialize wait (each condition is polled for at most 1 second)
    wait = WebDriverWait(browser, 1)

    # Pages 1-1730
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # Result unused: this wait only confirms the article links are
            # present (URLs are read from the headline elements instead).
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; zip stops at the shortest input
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Locate 'Next'; a timeout here (presumably the last page) ends the loop
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            # find_elements returns an empty list when the overlay is absent
            # (find_element would raise NoSuchElementException and abort the loop).
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript instead of a native click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# NOTE(review): each column is filtered independently, so rows can fall out of
# alignment when only some fields of an entry are empty.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') returns None when the element has no href
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings used to drop unwanted entries from the scraped lists.
# An entry is removed when any of these strings occurs inside it.
# Each substring is listed once; repeating one does not change the matching.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(marker not in text for marker in unwanted_headlines_substrings)
]

In [None]:
# Number of headlines currently held in data (shown as the cell result)
len(data['Headlines'])

In [None]:
# Display the current list of headlines for manual inspection
data['Headlines']

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(marker not in stamp for marker in unwanted_dates_substrings)
]

In [None]:
# Number of date strings currently held in data (shown as the cell result)
len(data['Dates'])

In [None]:
# Keep a source string only if none of the unwanted substrings occurs in it
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(marker not in byline for marker in unwanted_sources_substrings)
]

In [None]:
# Number of source strings currently held in data (shown as the cell result)
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it
data['URLs'] = [
    link
    for link in data['URLs']
    if all(marker not in link for marker in unwanted_urls_substrings)
]

In [None]:
# Discard every URL whose text ends in '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that start with the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Remove duplicates while preserving order.
# dict.fromkeys keeps the first occurrence of every key in insertion order,
# which does the job of a manual loop with a "seen" set.
unique_urls = list(dict.fromkeys(data['URLs']))

# Update the data['URLs'] with the list of unique URLs
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a table with one column per key of data.
# NOTE(review): pandas requires all column lists to have the same length.
df = pd.DataFrame.from_dict(data)

In [None]:
# Write the table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data3.xlsx', index=False)

##### R4

In [None]:
# Scrape the politics news listing: starting at url1, read headlines, dates,
# sources and URLs from each page and follow the 'Next' link.

# Chrome options for headless mode.
# The flag is passed as a command-line argument because the
# `options.headless` setter was deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless=new')

# The driver path goes through a Service object; the `executable_path`
# keyword of webdriver.Chrome was deprecated and later removed in Selenium 4.
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

# Data storage
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally guarantees the browser is closed even when navigation or
# scraping raises outside the per-page handler below.
try:
    url1 = 'https://www.investing.com/news/politics/300'
    browser.get(url1)

    # Initialize wait (each condition is polled for at most 1 second)
    wait = WebDriverWait(browser, 1)

    # Pages 1-1730
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # Result unused: this wait only confirms the article links are
            # present (URLs are read from the headline elements instead).
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; zip stops at the shortest input
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Locate 'Next'; a timeout here (presumably the last page) ends the loop
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            # find_elements returns an empty list when the overlay is absent
            # (find_element would raise NoSuchElementException and abort the loop).
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript instead of a native click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# NOTE(review): each column is filtered independently, so rows can fall out of
# alignment when only some fields of an entry are empty.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') returns None when the element has no href
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings used to drop unwanted entries from the scraped lists.
# An entry is removed when any of these strings occurs inside it.
# Each substring is listed once; repeating one does not change the matching.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Drop every headline that contains at least one blacklisted fragment.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Display how many headlines are currently stored in data['Headlines'].
len(data['Headlines'])

In [None]:
# Drop every date string that contains at least one blacklisted fragment.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(fragment not in stamp for fragment in unwanted_dates_substrings)
]

In [None]:
# Display how many date strings are currently stored in data['Dates'].
len(data['Dates'])

In [None]:
# Drop every source string that contains at least one blacklisted fragment.
data['Sources'] = [
    origin
    for origin in data['Sources']
    if all(fragment not in origin for fragment in unwanted_sources_substrings)
]

In [None]:
# Display how many source strings are currently stored in data['Sources'].
len(data['Sources'])

In [None]:
# Drop every URL that contains at least one blacklisted fragment.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Discard links whose tail is the '#comments' anchor, then report what is left
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# Links beginning with this prefix are removed from the URL column
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep only the links that do not begin with the prefix, then report the count
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Collapse repeated URLs, keeping the first occurrence of each in its original position
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a DataFrame with one column per key of `data`.
# NOTE(review): pandas needs all four lists to have the same length here; each
# column is filtered independently in this notebook, so confirm the counts match.
df = pd.DataFrame(data)

In [None]:
# Write this round's frame to scraped_data4.xlsx (relative path), without the index column.
df.to_excel('scraped_data4.xlsx', index=False)

##### R5

In [None]:
# Round 5 scrape: collect headline/date/source/URL values from up to 100
# consecutive listing pages, starting at /news/politics/400.
# NOTE(review): `options.headless` and `executable_path=` are legacy Selenium
# APIs - confirm they are still accepted before upgrading Selenium.
options = Options()
options.headless = True

browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# Data storage: one list per output column, appended to in lockstep per article
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally so the browser session is closed even if the cell is interrupted
try:
    url1 = 'https://www.investing.com/news/politics/400'
    browser.get(url1)

    wait = WebDriverWait(browser, 1)

    # Maximum number of listing pages visited in this round
    # (the full listing is noted as pages 1-1730)
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below; the wait is kept only as a
            # "page has rendered" gate, so a timeout here still ends the loop.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the URL is the href of the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Move to the next page. find_elements returns an empty list when the
            # overlay is absent (find_element would raise NoSuchElementException,
            # which is not a TimeoutException and used to abort the whole scrape).
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# Each column is filtered on its own, so the four lists may end up with different lengths.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') yields None for elements without an href; drop those too
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Blacklists for the cleaning cells: a scraped value is dropped from its column
# when it contains any one of the substrings listed for that column.

# Headline fragments to drop (each entry listed once).
unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

# Date fragments to drop.
unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

# Source/byline fragments to drop.
unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

# URLs to drop.
unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Drop every headline that contains at least one blacklisted fragment.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Display how many headlines are currently stored in data['Headlines'].
len(data['Headlines'])

In [None]:
# Drop every date string that contains at least one blacklisted fragment.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(fragment not in stamp for fragment in unwanted_dates_substrings)
]

In [None]:
# Display how many date strings are currently stored in data['Dates'].
len(data['Dates'])

In [None]:
# Drop every source string that contains at least one blacklisted fragment.
data['Sources'] = [
    origin
    for origin in data['Sources']
    if all(fragment not in origin for fragment in unwanted_sources_substrings)
]

In [None]:
# Display how many source strings are currently stored in data['Sources'].
len(data['Sources'])

In [None]:
# Drop every URL that contains at least one blacklisted fragment.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Discard links whose tail is the '#comments' anchor, then report what is left
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# Links beginning with this prefix are removed from the URL column
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep only the links that do not begin with the prefix, then report the count
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Collapse repeated URLs, keeping the first occurrence of each in its original position
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a DataFrame with one column per key of `data`.
# NOTE(review): pandas needs all four lists to have the same length here; each
# column is filtered independently in this notebook, so confirm the counts match.
df = pd.DataFrame(data)

In [None]:
# Write this round's frame to scraped_data5.xlsx (relative path), without the index column.
df.to_excel('scraped_data5.xlsx', index=False)

##### R6

In [None]:
# Round 6 scrape: collect headline/date/source/URL values from up to 100
# consecutive listing pages, starting at /news/politics/500.
# NOTE(review): `options.headless` and `executable_path=` are legacy Selenium
# APIs - confirm they are still accepted before upgrading Selenium.
options = Options()
options.headless = True
# Specify path to chromedriver and set up the browser
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# Data storage: one list per output column, appended to in lockstep per article
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally so the browser session is closed even if the cell is interrupted
try:
    url1 = 'https://www.investing.com/news/politics/500'
    browser.get(url1)

    wait = WebDriverWait(browser, 1)

    # Maximum number of listing pages visited in this round
    # (the full listing is noted as pages 1-1730)
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below; the wait is kept only as a
            # "page has rendered" gate, so a timeout here still ends the loop.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the URL is the href of the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Move to the next page. find_elements returns an empty list when the
            # overlay is absent (find_element would raise NoSuchElementException,
            # which is not a TimeoutException and used to abort the whole scrape).
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# Each column is filtered on its own, so the four lists may end up with different lengths.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') yields None for elements without an href; drop those too
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Blacklists for the cleaning cells: a scraped value is dropped from its column
# when it contains any one of the substrings listed for that column.

# Headline fragments to drop (each entry listed once).
unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

# Date fragments to drop.
unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

# Source/byline fragments to drop.
unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

# URLs to drop.
unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Drop every headline that contains at least one blacklisted fragment.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Display how many headlines are currently stored in data['Headlines'].
len(data['Headlines'])

In [None]:
# Drop every date string that contains at least one blacklisted fragment.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(fragment not in stamp for fragment in unwanted_dates_substrings)
]

In [None]:
# Display how many date strings are currently stored in data['Dates'].
len(data['Dates'])

In [None]:
# Drop every source string that contains at least one blacklisted fragment.
data['Sources'] = [
    origin
    for origin in data['Sources']
    if all(fragment not in origin for fragment in unwanted_sources_substrings)
]

In [None]:
# Display how many source strings are currently stored in data['Sources'].
len(data['Sources'])

In [None]:
# Drop every URL that contains at least one blacklisted fragment.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Discard links whose tail is the '#comments' anchor, then report what is left
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# Links beginning with this prefix are removed from the URL column
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep only the links that do not begin with the prefix, then report the count
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Collapse repeated URLs, keeping the first occurrence of each in its original position
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a DataFrame with one column per key of `data`.
# NOTE(review): pandas needs all four lists to have the same length here; each
# column is filtered independently in this notebook, so confirm the counts match.
df = pd.DataFrame(data)

In [None]:
# Write this round's frame to scraped_data6.xlsx (relative path), without the index column.
df.to_excel('scraped_data6.xlsx', index=False)

##### R7

In [None]:
# Round 7 scrape: collect headline/date/source/URL values from up to 100
# consecutive listing pages, starting at /news/politics/600.
# NOTE(review): `options.headless` and `executable_path=` are legacy Selenium
# APIs - confirm they are still accepted before upgrading Selenium.
options = Options()
options.headless = True  # Set to False if you want to see the browser window

browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# Data storage: one list per output column, appended to in lockstep per article
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally so the browser session is closed even if the cell is interrupted
try:
    url1 = 'https://www.investing.com/news/politics/600'
    browser.get(url1)

    # Initialize wait
    wait = WebDriverWait(browser, 1)

    # Maximum number of listing pages visited in this round
    # (the full listing is noted as pages 1-1730)
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below; the wait is kept only as a
            # "page has rendered" gate, so a timeout here still ends the loop.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the URL is the href of the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Move to the next page. find_elements returns an empty list when the
            # overlay is absent (find_element would raise NoSuchElementException,
            # which is not a TimeoutException and used to abort the whole scrape).
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# Each column is filtered on its own, so the four lists may end up with different lengths.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') yields None for elements without an href; drop those too
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Blacklists for the cleaning cells: a scraped value is dropped from its column
# when it contains any one of the substrings listed for that column.

# Headline fragments to drop (each entry listed once).
unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

# Date fragments to drop.
unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

# Source/byline fragments to drop.
unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

# URLs to drop.
unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Drop every headline that contains at least one blacklisted fragment.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Display how many headlines are currently stored in data['Headlines'].
len(data['Headlines'])

In [None]:
# Drop every date string that contains at least one blacklisted fragment.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(fragment not in stamp for fragment in unwanted_dates_substrings)
]

In [None]:
# Display how many date strings are currently stored in data['Dates'].
len(data['Dates'])

In [None]:
# Drop every source string that contains at least one blacklisted fragment.
data['Sources'] = [
    origin
    for origin in data['Sources']
    if all(fragment not in origin for fragment in unwanted_sources_substrings)
]

In [None]:
# Display how many source strings are currently stored in data['Sources'].
len(data['Sources'])

In [None]:
# Drop every URL that contains at least one blacklisted fragment.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Discard links whose tail is the '#comments' anchor, then report what is left
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# Links beginning with this prefix are removed from the URL column
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep only the links that do not begin with the prefix, then report the count
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Collapse repeated URLs, keeping the first occurrence of each in its original position
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build a DataFrame with one column per key of `data`.
# NOTE(review): pandas needs all four lists to have the same length here; each
# column is filtered independently in this notebook, so confirm the counts match.
df = pd.DataFrame(data)

In [None]:
# Write this round's frame to scraped_data7.xlsx (relative path), without the index column.
df.to_excel('scraped_data7.xlsx', index=False)

##### R8

In [None]:
# Round 8 scrape: collect headline/date/source/URL values from up to 100
# consecutive listing pages, starting at /news/politics/700.
# NOTE(review): `options.headless` and `executable_path=` are legacy Selenium
# APIs - confirm they are still accepted before upgrading Selenium.
options = Options()
options.headless = True

browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# Data storage: one list per output column, appended to in lockstep per article
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally so the browser session is closed even if the cell is interrupted
try:
    url1 = 'https://www.investing.com/news/politics/700'
    browser.get(url1)

    # Initialize wait
    wait = WebDriverWait(browser, 1)

    # Maximum number of listing pages visited in this round
    # (the full listing is noted as pages 1-1730)
    no_of_pagedowns = 100

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below; the wait is kept only as a
            # "page has rendered" gate, so a timeout here still ends the loop.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the URL is the href of the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # Move to the next page. find_elements returns an empty list when the
            # overlay is absent (find_element would raise NoSuchElementException,
            # which is not a TimeoutException and used to abort the whole scrape).
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# Each column is filtered on its own, so the four lists may end up with different lengths.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') yields None for elements without an href; drop those too
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings identifying scraped values to discard: a value is removed by the
# filtering cells when it contains any entry of the matching list below.
# Entries are unique; repeating one has no effect on the `any(...)` checks.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
    # One prefix entry covers every jp.php link; it replaces the five
    # jp.php?v2=<token> URLs that used to be listed one by one.
    'https://www.investing.com/jp.php?',
]

In [None]:
# Retain only the headlines in which no unwanted substring occurs.
data['Headlines'] = list(filter(
    lambda text: all(fragment not in text for fragment in unwanted_headlines_substrings),
    data['Headlines'],
))

In [None]:
# Display the current number of entries in data['Headlines'] as the cell output
len(data['Headlines'])

In [None]:
# Retain only the dates in which no unwanted substring occurs.
data['Dates'] = list(filter(
    lambda stamp: all(fragment not in stamp for fragment in unwanted_dates_substrings),
    data['Dates'],
))

In [None]:
# Display the current number of entries in data['Dates'] as the cell output
len(data['Dates'])

In [None]:
# Retain only the sources in which no unwanted substring occurs.
data['Sources'] = list(filter(
    lambda origin: all(fragment not in origin for fragment in unwanted_sources_substrings),
    data['Sources'],
))

In [None]:
# Display the current number of entries in data['Sources'] as the cell output
len(data['Sources'])

In [None]:
# Retain only the URLs in which no unwanted substring occurs.
data['URLs'] = list(filter(
    lambda link: all(fragment not in link for fragment in unwanted_urls_substrings),
    data['URLs'],
))

In [None]:
# Drop every URL whose suffix is '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep the URLs that do not start with the unwanted prefix
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs; dict keys keep first-seen order, so the order is preserved
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back into data['URLs'] and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table with one column per key of `data`; pandas needs all four
# lists to have the same length here.
df = pd.DataFrame(data=data)

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data8.xlsx', index=False)

##### R9

In [None]:
# Chrome options for headless mode
options = Options()
options.headless = True

# NOTE(review): `executable_path=` and `options.headless` are no longer accepted by
# newer Selenium 4 releases — confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# First listing page of this round
url1 = 'https://www.investing.com/news/politics/800'

# Data storage: one list per column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (each condition is given at most 1 second)
wait = WebDriverWait(browser, 1)

# Scrape at most this many listing pages in this round, starting at url1
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below, but the wait is kept so that a
            # page without them still ends the loop through the handler below.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                # get_attribute returns None when the element has no href; store ''
                # so the whitespace filter below drops it instead of failing on None.
                data['URLs'].append(headline.get_attribute('href') or '')

            # Locate the 'Next' link, retrying once if it is not clickable in time
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            except TimeoutException:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list (instead of raising) when the
            # overlay is absent, so the plain click path is actually reachable.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Always release the browser, even when the run is interrupted
    browser.quit()

# Filter out empty strings or strings that only contain whitespace
# NOTE(review): each column is filtered on its own, so a blank value dropped from one
# list but not the others shifts the remaining rows — check the lengths afterwards.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
data['URLs'] = [url for url in data['URLs'] if url.strip()]

In [None]:
# Substrings identifying scraped values to discard: a value is removed by the
# filtering cells when it contains any entry of the matching list below.
# Entries are unique; repeating one has no effect on the `any(...)` checks.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
    # One prefix entry covers every jp.php link; it replaces the five
    # jp.php?v2=<token> URLs that used to be listed one by one.
    'https://www.investing.com/jp.php?',
]

In [None]:
# Retain only the headlines in which no unwanted substring occurs.
data['Headlines'] = list(filter(
    lambda text: all(fragment not in text for fragment in unwanted_headlines_substrings),
    data['Headlines'],
))

In [None]:
# Display the current number of entries in data['Headlines'] as the cell output
len(data['Headlines'])

In [None]:
# Retain only the dates in which no unwanted substring occurs.
data['Dates'] = list(filter(
    lambda stamp: all(fragment not in stamp for fragment in unwanted_dates_substrings),
    data['Dates'],
))

In [None]:
# Display the current number of entries in data['Dates'] as the cell output
len(data['Dates'])

In [None]:
# Retain only the sources in which no unwanted substring occurs.
data['Sources'] = list(filter(
    lambda origin: all(fragment not in origin for fragment in unwanted_sources_substrings),
    data['Sources'],
))

In [None]:
# Display the current number of entries in data['Sources'] as the cell output
len(data['Sources'])

In [None]:
# Retain only the URLs in which no unwanted substring occurs.
data['URLs'] = list(filter(
    lambda link: all(fragment not in link for fragment in unwanted_urls_substrings),
    data['URLs'],
))

In [None]:
# Drop every URL whose suffix is '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep the URLs that do not start with the unwanted prefix
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs; dict keys keep first-seen order, so the order is preserved
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back into data['URLs'] and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table with one column per key of `data`; pandas needs all four
# lists to have the same length here.
df = pd.DataFrame(data=data)

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data9.xlsx', index=False)

##### R10

In [None]:
# Chrome options for headless mode
options = Options()
options.headless = True

# NOTE(review): `executable_path=` and `options.headless` are no longer accepted by
# newer Selenium 4 releases — confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# First listing page of this round
url1 = 'https://www.investing.com/news/politics/900'

# Data storage: one list per column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (each condition is given at most 1 second)
wait = WebDriverWait(browser, 1)

# Scrape at most this many listing pages in this round, starting at url1
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below, but the wait is kept so that a
            # page without them still ends the loop through the handler below.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                # get_attribute returns None when the element has no href; store ''
                # so the whitespace filter below drops it instead of failing on None.
                data['URLs'].append(headline.get_attribute('href') or '')

            # Locate the 'Next' link, retrying once if it is not clickable in time
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            except TimeoutException:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list (instead of raising) when the
            # overlay is absent, so the plain click path is actually reachable.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Always release the browser, even when the run is interrupted
    browser.quit()

# Filter out empty strings or strings that only contain whitespace
# NOTE(review): each column is filtered on its own, so a blank value dropped from one
# list but not the others shifts the remaining rows — check the lengths afterwards.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
data['URLs'] = [url for url in data['URLs'] if url.strip()]

In [None]:
# Substrings identifying scraped values to discard: a value is removed by the
# filtering cells when it contains any entry of the matching list below.
# Entries are unique; repeating one has no effect on the `any(...)` checks.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
    # One prefix entry covers every jp.php link; it replaces the five
    # jp.php?v2=<token> URLs that used to be listed one by one.
    'https://www.investing.com/jp.php?',
]

In [None]:
# Retain only the headlines in which no unwanted substring occurs.
data['Headlines'] = list(filter(
    lambda text: all(fragment not in text for fragment in unwanted_headlines_substrings),
    data['Headlines'],
))

In [None]:
# Display the current number of entries in data['Headlines'] as the cell output
len(data['Headlines'])

In [None]:
# Retain only the dates in which no unwanted substring occurs.
data['Dates'] = list(filter(
    lambda stamp: all(fragment not in stamp for fragment in unwanted_dates_substrings),
    data['Dates'],
))

In [None]:
# Display the current number of entries in data['Dates'] as the cell output
len(data['Dates'])

In [None]:
# Retain only the sources in which no unwanted substring occurs.
data['Sources'] = list(filter(
    lambda origin: all(fragment not in origin for fragment in unwanted_sources_substrings),
    data['Sources'],
))

In [None]:
# Display the current number of entries in data['Sources'] as the cell output
len(data['Sources'])

In [None]:
# Retain only the URLs in which no unwanted substring occurs.
data['URLs'] = list(filter(
    lambda link: all(fragment not in link for fragment in unwanted_urls_substrings),
    data['URLs'],
))

In [None]:
# Drop every URL whose suffix is '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep the URLs that do not start with the unwanted prefix
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs; dict keys keep first-seen order, so the order is preserved
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back into data['URLs'] and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table with one column per key of `data`; pandas needs all four
# lists to have the same length here.
df = pd.DataFrame(data=data)

In [None]:
# Display the assembled DataFrame as the cell output
df

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data10.xlsx', index=False)

##### R11

In [None]:
# Chrome options for headless mode
options = Options()
options.headless = True

# NOTE(review): `executable_path=` and `options.headless` are no longer accepted by
# newer Selenium 4 releases — confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# First listing page of this round
url1 = 'https://www.investing.com/news/politics/1000'

# Data storage: one list per column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (each condition is given at most 1 second)
wait = WebDriverWait(browser, 1)

# Scrape at most this many listing pages in this round, starting at url1
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below, but the wait is kept so that a
            # page without them still ends the loop through the handler below.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                # get_attribute returns None when the element has no href; store ''
                # so the whitespace filter below drops it instead of failing on None.
                data['URLs'].append(headline.get_attribute('href') or '')

            # Locate the 'Next' link, retrying once if it is not clickable in time
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            except TimeoutException:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list (instead of raising) when the
            # overlay is absent, so the plain click path is actually reachable.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even when the run is interrupted
    browser.quit()

# Filter out empty strings or strings that only contain whitespace
# NOTE(review): each column is filtered on its own, so a blank value dropped from one
# list but not the others shifts the remaining rows — check the lengths afterwards.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
data['URLs'] = [url for url in data['URLs'] if url.strip()]

In [None]:
# Substrings identifying scraped values to discard: a value is removed by the
# filtering cells when it contains any entry of the matching list below.
# Entries are unique; repeating one has no effect on the `any(...)` checks.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
    # One prefix entry covers every jp.php link; it replaces the five
    # jp.php?v2=<token> URLs that used to be listed one by one.
    'https://www.investing.com/jp.php?',
]

In [None]:
# Retain only the headlines in which no unwanted substring occurs.
data['Headlines'] = list(filter(
    lambda text: all(fragment not in text for fragment in unwanted_headlines_substrings),
    data['Headlines'],
))

In [None]:
# Display the current number of entries in data['Headlines'] as the cell output
len(data['Headlines'])

In [None]:
# Retain only the dates in which no unwanted substring occurs.
data['Dates'] = list(filter(
    lambda stamp: all(fragment not in stamp for fragment in unwanted_dates_substrings),
    data['Dates'],
))

In [None]:
# Display the current number of entries in data['Dates'] as the cell output
len(data['Dates'])

In [None]:
# Retain only the sources in which no unwanted substring occurs.
data['Sources'] = list(filter(
    lambda origin: all(fragment not in origin for fragment in unwanted_sources_substrings),
    data['Sources'],
))

In [None]:
# Display the current number of entries in data['Sources'] as the cell output
len(data['Sources'])

In [None]:
# Retain only the URLs in which no unwanted substring occurs.
data['URLs'] = list(filter(
    lambda link: all(fragment not in link for fragment in unwanted_urls_substrings),
    data['URLs'],
))

In [None]:
# Drop every URL whose suffix is '#comments', then report how many remain
data['URLs'] = list(filter(lambda link: not link.endswith('#comments'), data['URLs']))
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep the URLs that do not start with the unwanted prefix
data['URLs'] = list(filter(lambda link: not link.startswith(unwanted_prefix), data['URLs']))

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs; dict keys keep first-seen order, so the order is preserved
unique_urls = list(dict.fromkeys(data['URLs']))
seen = set(unique_urls)

# Store the de-duplicated list back into data['URLs'] and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table with one column per key of `data`; pandas needs all four
# lists to have the same length here.
df = pd.DataFrame(data=data)

In [None]:
# Display the assembled DataFrame as the cell output
df

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data11.xlsx', index=False)

##### R12

In [None]:
# Chrome options for headless mode
options = Options()
options.headless = True

# NOTE(review): `executable_path=` and `options.headless` are no longer accepted by
# newer Selenium 4 releases — confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

# First listing page of this round
url1 = 'https://www.investing.com/news/politics/1100'

# Data storage: one list per column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (each condition is given at most 1 second)
wait = WebDriverWait(browser, 1)

# Scrape at most this many listing pages in this round, starting at url1
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # The article links are not read below, but the wait is kept so that a
            # page without them still ends the loop through the handler below.
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                # get_attribute returns None when the element has no href; store ''
                # so the whitespace filter below drops it instead of failing on None.
                data['URLs'].append(headline.get_attribute('href') or '')

            # Locate the 'Next' link, retrying once if it is not clickable in time
            try:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            except TimeoutException:
                next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list (instead of raising) when the
            # overlay is absent, so the plain click path is actually reachable.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even when the run is interrupted
    browser.quit()

# Filter out empty strings or strings that only contain whitespace
# NOTE(review): each column is filtered on its own, so a blank value dropped from one
# list but not the others shifts the remaining rows — check the lengths afterwards.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
data['URLs'] = [url for url in data['URLs'] if url.strip()]

In [None]:
# Substrings identifying scraped values to discard: a value is dropped when it
# contains any entry of the matching list below.
# Entries are unique; repeating one has no effect on a substring check.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
     'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
     'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
     'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624'
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(banned not in text for banned in unwanted_headlines_substrings)
]

In [None]:
# Sanity check: number of headlines left after filtering.
len(data['Headlines'])

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(banned not in stamp for banned in unwanted_dates_substrings)
]

In [None]:
# Sanity check: number of dates left after filtering.
len(data['Dates'])

In [None]:
# Keep a source line only if none of the unwanted substrings occurs in it.
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(banned not in byline for banned in unwanted_sources_substrings)
]

In [None]:
# Sanity check: number of source lines left after filtering.
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(banned not in link for banned in unwanted_urls_substrings)
]

In [None]:
# Discard every link whose address ends in '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are excluded from the results
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that carry the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs. dict keys are unique and keep insertion order, so
# the first occurrence of every URL survives in its original position.
unique_urls = list(dict.fromkeys(data['URLs']))

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Combine the four cleaned columns into one table. pandas raises a ValueError
# here if the lists differ in length, so the counts shown above must agree.
# NOTE(review): each column was filtered independently, so equal lengths do
# not by themselves guarantee that row i refers to the same article in every
# column - spot-check the output.
df = pd.DataFrame(data)

In [None]:
# Save this round's table to an Excel workbook; index=False leaves out the
# DataFrame's row index.
df.to_excel('scraped_data12.xlsx', index=False)

##### R13

In [None]:
# Round 13: scrape the politics news listing, starting at page 1200 and
# following the 'Next' link for up to 100 pages.

# Chrome options for headless mode. The '--headless' argument and the Service
# object replace `options.headless = True` and the `executable_path=` keyword,
# both of which were deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless')
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

url1 = 'https://www.investing.com/news/politics/1200'

# Data storage: one list per output column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (every explicit wait below gives up after 1 second)
wait = WebDriverWait(browser, 1)

# Pages 1-1730 (range noted by the original author); this round walks up to 100 pages
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises NoSuchElementException (not TimeoutException),
            # which used to end the loop whenever the overlay was absent.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even if the run is interrupted
    browser.quit()

# Filter out missing values (get_attribute returns None when there is no href),
# empty strings and strings that only contain whitespace
data = {
    column: [value for value in values if value and value.strip()]
    for column, values in data.items()
}

In [None]:
# Substrings to remove from the scraped lists (round 13).
# The filter cells that follow drop a scraped value when it contains any
# entry of the matching list. Entries are kept unique, one per line.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 115',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stocks-attracted-20-billion-in-inflows-in-a-week-to-january-31--bofa-3291388',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(banned not in text for banned in unwanted_headlines_substrings)
]

In [None]:
# Sanity check: number of headlines left after filtering.
len(data['Headlines'])

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(banned not in stamp for banned in unwanted_dates_substrings)
]

In [None]:
# Sanity check: number of dates left after filtering.
len(data['Dates'])

In [None]:
# Keep a source line only if none of the unwanted substrings occurs in it.
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(banned not in byline for banned in unwanted_sources_substrings)
]

In [None]:
# Sanity check: number of source lines left after filtering.
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(banned not in link for banned in unwanted_urls_substrings)
]

In [None]:
# Discard every link whose address ends in '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are excluded from the results
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that carry the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs. dict keys are unique and keep insertion order, so
# the first occurrence of every URL survives in its original position.
unique_urls = list(dict.fromkeys(data['URLs']))

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Combine the four cleaned columns into one table. pandas raises a ValueError
# here if the lists differ in length, so the counts shown above must agree.
# NOTE(review): each column was filtered independently, so equal lengths do
# not by themselves guarantee that row i refers to the same article in every
# column - spot-check the output.
df = pd.DataFrame(data)

In [None]:
# Save this round's table to an Excel workbook; index=False leaves out the
# DataFrame's row index.
df.to_excel('scraped_data13.xlsx', index=False)

##### R14

In [None]:
# Round 14: scrape the politics news listing, starting at page 1300 and
# following the 'Next' link for up to 100 pages.

# Chrome options for headless mode. The '--headless' argument and the Service
# object replace `options.headless = True` and the `executable_path=` keyword,
# both of which were deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless')
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

url1 = 'https://www.investing.com/news/politics/1300'

# Data storage: one list per output column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (every explicit wait below gives up after 1 second)
wait = WebDriverWait(browser, 1)

# Pages 1-1730 (range noted by the original author); this round walks up to 100 pages
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises NoSuchElementException (not TimeoutException),
            # which used to end the loop whenever the overlay was absent.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even if the run is interrupted
    browser.quit()

# Filter out missing values (get_attribute returns None when there is no href),
# empty strings and strings that only contain whitespace
data = {
    column: [value for value in values if value and value.strip()]
    for column, values in data.items()
}

In [None]:
# Substrings to remove from the scraped lists (round 14).
# The filter cells that follow drop a scraped value when it contains any
# entry of the matching list. Entries are kept unique, one per line.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 115',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stocks-attracted-20-billion-in-inflows-in-a-week-to-january-31--bofa-3291388',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(banned not in text for banned in unwanted_headlines_substrings)
]

In [None]:
# Sanity check: number of headlines left after filtering.
len(data['Headlines'])

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(banned not in stamp for banned in unwanted_dates_substrings)
]

In [None]:
# Sanity check: number of dates left after filtering.
len(data['Dates'])

In [None]:
# Keep a source line only if none of the unwanted substrings occurs in it.
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(banned not in byline for banned in unwanted_sources_substrings)
]

In [None]:
# Sanity check: number of source lines left after filtering.
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(banned not in link for banned in unwanted_urls_substrings)
]

In [None]:
# Discard every link whose address ends in '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are excluded from the results
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that carry the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs. dict keys are unique and keep insertion order, so
# the first occurrence of every URL survives in its original position.
unique_urls = list(dict.fromkeys(data['URLs']))

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Combine the four cleaned columns into one table. pandas raises a ValueError
# here if the lists differ in length, so the counts shown above must agree.
# NOTE(review): each column was filtered independently, so equal lengths do
# not by themselves guarantee that row i refers to the same article in every
# column - spot-check the output.
df = pd.DataFrame(data)

In [None]:
# Save this round's table to an Excel workbook; index=False leaves out the
# DataFrame's row index.
df.to_excel('scraped_data14.xlsx', index=False)

##### R15

In [None]:
# Round 15: scrape the politics news listing, starting at page 1400 and
# following the 'Next' link for up to 100 pages.

# Chrome options for headless mode. The '--headless' argument and the Service
# object replace `options.headless = True` and the `executable_path=` keyword,
# both of which were deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless')
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

url1 = 'https://www.investing.com/news/politics/1400'

# Data storage: one list per output column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (every explicit wait below gives up after 1 second)
wait = WebDriverWait(browser, 1)

# Pages 1-1730 (range noted by the original author); this round walks up to 100 pages
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises NoSuchElementException (not TimeoutException),
            # which used to end the loop whenever the overlay was absent.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even if the run is interrupted
    browser.quit()

# Filter out missing values (get_attribute returns None when there is no href),
# empty strings and strings that only contain whitespace
data = {
    column: [value for value in values if value and value.strip()]
    for column, values in data.items()
}

In [None]:
# Substrings to remove from the scraped lists (round 15).
# The filter cells that follow drop a scraped value when it contains any
# entry of the matching list. Entries are kept unique, one per line.

unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    'Oil posts weekly losses as US data dents hopes for near-term rate cuts',
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    'U.S. economy adds 353,000 jobs in January',
    'US stocks rally on blowout jobs report, Meta-led gains in big tech',
    'Buy any undue correction in quality AI leaders says UBS',
    'Oil falls as US jobs data dents hope for near-term rate cuts',
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    'US starts retaliatory strikes in Iraq and Syria against Iran-linked...',
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
    ' - Feb 03, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Reuters - Feb 03, 2024 4',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 115',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stocks-attracted-20-billion-in-inflows-in-a-week-to-january-31--bofa-3291388',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/news/world-news/us-starts-retaliatory-strikes-in-iraq-syria-officials-3291791',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep a headline only if none of the unwanted substrings occurs in it.
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(banned not in text for banned in unwanted_headlines_substrings)
]

In [None]:
# Sanity check: number of headlines left after filtering.
len(data['Headlines'])

In [None]:
# Keep a date string only if none of the unwanted substrings occurs in it.
data['Dates'] = [
    stamp
    for stamp in data['Dates']
    if all(banned not in stamp for banned in unwanted_dates_substrings)
]

In [None]:
# Sanity check: number of dates left after filtering.
len(data['Dates'])

In [None]:
# Keep a source line only if none of the unwanted substrings occurs in it.
data['Sources'] = [
    byline
    for byline in data['Sources']
    if all(banned not in byline for banned in unwanted_sources_substrings)
]

In [None]:
# Sanity check: number of source lines left after filtering.
len(data['Sources'])

In [None]:
# Keep a URL only if none of the unwanted substrings occurs in it.
data['URLs'] = [
    link
    for link in data['URLs']
    if all(banned not in link for banned in unwanted_urls_substrings)
]

In [None]:
# Discard every link whose address ends in '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are excluded from the results
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Rebuild the URL list without the entries that carry the prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# De-duplicate the URLs. dict keys are unique and keep insertion order, so
# the first occurrence of every URL survives in its original position.
unique_urls = list(dict.fromkeys(data['URLs']))

# Store the de-duplicated list back and report its size
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Combine the four cleaned columns into one table. pandas raises a ValueError
# here if the lists differ in length, so the counts shown above must agree.
# NOTE(review): each column was filtered independently, so equal lengths do
# not by themselves guarantee that row i refers to the same article in every
# column - spot-check the output.
df = pd.DataFrame(data)

In [None]:
# Save this round's table to an Excel workbook; index=False leaves out the
# DataFrame's row index.
df.to_excel('scraped_data15.xlsx', index=False)

##### R16

In [None]:
# Round 16: scrape the politics news listing, starting at page 1500 and
# following the 'Next' link for up to 100 pages.

# Chrome options for headless mode. The '--headless' argument and the Service
# object replace `options.headless = True` and the `executable_path=` keyword,
# both of which were deprecated and later removed in Selenium 4.
options = Options()
options.add_argument('--headless')
service = Service(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe')
browser = webdriver.Chrome(service=service, options=options)

url1 = 'https://www.investing.com/news/politics/1500'

# Data storage: one list per output column, filled page by page
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# Initialize wait (every explicit wait below gives up after 1 second)
wait = WebDriverWait(browser, 1)

# Pages 1-1730 (range noted by the original author); this round walks up to 100 pages
no_of_pagedowns = 100

try:
    browser.get(url1)

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))

            # Store the data in lists; the link is read from the headline element itself
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises NoSuchElementException (not TimeoutException),
            # which used to end the loop whenever the overlay was absent.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session, even if the run is interrupted
    browser.quit()

# Filter out missing values (get_attribute returns None when there is no href),
# empty strings and strings that only contain whitespace
data = {
    column: [value for value in values if value and value.strip()]
    for column, values in data.items()
}

In [None]:
# Substrings identifying entries to drop from the scraped lists. An entry is
# removed when it contains any of the strings below (substring match).
# NOTE(review): presumably sidebar/promoted items that repeat on every listing
# page - verify against the site.

# Exact duplicates were removed; they had no effect on the substring match.
unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    "Oil posts weekly losses as US data dents hopes for near-term rate cuts",
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    "U.S. economy adds 353,000 jobs in January",
    "US stocks rally on blowout jobs report, Meta-led gains in big tech",
    "Buy any undue correction in quality AI leaders says UBS",
    "Oil falls as US jobs data dents hope for near-term rate cuts",
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "US starts retaliatory strikes in Iraq and Syria against Iran-linked...",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
    ' - Feb 03, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Reuters - Feb 03, 2024 4',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 115',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stocks-attracted-20-billion-in-inflows-in-a-week-to-january-31--bofa-3291388',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/news/world-news/us-starts-retaliatory-strikes-in-iraq-syria-officials-3291791',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep only the headlines that contain none of the unwanted substrings
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Show how many headlines are currently stored
len(data['Headlines'])

In [None]:
# Keep only the dates that contain none of the unwanted substrings
data['Dates'] = [
    text
    for text in data['Dates']
    if all(fragment not in text for fragment in unwanted_dates_substrings)
]

In [None]:
# Show how many dates are currently stored
len(data['Dates'])

In [None]:
# Keep only the sources that contain none of the unwanted substrings
data['Sources'] = [
    text
    for text in data['Sources']
    if all(fragment not in text for fragment in unwanted_sources_substrings)
]

In [None]:
# Show how many sources are currently stored
len(data['Sources'])

In [None]:
# Keep only the URLs that contain none of the unwanted substrings
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Drop every URL whose text ends with '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep every URL that does not begin with the unwanted prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Remove duplicates while preserving order.
# Dict keys are unique and keep insertion order, so this retains the first
# occurrence of every URL without a hand-written seen-set loop.
unique_urls = list(dict.fromkeys(data['URLs']))

# Update the data['URLs'] with the list of unique URLs
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table from the cleaned lists. Each list is cleaned on its own, so
# check that they still have matching lengths and report the per-column sizes;
# pandas would otherwise fail with a generic length error.
column_lengths = {name: len(values) for name, values in data.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Columns have different lengths: {column_lengths}")
df = pd.DataFrame(data)

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data16.xlsx', index=False)

##### R17

In [None]:
# Round 17: collect headlines, dates, sources and links from the politics
# listing pages, starting at page 1600 and following the 'Next' link.

# Chrome options for headless mode. The flag is passed as an argument because
# the `headless` attribute is deprecated in newer Selenium 4 releases.
options = Options()
options.add_argument('--headless')
# NOTE(review): `executable_path` is rejected by recent Selenium releases, which
# expect `service=Service(...)` instead - confirm the installed version.
browser = webdriver.Chrome(executable_path=r'C:\Users\hab021\Documents\Driver\chromedriver121.exe', options=options)

url1 = 'https://www.investing.com/news/politics/1600'

# Data storage
data = {
    'Headlines': [],
    'Dates': [],
    'Sources': [],
    'URLs': []
}

# try/finally guarantees the browser is closed even if the run is interrupted
try:
    browser.get(url1)

    # Initialize wait (1 second timeout per condition)
    wait = WebDriverWait(browser, 1)

    # Number of listing pages to walk through in this round
    # (the original note gives pages 1-1730 for the whole section)
    no_of_pagedowns = 130

    while no_of_pagedowns:
        try:
            # Wait for elements to load and then find them
            news_headlines = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "title")))
            news_dates = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='date']")))
            news_sources = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@class='articleDetails']")))
            # Only a readiness check: the links are read from the headline elements below
            wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@id='leftColumn']//article//div/a")))

            # Store the data in lists; zip stops at the shortest of the three
            for headline, date, source in zip(news_headlines, news_dates, news_sources):
                data['Headlines'].append(headline.text)
                data['Dates'].append(date.text)
                data['Sources'].append(source.text)
                data['URLs'].append(headline.get_attribute('href'))

            # NOTE(review): nothing waits for the next page to replace the current
            # one after the click, so a page may be read twice - confirm.
            next_button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))

            # find_elements returns an empty list when the overlay is absent;
            # find_element would raise NoSuchElementException, which is not a
            # TimeoutException and would abort the whole loop.
            if browser.find_elements(By.ID, "transparentInner"):
                # Overlay present: click through JavaScript so it cannot intercept the click
                browser.execute_script("arguments[0].click();", next_button)
            else:
                next_button.click()

        except Exception as e:
            print(f"An error occurred: {e}")
            break  # Break the loop in case of an error

        no_of_pagedowns -= 1
finally:
    # Quit the browser session
    browser.quit()

# Filter out empty strings or strings that only contain whitespace.
# NOTE(review): each list is filtered independently, so entries at the same
# index may no longer belong to the same article afterwards.
data['Headlines'] = [headline for headline in data['Headlines'] if headline.strip()]
data['Dates'] = [date for date in data['Dates'] if date.strip()]
data['Sources'] = [source for source in data['Sources'] if source.strip()]
# get_attribute('href') yields None for elements without an href, so guard before strip()
data['URLs'] = [url for url in data['URLs'] if url and url.strip()]

In [None]:
# Substrings identifying entries to drop from the scraped lists. An entry is
# removed when it contains any of the strings below (substring match).
# NOTE(review): presumably sidebar/promoted items that repeat on every listing
# page - verify against the site.

# Exact duplicates were removed; they had no effect on the substring match.
unwanted_headlines_substrings = [
    "2024 Financial Market Outlook by Octa",
    "Stock Market Today: S&P 500 clinches record high as Meta's mega rally...",
    "Oil posts weekly losses as US data dents hopes for near-term rate cuts",
    "Payrolls, mega-cap tech earnings, Nvidia - what's moving markets",
    "Stocks attracted $20 billion in inflows in a week to January 31 - BofA",
    "Apple shares slip after iPhone sales miss estimates amid China...",
    "U.S. economy adds 353,000 jobs in January",
    "US stocks rally on blowout jobs report, Meta-led gains in big tech",
    "Buy any undue correction in quality AI leaders says UBS",
    "Oil falls as US jobs data dents hope for near-term rate cuts",
    "Amazon top Q4 estimates as e-commerce shines, sending shares higher",
    "'Patek Philippe of Internet': Analysts heap praise on Meta' Platforms...",
    "Here Are the Winners of The UF AWARDS MEA 2024",
    "US starts retaliatory strikes in Iraq and Syria against Iran-linked...",
    "FXStarterKit by Forexware: A Solution Built for Global Expansion",
    "UF Agency: Pioneering Strategic Fintech Marketing for Optimal Reach",
    "Oil set for weekly loss despite gain on OPEC+ output decision",
]

unwanted_dates_substrings = [
    ' - Feb 02, 2024',
    ' - Feb 03, 2024',
]

unwanted_sources_substrings = [
    'By Investing.com Studios',
    'By Investing.com - Feb 02, 2024 103',
    'By Investing.com - Feb 02, 2024 3',
    'By Reuters - Feb 03, 2024 4',
    'By Investing.com - Feb 02, 2024 16',
    'By Investing.com - Feb 02, 2024 7',
    'By Investing.com - Feb 02, 2024 4',
    'By Investing.com - Feb 02, 2024 107',
    'By Investing.com - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 24',
    'By Investing.com - Feb 02, 2024 114',
    'By Investing.com - Feb 02, 2024 9',
    'By Reuters - Feb 02, 2024 5',
    'By Investing.com - Feb 02, 2024 115',
    'By Investing.com - Feb 02, 2024 109',
    'By Investing.com - Feb 02, 2024 6',
    'By Reuters - Feb 02, 2024 4',
]

unwanted_urls_substrings = [
    'https://www.investing.com/news/economic-indicators/us-economy-adds-353000-jobs-in-january-3291366',
    'https://www.investing.com/news/stock-market-news/stocks-attracted-20-billion-in-inflows-in-a-week-to-january-31--bofa-3291388',
    'https://www.investing.com/news/stock-market-news/stock-market-today-sp-500-clinches-record-high-as-metas-mega-rally-fuels-bulls-3290624',
    'https://www.investing.com/jp.php?v2=N3czbTViN2xmMG9lM2kxMTdkZj0zMTUxMyRmNDQ-ZSxnIWFoZDxhJ2ZuYH4ybjNpZBdjPGFpNSMyZGc1ZyZnJDdwM201ZDdsZjNvajN2MXA3a2Y_Mzw1PjMkZiU0Pg==',
    'https://www.investing.com/jp.php?v2=YyM_YW45Yjk3YWxmYTs1Nz5qNG8zNTQ2MyRgMjsxYiszdT43YzsxdzQ8PCJgPDJoYhE_YDU9MiQ2YG48NXRhImMkP2FuP2I5N2JsaWEkNXQ-YjRtMzw0PzMkYCM7MQ==',
    'https://www.investing.com/news/stock-market-news/buy-any-undue-correction-in-quality-ai-leaders-says-ubs-3291589',
    'https://www.investing.com/news/stock-market-news/apple-q1-results-top-estimates-but-iphone-sales-fall-just-shy-amid-china-weakness-3290542',
    'https://www.investing.com/news/commodities-news/oil-prices-gain-after-opec-maintains-output-cuts-3290687',
    'https://www.investing.com/news/world-news/us-starts-retaliatory-strikes-in-iraq-syria-officials-3291791',
    'https://www.investing.com/analysis/2-reasons-why-february-may-be-a-difficult-month-on-wall-street-200645728',
    'https://www.investing.com/news/economy/payrolls-megacap-tech-earnings-nvidia--whats-moving-markets-3290897',
    'https://www.investing.com/jp.php?v2=NnZmODViYzhhNzowMmgxMTdkZT40OzQ3ZXJvPWRubiczdTU8MWllIzY-PiAwbGU_YBNlOj83NSM0YjdlMXBnJDZxZjg1ZGM4YTQ6PzJ3MXA3a2U8NDs0P2Vybyxkbg==',
    'https://www.investing.com/jp.php?v2=OXlmOGYxM2gxZz40Zz1hYTVmYzhmZjcyMyRmNDI4NXxjJWFoZz80cmRsPCI1aTVvM0BiPTU9Z3FlMzdlZyZgIzl-ZjhmNzNoMWQ-O2ciYSA1aWM6Zmk3PDMkZiUyOA==',
    'https://www.investing.com/jp.php?v2=ZSU3aWYxNW5kMmxmYDoyMDJmNG9jZDQ0YXZkNjE7byYwdmVsYTkzdWVtanRkODFrYhFhPmRsYHY1Y248ZyZlJmUiN2lmNzVuZDFsaWAlMnMybjRtY2w0P2F2ZCcxOw==',
    'https://www.investing.com/news/stock-market-news/dow-futures-tick-higher-as-tech-titans-report-nonfarm-payrolls-loom-3290624',
]

In [None]:
# Keep only the headlines that contain none of the unwanted substrings
data['Headlines'] = [
    text
    for text in data['Headlines']
    if all(fragment not in text for fragment in unwanted_headlines_substrings)
]

In [None]:
# Show how many headlines are currently stored
len(data['Headlines'])

In [None]:
# Keep only the dates that contain none of the unwanted substrings
data['Dates'] = [
    text
    for text in data['Dates']
    if all(fragment not in text for fragment in unwanted_dates_substrings)
]

In [None]:
# Show how many dates are currently stored
len(data['Dates'])

In [None]:
# Keep only the sources that contain none of the unwanted substrings
data['Sources'] = [
    text
    for text in data['Sources']
    if all(fragment not in text for fragment in unwanted_sources_substrings)
]

In [None]:
# Show how many sources are currently stored
len(data['Sources'])

In [None]:
# Keep only the URLs that contain none of the unwanted substrings
data['URLs'] = [
    link
    for link in data['URLs']
    if all(fragment not in link for fragment in unwanted_urls_substrings)
]

In [None]:
# Drop every URL whose text ends with '#comments'
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.endswith('#comments')
]
print(f"Cleaned URLs count: {len(data['URLs'])}")

In [None]:
# URLs beginning with this prefix are discarded
unwanted_prefix = 'https://www.investing.com/jp.php?'

# Keep every URL that does not begin with the unwanted prefix
data['URLs'] = [
    link
    for link in data['URLs']
    if not link.startswith(unwanted_prefix)
]

print(f"Filtered URLs count: {len(data['URLs'])}")

In [None]:
# Remove duplicates while preserving order.
# Dict keys are unique and keep insertion order, so this retains the first
# occurrence of every URL without a hand-written seen-set loop.
unique_urls = list(dict.fromkeys(data['URLs']))

# Update the data['URLs'] with the list of unique URLs
data['URLs'] = unique_urls
print(f"Unique URLs count: {len(data['URLs'])}")

In [None]:
# Build the table from the cleaned lists. Each list is cleaned on its own, so
# check that they still have matching lengths and report the per-column sizes;
# pandas would otherwise fail with a generic length error.
column_lengths = {name: len(values) for name, values in data.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Columns have different lengths: {column_lengths}")
df = pd.DataFrame(data)

In [None]:
# Write this round's table to an Excel workbook without the index column
df.to_excel(excel_writer='scraped_data17.xlsx', index=False)