In [46]:
# Initiation and authorization

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
options = webdriver.ChromeOptions()

# Create a new Chrome session
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

try:
    # Step 1: Log in to the university portal
    university_login_url = 'https://uaccess.univie.ac.at'
    driver.get(university_login_url)
    time.sleep(1)  # Wait for the page to load

    # Enter university login credentials
    driver.find_element(By.ID, 'userid').send_keys('')                                                                     
    driver.find_element(By.ID, 'password').send_keys('')                                                                 
    driver.find_element(By.NAME, '_eventId_proceed').click()                                                                        
    time.sleep(1)  # Wait for the page to load

    # Step 2: Log in to CapitalIQ
    capitaliq_login_url = 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/web/client?auth=inherit#dashboard/sfi'
    driver.get(capitaliq_login_url)
    time.sleep(7)  # Wait for the page to load

    # Enter email address and navigate to the next step
    driver.find_element(By.ID, 'input28').send_keys('')                                                
    driver.find_element(By.XPATH, "//input[@type='submit' and @value='Next']").click()                                             
    time.sleep(2)  # Wait for the page to load

    # Enter password and navigate to the next step
    driver.find_element(By.ID, 'input59').send_keys('')                                                                 
    driver.find_element(By.XPATH, "//input[@type='submit' and @value='Sign In']").click()                                      
    time.sleep(2)  # Wait for the page to load
        
    # Step 3: Navigate to the desired page
    final_url = 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/web/client?auth=inherit#dashboard/sfi'
    driver.get(final_url)
    time.sleep(2)  # Wait for the page to load

finally:
    1


In [41]:
# Handle cookie consent if it appears

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def accept_cookies(waittime = 5):
    try:
        accept_cookies = WebDriverWait(driver, waittime).until(
             EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler")))
        time.sleep(2)
        accept_cookies.click()
    except:
        print("Cookie consent button not found or already accepted")

In [47]:
# Load company datasets

import pandas as pd

# Read the Excel files into DataFrames
companies_50_perc_df = pd.read_excel("companies_50_perc.xlsx", engine='openpyxl')
companies_df = pd.read_excel("companies.xlsx", engine='openpyxl')

# Perform an antijoin to exclude rows in companies_50_perc_df that are present in companies_df based on "Ticker"
# Merge with an indicator and keep only columns from companies_50_perc_df
antijoin_df = companies_50_perc_df.merge(companies_df[['Ticker']], on='Ticker', how='left', indicator=True)
antijoin_df = antijoin_df[antijoin_df['_merge'] == 'left_only'].drop(columns=['_merge'])

# Reset the index to make it increment by 1 from 0
antijoin_df = antijoin_df.reset_index(drop=True)

# Display the result
antijoin_df


Unnamed: 0,Entity Name,Entity ID,Ticker,Country / Region Name,Market Capitalization\n($M),Market Capitalization\n($M).1,Market Capitalization\n($M).2
0,Altech Batteries Limited (ASX:ATC),4348333,ATC,Australia,82.614729,63.248038,49.99
1,Wallbridge Mining Company Limited (TSX:WM),4355273,WM,Canada,76.951874,63.219950,49.98
2,"Lee Enterprises, Incorporated (NASDAQGS:LEE)",4121672,LEE,USA,64.934730,63.028401,49.94
3,Southern Silver Exploration Corp. (TSXV:SSV),4354471,SSV,Canada,45.147384,62.988739,49.93
4,New World Resources Limited (ASX:NWC),4353632,NWC,Australia,50.447730,62.907763,49.90
...,...,...,...,...,...,...,...
1465,Kula Gold Limited (ASX:KGD),4361895,KGD,Australia,4.287994,4.756525,19.71
1466,EV Resources Limited (ASX:EVR),4355218,EVR,Australia,8.098365,4.441249,19.15
1467,Helix Resources Limited (ASX:HLX),4350997,HLX,Australia,7.730935,4.388832,19.04
1468,Castillo Copper Limited (ASX:CCZ),4363973,CCZ,Australia,6.054268,4.368085,19.00


In [49]:
# The function to iterage over the companies
# (gets us links to all news articles for every company)

# All libraries above (I'm NOT figuring out, which are irrelevant here)
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import math
import pandas as pd
import json



def get_news_links(companies): 
    
    companies = companies.astype("str")
    
    companies_with_news = []
    comps_and_links = {}

    k = 0
    
    for company in companies:

        k = k + 1
        print("\n", "[", str(k), "/", len(companies), "]")
        
        url = "https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/web/client#company/briefingbooknews?ID=" + company
        links_list = []
        
        try:
            # Open the initial URL where the pagination is located
            driver.get(url)
            accept_cookies(4) # already includes a waiting period
            #time.sleep(6)  # Wait for the page to load
            
            # Locate the element containing the total number of records
            records_label = driver.find_element(By.CSS_SELECTOR, ".ui-iggrid-pagerrecordslabel.ui-iggrid-results")
            total_records_text = records_label.text
            print(total_records_text)
            total_records = int(total_records_text.split('of ')[1].split(' ')[0].replace(',', ''))
            
            # Calculate the total number of pages (20 records per page)
            records_per_page = 20
            total_pages = math.ceil(total_records / records_per_page)
            
            print(f"Total pages: {total_pages}")
            
            for page in range(1, total_pages + 1):
                print(f"Processing page {page}")
                
                # Locate the section containing the links
                section = driver.find_element(By.CSS_SELECTOR, "tbody.ui-widget-content.ui-iggrid-tablebody")
            
                # Find all <a> tags within the section
                links = section.find_elements(By.TAG_NAME, 'a')
            
                # Create a list to store the href attributes of the links
                links_list_this_page = [link.get_attribute('href') for link in links if link.get_attribute('href') is not None]
                links_list = links_list + links_list_this_page
        
                # Append comps_and_links
                if links_list:
                    companies_with_news.append(company)
                    
        
                # If not the last page, click the "Next" button to go to the next page
                if page < total_pages:
                    next_button = driver.find_element(By.XPATH, "//div[contains(@id, 'section_1_control_') and contains(@id, '_grid_table_next_page')]")
                    next_button.click()
                    time.sleep(4)  # Wait for the next page to load
           
        
            comps_and_links[company] = links_list
        
            # Print the list of links
            print(company, "- News link clicked - ", len(links_list))
    
        except:
            print(company, "- News link not found")




    return companies_with_news, comps_and_links

In [50]:
# Check how it works:

comps, links_dict = get_news_links(antijoin_df["Entity ID"])


 [ 1 / 1470 ]
Cookie consent button not found or already accepted
4348333 - News link not found

 [ 2 / 1470 ]
Cookie consent button not found or already accepted
4355273 - News link not found

 [ 3 / 1470 ]
Cookie consent button not found or already accepted
4121672 - News link not found

 [ 4 / 1470 ]
Cookie consent button not found or already accepted
4354471 - News link not found

 [ 5 / 1470 ]
Cookie consent button not found or already accepted
4353632 - News link not found

 [ 6 / 1470 ]
Cookie consent button not found or already accepted
4772846 - News link not found

 [ 7 / 1470 ]
Cookie consent button not found or already accepted
4551885 - News link not found

 [ 8 / 1470 ]
Cookie consent button not found or already accepted
4536824 - News link not found

 [ 9 / 1470 ]
1 - 1 of 1 records
Total pages: 1
Processing page 1
4998306 - News link clicked -  1

 [ 10 / 1470 ]
Cookie consent button not found or already accepted
1 - 1 of 1 records
Total pages: 1
Processing page 1
4349

In [58]:
# Check:
comps

['4998306',
 '4349418',
 '6613555',
 '4347921',
 '4097201',
 '100877',
 '4215589',
 '11269319',
 '4351032',
 '4143805',
 '4352852',
 '4307943',
 '4307943',
 '4185123',
 '1032865',
 '4346670',
 '4206027',
 '100034886',
 '4121865',
 '4157568',
 '102392',
 '4121781',
 '4008572',
 '102387',
 '7081574',
 '7081574',
 '1023585',
 '4349838',
 '4823420',
 '1017974',
 '4348788',
 '4293719',
 '4293719',
 '4293719',
 '4412401',
 '4095422',
 '1024331',
 '100607',
 '6616460',
 '4393570',
 '4393570',
 '4393570',
 '4348308',
 '4308425',
 '4307754',
 '4686138',
 '1020145',
 '4428746',
 '100669',
 '4376963',
 '4376963',
 '4376963',
 '4376963',
 '4376963',
 '1013355',
 '4352512',
 '4090994',
 '4348501',
 '4659472',
 '4543503',
 '4543503',
 '4773080',
 '4773080',
 '4593351',
 '4096947',
 '4354229',
 '4550101',
 '28081274',
 '28035576',
 '4282019',
 '1022254',
 '4057257',
 '4214351',
 '4159374',
 '1004669',
 '1013751',
 '4348058',
 '4360334',
 '8543755',
 '4350039',
 '4143415',
 '4057736',
 '4620258',
 '41

In [59]:
# Check:
links_dict

{'4998306': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=76527571&KeyProductLinkType=2'],
 '4349418': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=76640167&KeyProductLinkType=2'],
 '6613555': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=81268460&KeyProductLinkType=2',
  'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=80670301&KeyProductLinkType=2',
  'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=78446050&KeyProductLinkType=2',
  'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=78417891&KeyProductLinkType=2',
  'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=77726499&KeyProductLinkType=2',
  'https://www-capit

In [54]:
# save the companies and links to articles as a json file 
import json

file = open("comps_and_links_updated.json", "w")
json.dump(links_dict, file)
file.close()

In [55]:
# check if it saved correctly 
file2 = open("comps_and_links_updated.json", "r")
data = json.load(file2)
file2.close()

print(data)

{'4998306': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=76527571&KeyProductLinkType=2'], '4349418': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=76640167&KeyProductLinkType=2'], '6613555': ['https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=81268460&KeyProductLinkType=2', 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=80670301&KeyProductLinkType=2', 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=78446050&KeyProductLinkType=2', 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=78417891&KeyProductLinkType=2', 'https://www-capitaliq-spglobal-com.uaccess.univie.ac.at/apisv3/spg-webplatform-core/news/article?Id=77726499&KeyProductLinkType=2', 'https://www-capitaliq-spgloba

In [57]:
#Check the number of links
for key, links in data.items():
    print(f"{key} - Number of links: {len(links)}")

4998306 - Number of links: 1
4349418 - Number of links: 1
6613555 - Number of links: 6
4347921 - Number of links: 14
4097201 - Number of links: 2
100877 - Number of links: 17
4215589 - Number of links: 1
11269319 - Number of links: 12
4351032 - Number of links: 4
4143805 - Number of links: 16
4352852 - Number of links: 2
4307943 - Number of links: 29
4185123 - Number of links: 1
1032865 - Number of links: 2
4346670 - Number of links: 1
4206027 - Number of links: 5
100034886 - Number of links: 14
4121865 - Number of links: 9
4157568 - Number of links: 1
102392 - Number of links: 1
4121781 - Number of links: 1
4008572 - Number of links: 3
102387 - Number of links: 3
7081574 - Number of links: 23
1023585 - Number of links: 3
4349838 - Number of links: 1
4823420 - Number of links: 11
1017974 - Number of links: 6
4348788 - Number of links: 2
4293719 - Number of links: 43
4412401 - Number of links: 6
4095422 - Number of links: 8
1024331 - Number of links: 4
100607 - Number of links: 2
661646

In [62]:
# NOW, we need to get the articles themselves:
# This time I will start with a function from the get-go
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re

# Dictionary of company codes and URLs
companies = links_dict



# Function to find the content area using multiple selectors
def find_content_area(driver):
    selectors = [
        {'by': By.CSS_SELECTOR, 'value': 'div[data-testid="newsArticleDJ"]'},
        {'by': By.ID, 'value': 'lazy-article-div'}
    ]
    for selector in selectors:
        try:
            element = driver.find_element(selector['by'], selector['value'])
            if element:
                return element
        except:
            continue
    return None  # Return None if no content area is found
    
# Function to extract article text using multiple possible selectors
def extract_article_text(driver):
    selectors = [
        {'by': By.CSS_SELECTOR, 'value': 'div[data-testid="ArticleContent_0"]'},
        {'by': By.CSS_SELECTOR, 'value': 'div[id="newsArticleMI"]'}
    ]
    for selector in selectors:
        try:
            element = driver.find_element(selector['by'], selector['value'])
            if element:
                return element.text
        except:
            continue
    return ""  # Return empty string if no article content is found

# Regular expression pattern for dates
date_pattern = re.compile(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday), \b(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4} \d{1,2}:\d{2} (AM|PM) ET')

# Function to extract the first matching date
def extract_date(content_area):
    texts = content_area.find_elements(By.XPATH, ".//*[contains(@class, 'spg-text-secondary')]")
    for text in texts:
        match = date_pattern.search(text.text)
        if match:
            return match.group()
    return ""  # Return empty string if no date is found

# Iterate over companies and their URLs
def database_composition(companies):
    # Initialize a list to store data
    data = []
    
    total_links = sum(len(links) for links in companies.values())
    k = 0
    
    for company_code, urls in companies.items():
        for url in urls:

            k = k + 1
            print(str(k), "/", str(total_links))
            
            driver.get(url)
            time.sleep(4)  # Wait for the page to load completely
    
            # Attempt to find the content area
            content_area = find_content_area(driver)
            if not content_area:
                print("bad content area for:   ", url)
                continue  # Skip this URL if the content area is not found
    
            # Extract all titles and concatenate them within the content area
            titles = content_area.find_elements(By.CLASS_NAME, "ignore-for-localizejs")
            title_text = "; ".join(title.text for title in titles)
    
            
    
            # Extract the first publication date
            publication_date = extract_date(content_area)
            if publication_date == "":
                print("bad date for:   ", url)
                
            # Extract the article text
            article_text = extract_article_text(driver)
    
            # Append the extracted data to the list
            data.append([company_code, url, title_text, publication_date, article_text])
    
    # Create a DataFrame
    final_data = pd.DataFrame(data, columns=["company_code", "url", "title", "publication_date", "article"])
    
    # Save the DataFrame to CSV
    final_data.to_csv("company_articles_updated.csv", index=False)
    
    # Close the WebDriver
    #driver.quit()
    
    print("Data extraction and saving complete.")

In [63]:
# Making of a databse

database_composition(data)

1 / 4310
2 / 4310
3 / 4310
4 / 4310
5 / 4310
6 / 4310
7 / 4310
8 / 4310
9 / 4310
10 / 4310
11 / 4310
12 / 4310
13 / 4310
14 / 4310
15 / 4310
16 / 4310
17 / 4310
18 / 4310
19 / 4310
20 / 4310
21 / 4310
22 / 4310
23 / 4310
24 / 4310
25 / 4310
26 / 4310
27 / 4310
28 / 4310
29 / 4310
30 / 4310
31 / 4310
32 / 4310
33 / 4310
34 / 4310
35 / 4310
36 / 4310
37 / 4310
38 / 4310
39 / 4310
40 / 4310
41 / 4310
42 / 4310
43 / 4310
44 / 4310
45 / 4310
46 / 4310
47 / 4310
48 / 4310
49 / 4310
50 / 4310
51 / 4310
52 / 4310
53 / 4310
54 / 4310
55 / 4310
56 / 4310
57 / 4310
58 / 4310
59 / 4310
60 / 4310
61 / 4310
62 / 4310
63 / 4310
64 / 4310
65 / 4310
66 / 4310
67 / 4310
68 / 4310
69 / 4310
70 / 4310
71 / 4310
72 / 4310
73 / 4310
74 / 4310
75 / 4310
76 / 4310
77 / 4310
78 / 4310
79 / 4310
80 / 4310
81 / 4310
82 / 4310
83 / 4310
84 / 4310
85 / 4310
86 / 4310
87 / 4310
88 / 4310
89 / 4310
90 / 4310
91 / 4310
92 / 4310
93 / 4310
94 / 4310
95 / 4310
96 / 4310
97 / 4310
98 / 4310
99 / 4310
100 / 4310
101 / 43

In [64]:
# Load the data 
final_data = pd.read_csv("company_articles_updated.csv")
final_data

Unnamed: 0,company_code,url,title,publication_date,article
0,4998306,https://www-capitaliq-spglobal-com.uaccess.uni...,European banks' capital offerings rebound to b...,"Wednesday, July 19, 2023 5:48 AM ET",Capital offerings by banks in Europe recovered...
1,4349418,https://www-capitaliq-spglobal-com.uaccess.uni...,Condor Gold Says It Has Received Offers for Ni...,"Friday, July 21, 2023 4:02 AM ET",By Christian Moess Laursen\nCondor Gold said F...
2,6613555,https://www-capitaliq-spglobal-com.uaccess.uni...,*Calidus Resources Price Target Raised 3.6% to...,"Wednesday, April 17, 2024 11:05 PM ET","(END) Dow Jones Newswires\nApril 17, 2024 23:0..."
3,6613555,https://www-capitaliq-spglobal-com.uaccess.uni...,*Calidus Resources Price Target Cut 10% to A$0...,"Thursday, February 29, 2024 4:12 PM ET","(END) Dow Jones Newswires\nFebruary 29, 2024 1..."
4,6613555,https://www-capitaliq-spglobal-com.uaccess.uni...,*Calidus Resources Upgraded to Speculative Buy...,"Wednesday, November 15, 2023 12:48 AM ET","(END) Dow Jones Newswires\nNovember 15, 2023 0..."
...,...,...,...,...,...
4300,4349065,https://www-capitaliq-spglobal-com.uaccess.uni...,"UK Growth Is a Headache for the BOE; Friday, A...","Friday, August 11, 2023 6:31 AM ET",UK Growth Is a Headache for the BOE\n0850 GM...
4301,4349065,https://www-capitaliq-spglobal-com.uaccess.uni...,FTSE 100 Falls On Trader Caution After Strong ...,"Friday, August 11, 2023 4:44 AM ET",FTSE 100 Falls On Trader Caution After Stron...
4302,4349065,https://www-capitaliq-spglobal-com.uaccess.uni...,FTSE 100 Seen Opening Lower as Traders Weigh U...,"Friday, August 11, 2023 2:44 AM ET",FTSE 100 Seen Opening Lower as Traders Weigh...
4303,4349065,https://www-capitaliq-spglobal-com.uaccess.uni...,"Chaarat Gold Holdings' Kapan Production Falls,...","Friday, August 11, 2023 2:23 AM ET",By Michael Susin\nChaarat Gold Holdings has ...
