In [99]:
import requests
import re
from bs4 import BeautifulSoup
import spacy
import pandas as pd
import time
import json

In [100]:
companies_full_names = {
    "Enviria": "ENVIRIA Energy Holding GmbH",
    "Enrego": "ENREGO Energy GmbH",
    "HIH Invest": "HIH Invest Real Estate Austria GmbH",
    "Merkle": "Merkle Germany GmbH" 
}

In [105]:
companies = {
    "Enviria": "https://enviria.energy/en/blog",
    "Enrego": "https://enrego.de/en/about/",
    "HIH Invest": "https://hih.de/en/media/press-release/",
    "Merkle": "https://www.merkle.com/en/about-us/industry-expertise/financial-services.html" 
}

In [106]:
comp_df = pd.DataFrame(companies.items(), columns=["company", "website"])

### Company website parsing

In [107]:
#Add column to df
comp_df["invests in solar"] = None

#Load model
model = spacy.load('en_core_web_sm')

for item in companies:
    # Fetch the webpage
    response = requests.get(companies[item])
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Convert the entire webpage text to lowercase for case-insensitive search
        webpage_text = soup.get_text().lower()
        # Process the text with spaCy
        doc = model(webpage_text)
        
        # Define keywords and phrases indicating solar park investments
        keywords = ["solar park", "solar energy", "solar power", "solar investment", "solar park investment", "solarpark", "solarpark investment"]
        
        # Search for the keywords in the processed text
        invests_in_solar_park = any(phrase for phrase in keywords if phrase in doc.text.lower())
        
        if invests_in_solar_park:
            comp_df.loc[comp_df["company"] == item, "invests in solar"] = True
            print(item + ": The company invests in solar parks.")
            
        else:
            comp_df.loc[comp_df["company"] == item, "invests in solar"] = False
            print(item + ": No clear indication of investment in solar parks found.")
    else:
        
        print("Failed to fetch the webpage.")

Enviria: The company invests in solar parks.
Enrego: The company invests in solar parks.
HIH Invest: No clear indication of investment in solar parks found.
Merkle: No clear indication of investment in solar parks found.


In [108]:
comp_df.to_csv("results/company_site_results.csv", index=False)

### Google News HTML Scraping
* Google returns different results when requests used.
* I even tried to replicate my cookies in my requests but nothing worked
* So I come up with a different solution: Selenium

In [61]:
# Function to scrape news articles from Google News
def scrape_google_news(company):
    search_urls = [f'https://www.google.com/search?q="{company}"+solar&tbm=nws', 
                   f'https://www.google.com/search?q="{company}"+solar+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solarpark&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+park&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+park+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+energy&tbm=nws'
                   f'https://www.google.com/search?q="{company}"+solar+energy+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+power&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+power+investment&tbm=nws'
                   ]
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
        }
    articles = []
    for search_url in search_urls:
        print(search_url)
        time.sleep(5)  # Wait for 5 seconds before making the next request
        response = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        for item in soup.find_all('div', attrs={'class': "MjjYud"}):
            print("adsad")
            title = item.find('div', attrs={'class': 'n0jPhd ynAwRc MBeuO nDgy9d'}).get_text()
            desc = item.find('div', attrs={'class': 'GI74Re nDgy9d'}).get_text()
            link = item.find('a', href=True)['href']
            articles.append({"title": title, "desc": desc, "link": link})
            print(f"Title: {title}, Description: {desc}, Link: {link}")
    return articles

# Gather articles for each company
articles = {}
for company in companies:
    articles[company] = scrape_google_news(company)

# Print out the gathered articles for inspection
for company, arts in articles.items():
    print(f"Articles for {company}:")
    for art in arts:
        print(f"Title: {art['title']}, Link: {art['link']}")

https://www.google.com/search?q=Enviria+solar&tbm=nws
Articles for Enviria:


## Selenium Solution

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException


def start_driver():
    # Set the path to the chromedriver executable
    chromedriver_path = '/usr/local/bin/chromedriver'

    # Start the WebDriver and load the page
    service = Service(chromedriver_path)

    # Start the WebDriver
    driver = webdriver.Chrome(service=service)
    
    # Handle cookie consent
    driver.get('https://www.google.com')

    try:
        # Wait for the cookie consent button to be clickable
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="L2AGLb"]/div'))).click()
        print("Cookie consent accepted.")
    except Exception as e:
        print("Error accepting cookies:", e)

    return driver 

    

def close_driver(driver):
    driver.quit()

def scrape_with_selenium(company, driver, max_pages):
    # Define the search URLs
    search_urls = [f'https://www.google.com/search?q="{company}"+solar&tbm=nws', 
                   f'https://www.google.com/search?q="{company}"+solar+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solarpark&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+park&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+park+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+energy&tbm=nws'
                   f'https://www.google.com/search?q="{company}"+solar+energy+investment&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+power&tbm=nws',
                   f'https://www.google.com/search?q="{company}"+solar+power+investment&tbm=nws'
                   ]
    articles = []
    print(f"Scraping articles for {company}...")
    # Loop through the search URLs
    for search_url in search_urls:
        driver.get(search_url)
        current_page = 0
        # Loop through the search results pages
        while current_page < max_pages:
            # Wait for the search results to load
            parsed_articles = driver.find_elements(By.CSS_SELECTOR, 'div#rso > div >div>div>div')
            # Parse the articles on the current page
            for article_div in parsed_articles:
                try:
                    # Extract the article link
                    link = article_div.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    # Extract the article details
                    items = article_div.find_elements(By.CSS_SELECTOR, 'a>div>div>div')
                    article = [item.text for item in items]
                    # Check if the article has all the required details
                    if len(article) >= 5:
                        articles.append({"source": article[1], "title": article[2], "desc": article[3], "date": article[4], "link": link})
                except Exception as e:
                    print("Error parsing article:", e)

            # Try to find and click the "Next" button to go to the next page
            try:
                next_button = driver.find_element(By.XPATH, '//*[@id="pnnext"]')
                if next_button:
                    next_button.click()
                    current_page += 1
                else:
                    break  # If "Next" button not found, exit the loop
            except NoSuchElementException:
                break  # If "Next" button not found, exit the loop
    print(f"Scraped {len(articles)} articles for {company}.")
    return articles

In [21]:
driver = start_driver()

articles = {}
for company in companies:
    articles[company] = scrape_with_selenium(company, driver=driver, max_pages=3)
close_driver(driver)

Cookie consent accepted.
Scraping articles for Enviria...
Scraped 152 articles for Enviria.
Scraping articles for Enrego...
Scraped 24 articles for Enrego.
Scraping articles for HIH Invest...
Scraped 49 articles for HIH Invest.
Scraping articles for Merkle...
Scraped 37 articles for Merkle.


In [22]:
# Convert articles dictionary into a DataFrame
articles_df = pd.DataFrame()

for company, news in articles.items():
    temp_df = pd.DataFrame(news)
    temp_df['company'] = company  # Add a column for the company name if desired
    temp_df = temp_df.reindex(columns=['company', 'source', 'title', 'desc', 'date', 'link'])
    articles_df = pd.concat([articles_df, temp_df], ignore_index=True)


In [52]:
articles_df_unique = articles_df.drop_duplicates(subset=['company', 'title', 'desc'], inplace=False)

In [53]:
articles_df_unique

Unnamed: 0,company,source,title,desc,date,link
0,Enviria,EU-Startups,Frankfurt-based ENVIRIA secures €185 million t...,"ENVIRIA, Germany's leading commercial and indu...",29 Feb 2024,https://www.eu-startups.com/2024/02/frankfurt-...
1,Enviria,Sifted,Solar panel startup Enviria secures $200m from...,Frankfurt-based Enviria has secured $200m in e...,29 Feb 2024,https://sifted.eu/articles/enviria-blackrock-c...
2,Enviria,Tech.eu,Germany's Enviria targets commercial solar ene...,German solar startup Enviria raises over $200M...,29 Feb 2024,https://tech.eu/2024/02/29/germanys-enviria-ta...
3,Enviria,Renewables Now,Galileo sheds interest in Enviria to BlackRock...,Pan-European renewables developer Galileo Gree...,5 Mar 2024,https://renewablesnow.com/news/galileo-sheds-i...
4,Enviria,pv magazine International,BlackRock invests $200 million in Enviria,BlackRock has invested €183 million ($200.2 mi...,11 Mar 2024,https://www.pv-magazine.com/2024/03/11/blackro...
...,...,...,...,...,...,...
252,Merkle,Südkurier,(Anzeige) 10 Jahre Geba GmbH und ein Jahr Geba...,Rickenbach (psc) Die Firma GEBA GmbH mit Sitz ...,23 Mar 2019,https://www.suedkurier.de/region/hochrhein/ric...
253,Merkle,The Merkle News,Will Mining Cryptocurrency in the Desert Using...,Mining Bitcoin or any other cryptocurrency is ...,5 Jul 2017,https://themerkle.com/will-mining-cryptocurren...
254,Merkle,NOKZEIT,24. Dezember 2021 - Solarpark - „Made in Seckach“,Auf diesem Gelände soll der Solarpark entstehe...,24 Dec 2021,https://www.nokzeit.de/2021/12/24/solarpark-ma...
260,Merkle,Investopedia,Is Solar-Powered Cryptocurrency Mining the Nex...,In the search to make cryptocurrency mining pr...,7 Nov 2017,https://www.investopedia.com/news/solarpowered...


### Rule-based checking

In [73]:
articles_df_unique = articles_df.drop_duplicates(subset=['company', 'title', 'desc'], inplace=False)

keywords = [
    'invests in solar park', 'investment in solar park', 'solar park investment', 
    'investing in solar park', 'solar park', 'solar energy project', 
    'solar power plant', 'renewable energy investment', 'solar project', 'solar energy investment',
    'solar power', 'solarpark', 'investing in solar energy', 'investing in solar power'
]

# Function to check if the company is investing in solar parks
def check_solar_investment(title, desc):
    combined_text = f"{title} {desc}".lower()
    for keyword in keywords:
        if keyword in combined_text:
            return True
    return False

# Apply the function to the DataFrame and create a new column
articles_df_unique['investing_in_solarparks'] = articles_df_unique.apply(lambda row: check_solar_investment(row['title'], row['desc']), axis=1)

articles_df_unique.to_csv("results/rule_based_results.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_df_unique['investing_in_solarparks'] = articles_df_unique.apply(lambda row: check_solar_investment(row['title'], row['desc']), axis=1)


In [74]:
rule_based_results = articles_df_unique.groupby(['company', 'investing_in_solarparks']).size()
rule_based_results

company     investing_in_solarparks
Enrego      False                      17
            True                        7
Enviria     False                      40
            True                       14
HIH Invest  False                      13
            True                        2
Merkle      False                      28
            True                        4
dtype: int64

### NLP Based checking with SpaCy

In [79]:
# Define custom patterns for solar park investments
solar_patterns = [
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "park"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "project"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "energy"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "power"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "park"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "project"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "energy"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "power"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "initiative"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "farm"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "facility"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "installation"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "project"}, {"LOWER": "funding"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "energy"}, {"LOWER": "funding"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "power"}, {"LOWER": "funding"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "park"}, {"LOWER": "funding"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "initiative"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "farm"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "facility"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "installation"}, {"LOWER": "investment"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "energy"}, {"LOWER": "project"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "power"}, {"LOWER": "project"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "investment"}, {"LOWER": "fund"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "park"}, {"LOWER": "investment"}, {"LOWER": "fund"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "project"}, {"LOWER": "investment"}, {"LOWER": "fund"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "energy"}, {"LOWER": "investment"}, {"LOWER": "fund"}]},
    {"label": "SOLAR_INVESTMENT", "pattern": [{"LOWER": "solar"}, {"LOWER": "power"}, {"LOWER": "investment"}, {"LOWER": "fund"}]},
]

In [80]:
articles_df_unique = articles_df.drop_duplicates(subset=['company', 'title', 'desc'], inplace=False)

# Function to extract relevant information using spaCy
def extract_info_spacy(title, desc, solar_patterns=solar_patterns):
    # Load the spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Add the custom patterns to the model
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(solar_patterns)
    combined_text = f"{title} {desc}"
    doc = nlp(combined_text)
    investing_in_solarparks = False
    
    for ent in doc.ents:
        if ent.label_ == "SOLAR_INVESTMENT":
            investing_in_solarparks = True
    
    return investing_in_solarparks

# Apply the function to the DataFrame and create a new column
articles_df_unique['investing_in_solarparks'] = articles_df_unique.apply(lambda row: extract_info_spacy(row['title'], row['desc']), axis=1)

articles_df_unique.to_csv("results/nlp_based_results.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_df_unique['investing_in_solarparks'] = articles_df_unique.apply(lambda row: extract_info_spacy(row['title'], row['desc']), axis=1)


In [81]:
spacy_results = articles_df_unique.groupby(['company', 'investing_in_solarparks']).size()
spacy_results

company     investing_in_solarparks
Enrego      False                      15
            True                        9
Enviria     False                      34
            True                       20
HIH Invest  False                      13
            True                        2
Merkle      False                      24
            True                        8
dtype: int64

### Further parsing with NewsPaper
* Using gathered links from Google News to fetch article data from the news sites
* Extraction information with SpaCy
  

In [109]:
from newspaper import Article
articles_df_unique = articles_df.drop_duplicates(subset=['company', 'title', 'desc'], inplace=False)

# Function to fetch and parse content using newspaper
def fetch_and_parse(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        return str(e)
    


# Add a new column to store the fetched content
articles_df_unique['detailed_content'] = articles_df_unique['link'].apply(fetch_and_parse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles_df_unique['detailed_content'] = articles_df_unique['link'].apply(fetch_and_parse)


In [161]:
from nltk.corpus import stopwords

#nltk.download('stopwords')

# Function to add the final content to the DataFrame
# If the detailed content contains the the beginnig of description, use the detailed content
def add_final_content(row):
    desc = row['desc']
    detailed_content = row['detailed_content']

    if pd.isna(desc) or not isinstance(desc, str):
        desc = ""
    if pd.isna(detailed_content) or not isinstance(detailed_content, str):
        detailed_content = ""
    
    desc = desc.lower()
    detailed_content = detailed_content.lower()

    key_terms = desc.split()
    number_of_matches = 0
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_terms = [term for term in key_terms if term not in stop_words]
    # Check if the key terms are present in the detailed content
    for term in filtered_terms:
        if term in detailed_content:
            number_of_matches += 1
        # If more than 8 key terms are found in the detailed content, return the detailed content
        # I just used 8 as an arbitrary number, this can be adjusted based on the use case
        if number_of_matches >= 8:
            return detailed_content
    return desc

articles_df_unique['final_content'] = articles_df_unique.apply(add_final_content, axis=1)

In [163]:
# Function to extract information using spaCy
def extract_info(text, solar_patterns=solar_patterns):
    nlp = spacy.load("en_core_web_sm")

    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(solar_patterns)

    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "SOLAR_INVESTMENT":
            return True
    return False



# Apply the extraction function to the final_content column
articles_df_unique['investing_in_solarparks'] = articles_df_unique['final_content'].apply(extract_info)

articles_df_unique.to_csv("results/newspaper_results.csv", index=False)

In [164]:
newspaper_results = articles_df_unique.groupby(['company', 'investing_in_solarparks']).size()
newspaper_results

company     investing_in_solarparks
Enrego      False                       6
            True                       18
Enviria     False                      18
            True                       36
HIH Invest  False                      12
            True                        3
Merkle      False                      21
            True                       11
dtype: int64

In [165]:
print("Rule-based results: \n", rule_based_results)
print("SpaCy-based results: \n", spacy_results)
print("Newspaper-based results: \n", newspaper_results)

Rule-based results: 
 company     investing_in_solarparks
Enrego      False                      17
            True                        7
Enviria     False                      40
            True                       14
HIH Invest  False                      13
            True                        2
Merkle      False                      28
            True                        4
dtype: int64
SpaCy-based results: 
 company     investing_in_solarparks
Enrego      False                      15
            True                        9
Enviria     False                      34
            True                       20
HIH Invest  False                      13
            True                        2
Merkle      False                      24
            True                        8
dtype: int64
Newspaper-based results: 
 company     investing_in_solarparks
Enrego      False                       6
            True                       18
Enviria     False                     

as seen, using newspaper to gather more details about articles drastically changes the results. 

## LLM Based Approach
* Computationally consuming
* Can give more accurate results and extract more information