In [1]:
from urllib.request import Request, urlopen
import re
from bs4 import BeautifulSoup
import spacy
import requests

# Web Scrapping

In [2]:
def get_google_news_results(query):
    headers = {
        'User-Agent': 'Mozilla/5.0 Chrome/91.0.4472.124 Safari/537.36'}
    url = f'https://www.google.com/search?q={query}&tbm=nws'
    response = Request(url, headers=headers)
    webpage = urlopen(response).read()
    return webpage

def parse_google_news_results(company_name, webpage):
    articles = []
    with requests.Session() as c:
        soup = BeautifulSoup(webpage, 'html5lib')
        for item in soup.find_all('div', class_='Gx5Zad fP1Qef xpd EtOod pkphOe'):
            link = item.find('a', href=True)['href']
            link = re.search(r'url\?q=(.+?)&', link).group(1)
            articles.append({
                'link': link,
                'language': 'de'
            })
    return articles

def scrape_article_content(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join([para.get_text() for para in paragraphs])
        return content
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None



company_names = ["ENVIRIA+Energy+Holding+GmbH", "ENREGO+Energy+GmbH", "HIH+Invest+Real+Estate+Austria+GmbH", "Merkle+Germany+GmbH"]

company_dict = {}
for company_name in company_names:
    query = f"{company_name}+solarpark+investieren"
    webpage = get_google_news_results(query)
    company_articles = parse_google_news_results(company_name, webpage)
    for article in company_articles:
        content = scrape_article_content(article['link'])
        article['content'] = content
    company_dict[company_name] = company_articles

#### We can also use pygooglenews instead of all this

# Using NLP model to extract the relavant informations

In [6]:
# Adding a dummy company to test the logic of the function
company_dict["XYZ"] = [{
    'link': 'xyz.abc',
    'content': """
                Das Unternehmen ABC gab kürzlich eine bedeutende Investition 
                in einen neuen Solarpark bekannt, der voraussichtlich 150 Megawatt (MW) 
                erneuerbare Energie erzeugen wird. Dieses 120-Millionen-Dollar-Projekt 
                erstreckt sich über eine Fläche von 500 Hektar und wird voraussichtlich 
                jährlich etwa 45.000 Haushalte mit Strom versorgen. Der Solarpark wird über 600.000 
                Solarmodule verfügen und modernste Photovoltaiktechnologie nutzen, um die Effizienz zu 
                maximieren. Das Unternehmen ABC geht davon aus, dass der Solarpark bis Ende 2025 die 
                Kohlenstoffemissionen um 200.000 Tonnen pro Jahr reduzieren wird, was seinem Engagement 
                für Nachhaltigkeit entspricht und einen wesentlichen Beitrag zu den regionalen Zielen für 
                saubere Energie leistet. Dieser strategische Schritt unterstreicht das Engagement von ABC,
                im Bereich der erneuerbaren Energien führend zu sein und eine umweltfreundlichere Zukunft zu
                fördern.""",
    'language': 'de'
}]

nlp = spacy.load('de_core_news_sm')

def extract_investment_details(company_name, text, nlp):
    doc = nlp(text)
    investments = {"equity_checks": [], "megawatts": [], "solarpark_investment": [], "sentiment": ""}

    # Extract entities
    for ent in doc.ents:
        if ent.label_ == "MONEY":
            investments["equity_checks"].append(ent.text)
        if ent.label_ == "QUANTITY":
            if "MW" in ent.text or "megawatts" in ent.text:
                investments["megawatts"].append(ent.text)
        if "solarpark" in ent.text.lower() or "solar park" in ent.text.lower():
            investments["solarpark_investment"] = "Present"

    investments["equity_checks"] = "N/A" if len(investments["equity_checks"]) == 0 else investments["equity_checks"]
    investments["megawatts"] = "N/A" if len(investments["megawatts"]) == 0 else investments["megawatts"]
    investments["solarpark_investment"] = "N/A" if len(investments["solarpark_investment"]) == 0 else investments["solarpark_investment"]
        
        
    
    # Sentiment analysis. However, this provides very rough estimate.
    # Alternatively, we can use RAG to ask in the specified text whether company has invested.
    text_blob = TextBlob(text)
    sentiment = text_blob.sentiment.polarity
    if sentiment > 0:
        investments["sentiment"] = "Positive"
    elif sentiment < 0:
        investments["sentiment"] = "Negative"
    else:
        investments["sentiment"] = "Neutral"
    
    return investments

for company_name in company_dict.keys():
    for article in company_dict[company_name]:
        content = article['content']
        investment_details = extract_investment_details(company_name = company_name, text=content, nlp = nlp)
        if investment_details['solarpark_investment'] == "Present":
            print(f"Company: {company_name}, Investment Details: {investment_details}, Link: {article['link']}")

Company: ENVIRIA+Energy+Holding+GmbH, Investment Details: {'equity_checks': 'N/A', 'megawatts': 'N/A', 'solarpark_investment': 'Present', 'sentiment': 'Positive'}, Link: https://www.solarserver.de/2023/09/22/neuer-vorstand-leitet-bundesverband-neue-energiewirtschaft-bne/
Company: XYZ, Investment Details: {'equity_checks': 'N/A', 'megawatts': 'N/A', 'solarpark_investment': 'Present', 'sentiment': 'Neutral'}, Link: xyz.abc
