In [71]:
import pandas as pd
import re

results_df = pd.read_csv('results/newspaper_results.csv')

In [72]:
def extract_investment_amount(content):
    # Regex patterns to extract investment amounts
    # Therse are generated with GPT
    investment_patterns = [
        r'\$\d+(?:,\d{3})*(?:\.\d+)?(?:m|b)?',  # Dollar amounts with commas, optionally followed by m or b
        r'€\d+(?:,\d{3})*(?:\.\d+)?(?:m|b)?',   # Euro amounts with commas, optionally followed by m or b
        r'\$\d+(?:,\d{3})*(?:\.\d+)?\s+m',      # Dollar amounts in millions with space
        r'\$\d+(?:,\d{3})*(?:\.\d+)?\s+b',      # Dollar amounts in billions with space
        r'€\d+(?:,\d{3})*(?:\.\d+)?\s+m',       # Euro amounts in millions with space
        r'€\d+(?:,\d{3})*(?:\.\d+)?\s+b',       # Euro amounts in billions with space
        r'\d+(?:,\d{3})*(?:\.\d+)?\s+dollars',  # Dollar amounts in words
        r'\d+(?:,\d{3})*(?:\.\d+)?\s+euros',    # Euro amounts in words
        r'\d+(?:,\d{3})*(?:\.\d+)?\s+million',  # Million amounts in words
        r'\d+(?:,\d{3})*(?:\.\d+)?\s+billion'   # Billion amounts in words
    ]
    investments = []
    for pattern in investment_patterns:
        matches = re.findall(pattern, content)
        investments.extend(matches)
    return investments

results_df['investments'] = results_df['final_content'].apply(extract_investment_amount)

In [73]:
def extract_solar_energy_info(content):
    # Regex patterns to extract solar energy production amounts
    # Therse are generated with GPT
    energy_patterns = [
    r'solar energy production\s*[:=]?\s*(\d+(\.\d+)?\s*(megawatts|gigawatts|kw|mw|gw))',
    r'produces (\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw) of solar energy',
    r'solar power\s*[:=]?\s*(\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw)',
    r'(\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw) of solar energy production',
    r'solar\s*(\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw)',
    r'(\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw)\s*solar',
    r'(\d+(\.\d+)?)\s*(megawatts|gigawatts|kw|mw|gw)',

    r'(\d+(\.\d+)?\s*megawatts)',
    r'(\d+(\.\d+)?\s*gigawatts)',
    r'(\d+(\.\d+)?\s*kilowatts)',
    r'(\d+(\.\d+)?\s*mw)',
    r'(\d+(\.\d+)?\s*gw)',
    r'(\d+(\.\d+)?\s*kw)'
]

    
    solar_energy = []
    for pattern in energy_patterns:
        matches = re.findall(pattern, content)
        solar_energy.extend(matches)
    
    return solar_energy

results_df['solar_energy'] = results_df['final_content'].apply(extract_solar_energy_info)

In [74]:
results_df.to_csv('results/final_results.csv', index=False)

In [75]:
results_df

Unnamed: 0,company,source,title,desc,date,link,detailed_content,final_content,investing_in_solarparks,investments,solar_energy
0,Enviria,EU-Startups,Frankfurt-based ENVIRIA secures €185 million t...,"ENVIRIA, Germany's leading commercial and indu...",29 Feb 2024,https://www.eu-startups.com/2024/02/frankfurt-...,"ENVIRIA, Germany’s leading commercial and indu...","enviria, germany’s leading commercial and indu...",True,"[$11, €185, $11 b, €185 m, 185 million, 11 bil...","[(2.3, .3, gw), (1.7, .7, gw), (2.3gw, .3), (1..."
1,Enviria,Sifted,Solar panel startup Enviria secures $200m from...,Frankfurt-based Enviria has secured $200m in e...,29 Feb 2024,https://sifted.eu/articles/enviria-blackrock-c...,Article `download()` failed with 403 Client Er...,frankfurt-based enviria has secured $200m in e...,True,[$200m],[]
2,Enviria,Tech.eu,Germany's Enviria targets commercial solar ene...,German solar startup Enviria raises over $200M...,29 Feb 2024,https://tech.eu/2024/02/29/germanys-enviria-ta...,Frankfurt-based Enviria has secured over $200 ...,german solar startup enviria raises over $200m...,True,[$200m],[]
3,Enviria,Renewables Now,Galileo sheds interest in Enviria to BlackRock...,Pan-European renewables developer Galileo Gree...,5 Mar 2024,https://renewablesnow.com/news/galileo-sheds-i...,Article `download()` failed with 403 Client Er...,pan-european renewables developer galileo gree...,False,[],[]
4,Enviria,pv magazine International,BlackRock invests $200 million in Enviria,BlackRock has invested €183 million ($200.2 mi...,11 Mar 2024,https://www.pv-magazine.com/2024/03/11/blackro...,BlackRock has agreed to invest €183 million in...,blackrock has invested €183 million ($200.2 mi...,False,"[$200.2, €183, $200.2 m, €183 m, 183 million, ...",[]
...,...,...,...,...,...,...,...,...,...,...,...
120,Merkle,Südkurier,(Anzeige) 10 Jahre Geba GmbH und ein Jahr Geba...,Rickenbach (psc) Die Firma GEBA GmbH mit Sitz ...,23 Mar 2019,https://www.suedkurier.de/region/hochrhein/ric...,Rickenbach (psc) Die Firma GEBA GmbH mit Sitz ...,rickenbach (psc) die firma geba gmbh mit sitz ...,False,[],[]
121,Merkle,The Merkle News,Will Mining Cryptocurrency in the Desert Using...,Mining Bitcoin or any other cryptocurrency is ...,5 Jul 2017,https://themerkle.com/will-mining-cryptocurren...,About The Author\n\nJP Buntinx is a FinTech an...,mining bitcoin or any other cryptocurrency is ...,False,[],[]
122,Merkle,NOKZEIT,24. Dezember 2021 - Solarpark - „Made in Seckach“,Auf diesem Gelände soll der Solarpark entstehe...,24 Dec 2021,https://www.nokzeit.de/2021/12/24/solarpark-ma...,Auf diesem Gelände soll der Solarpark entstehe...,auf diesem gelände soll der solarpark entstehe...,False,[],"[(7, , mw), (7 mw, )]"
123,Merkle,Investopedia,Is Solar-Powered Cryptocurrency Mining the Nex...,In the search to make cryptocurrency mining pr...,7 Nov 2017,https://www.investopedia.com/news/solarpowered...,Cryptocurrency mining is a difficult and costl...,in the search to make cryptocurrency mining pr...,True,[],[]


It turned out all of the companies are investing in solar. Even the HIH Invest Real Estate Austria GmbH invested in solarpark shows an article published 21 Mar 2024. Link: https://realassets.ipe.com/news/hih-invest-buys-solar-park-in-southern-spain/10072345.article

## Final Aggregations
* We can convert all findings into numberical values and aggregate them to derive a final number for both inversment amount and generated energy
* This numbers will not be reliable but can be useful to have an idea
* For the labels, we can just take the max
* However, due to my exams I cannot spend more time on this task. Results still can be inspected manually from the .csv file