# Dependencies

In [1]:
!pip install sentence-transformers




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Scraping the Web Page

In [28]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Set paths for geckodriver and Firefox binary
gecko_driver_PATH = './geckodriver.exe'
firefox_binary_PATH = r"C:\Program Files\Mozilla Firefox\firefox.exe"  # Replace with the firefox.exe path on your machine

# Service and options for the Firefox driver
service = Service(executable_path=gecko_driver_PATH)
options = Options()
options.binary_location = firefox_binary_PATH

# Target website URL (listing of projects tagged "heat")
site_url = "https://ispt.eu/projects/?theme-tag=heat"

# Links collected from the listing page, and one record per scraped article page
article_links = []
article_data = []

# Initialize the Firefox WebDriver
driver = webdriver.Firefox(service=service, options=options)

# Everything that uses the browser runs inside try/finally so the Firefox and
# geckodriver processes are always shut down, even when a wait times out or
# navigation fails.
try:
    # Open the listing page
    driver.get(site_url)

    # Wait until the articles are present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "article"))
    )

    # Collect all article links first: navigating away would invalidate the
    # element handles found on the listing page.
    articles = driver.find_elements(By.TAG_NAME, "article")

    for article in articles:
        try:
            # The first anchor inside the article card is taken as the project link
            article_link = article.find_element(By.TAG_NAME, "a").get_attribute("href")
            article_links.append(article_link)
        except Exception as e:
            # Best effort: report the failing card and keep collecting the others
            print(f"Error while collecting an article link: {str(e)}")

    # Now visit each link to scrape the full text
    for link in article_links:
        try:
            # Open the article page
            driver.get(link)

            # Wait until the page body is present (adjust this selector if needed)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Scrape the article's title and the visible text of the whole page.
            # The body text includes site navigation and banners.
            title = driver.find_element(By.TAG_NAME, "h1").text
            full_article_text = driver.find_element(By.TAG_NAME, "body").text

            # Append the title, link, and full article text to the data list
            article_data.append({"Title": title, "Link": link, "Content": full_article_text})

        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl
            print(f"Error while processing an article: {str(e)}")
finally:
    # Close the browser whether or not scraping succeeded
    driver.quit()

# Create a DataFrame and save it as a CSV file
# NOTE(review): the cleaning step loads 'ProofFiles/scraped_articles.csv', not this
# path — confirm whether this file is meant to be copied there.
df = pd.DataFrame(article_data)
df.to_csv("scraped_articles.csv", index=False)




InvalidArgumentException: Message: binary is not a Firefox executable


Skip to content
ISPT
Institute for Sustainable Process Technology
Contact
Menu
Projects
COMPRESORP
Code
UH-20-10
Status
Completed
Start date
April 1, 2015
Upgrading low temperature waste water streams of separation processes with compression resorption heat pumps.
Goal
The goal of the COMPRESORP project is to upgrade low temperature waste streams from the process industry (e.g. cooling water from cooling towers) into valuable utility streams making use of compression-resorption heat pumps which operate in the wet regime.
Incentive
Thermal energy (heat) represents a large part of the global energy usage and about 43% of this energy is used for industrial applications. Ultimately, 20-50% of the energy used is lost via waste heat contained in hot exhaust gases and liquid streams. Nevertheless the share of waste heat recovery (upgraded energy) within the total energy used is still negligible.
Compression Resorption heat Pumps (CRHP) can be used to upgrade waste water streams to streams whi

Skip to content
ISPT
Institute for Sustainable Process Technology
Contact
Menu
Projects
ENCORE – nExt geNeration COmpRession hEat pump
Code
UH-30-03
Status
Ongoing
Start date
January 1, 2019
The ENCORE project, aims to significantly increase the uptake of heat pumps in the industry. To achieve this, the project will demonstrate enhancement of compression heat pump performance by using a pilot scale test unit.
In short:
A demonstrated enhancement of compression heat pump performance
Results of the pilot scale test unit
A look into wider market introduction
About ENCORE
The project consists of different parts that will altogether yield a series of technological improvements for high temperature compression heat pumps. The project results will describe the impact of various changes made to a regular compression heat pump. They will also provide insights for companies into the benefits and possible challenges when applying these heat pumps in their processes. All of these results will allo

In [3]:
# Preview the first rows of the scraped articles frame
df.head()

Unnamed: 0,Title,Link,Content
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Skip to content\nISPT\nInstitute for Sustainab...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Skip to content\nISPT\nInstitute for Sustainab...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Skip to content\nISPT\nInstitute for Sustainab...
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Skip to content\nISPT\nInstitute for Sustainab...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Skip to content\nISPT\nInstitute for Sustainab...


# Cleaning the Data
- We try to remove the static site banners and other unimportant boilerplate sections.

In [13]:
import pandas as pd
import re

# Load the scraped articles CSV from the ProofFiles/ directory
df = pd.read_csv('ProofFiles/scraped_articles.csv')

# Function to split text into lines and drop the lines already seen in earlier texts
def extract_unique_sentences(content, all_sentences):
    """Return the lines of ``content`` that were not seen before, joined by spaces.

    The text is split on newline characters (each line is treated as one
    "sentence"). Lines already present in ``all_sentences`` are dropped, which
    removes boilerplate that repeats from page to page.

    Args:
        content: Raw page text. Any non-string value (for example the NaN that
            pandas uses for an empty CSV cell, or None) is treated as empty text.
        all_sentences: Set of lines seen so far. It is updated in place with the
            lines kept from ``content``.

    Returns:
        The kept lines joined with single spaces; ``""`` when nothing is kept.
    """
    # Missing content has no lines to keep and must not touch the shared set
    if not isinstance(content, str):
        return ""

    # One "sentence" per line of the page text
    sentences = content.split('\n')

    # Filter out lines that earlier calls have already recorded
    unique_sentences = [sentence for sentence in sentences if sentence not in all_sentences]

    # Record the new lines so later texts drop them
    all_sentences.update(unique_sentences)

    return " ".join(unique_sentences)

# Set to track every line seen so far across all articles
all_sentences_set = set()

# Apply the function to each row in the 'Content' column
df['Cleaned_Content'] = df['Content'].apply(lambda x: extract_unique_sentences(x, all_sentences_set))

# Cleaning the first example:
# the first article is processed while the tracking set is still empty, so it keeps
# every line, boilerplate included. Clean it separately by dropping the lines it
# shares with the second article. A dedicated set is used so the tracking set above
# is not overwritten, and the step is skipped when there is no usable second article.
if len(df) > 1:
    first_content = df.iloc[0]['Content']
    second_content = df.iloc[1]['Content']
    if isinstance(first_content, str) and isinstance(second_content, str):
        reference_lines = set(second_content.split('\n'))
        sentences = first_content.split('\n')
        df.loc[df.index[0], 'Cleaned_Content'] = " ".join(
            [sentence for sentence in sentences if sentence not in reference_lines]
        )

# Save the cleaned DataFrame next to the scraped input, which is where the
# relevance-classification step loads it from
df.to_csv('ProofFiles/cleaned_scraped_articles.csv', index=False)

# Display the cleaned DataFrame
df[['Title', 'Link', 'Cleaned_Content']].head()


Unnamed: 0,Title,Link,Cleaned_Content
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Flexible Efficient Electrification of Industri...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,FUSE – FUll ScalE Industrial Heat Pump Using N...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,COMTA – COMpact modular Thermo Acoustic heat p...
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,R-ACES – FRamework for Actual Cooperation on E...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,FLEXSTEAM – Development of heat storage for in...


# Relevance Classification
- H1: A BERT-based approach isn't suitable for keyword matching and string-based classification | `Reject`
- H2: The Dragon retriever approach requires knowledge-base context for matching, so it is not suitable | `Reject`
- H3: GenAI with prompt engineering alone isn't accurate enough -> requires fine-tuning | `Reject`
- H4: TF-IDF with cosine similarity is simple; known irrelevant examples can be used to fix a threshold | `Approved`
- H5: A simple pretrained embedding model followed by cosine similarity is also simple and works well with thresholding | `Approved`

In [15]:
# Reload the cleaned articles from the ProofFiles/ directory and display the frame
df = pd.read_csv('ProofFiles/cleaned_scraped_articles.csv')
df

Unnamed: 0,Title,Link,Content,Cleaned_Content
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Skip to content\nISPT\nInstitute for Sustainab...,Flexible Efficient Electrification of Industri...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Skip to content\nISPT\nInstitute for Sustainab...,FUSE – FUll ScalE Industrial Heat Pump Using N...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Skip to content\nISPT\nInstitute for Sustainab...,COMTA – COMpact modular Thermo Acoustic heat p...
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Skip to content\nISPT\nInstitute for Sustainab...,R-ACES – FRamework for Actual Cooperation on E...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Skip to content\nISPT\nInstitute for Sustainab...,FLEXSTEAM – Development of heat storage for in...
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Skip to content\nISPT\nInstitute for Sustainab...,SPOT: Sustainable PrOcess heaTing UH-30-08 Jan...
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Skip to content\nISPT\nInstitute for Sustainab...,StAgglop: Reducing energy use and material los...
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Skip to content\nISPT\nInstitute for Sustainab...,"The Heat Is On HP-50-01 September 1, 2021 The ..."
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Skip to content\nISPT\nInstitute for Sustainab...,"COMPRESORP UH-20-10 Completed April 1, 2015 Up..."
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Skip to content\nISPT\nInstitute for Sustainab...,Steam and condensate quality WP-20-11 January ...


## For the relevance baseline we add 3 irrelevant articles for thresholding

In [16]:
irr = '''CIRCPET – Circular PET trays for perishable food applications
CodeCP-50-11StatusOngoingStart dateMarch 1, 2023
Packaging is crucial to preserve and protect food that we are using daily. To ensure these qualities, PET trays are often used. However, they have been difficult to recycle. CIRCPET aims to make full recycling, transparent and economically competitive trays possible.

In short:

Tested and validated fully circular PET tray for (food) packaging
New design with improved recyclability, including developing sealing layers
Optimal cost-effectiveness in converting post-use trays into homogenous high quality rPET
Destined to grow, difficult to recycle
The market for PET trays is vast and rapidly growing, making them an essential part of the food supply chain. However, PET trays are currently difficult to recycle due to mixed materials and contamination. As the demand for recycled PET (rPET) in bottles increases, the availability of rPET for trays decreases. This creates an urgent need for the development of a closed-loop system specifically for PET trays.

This project aims to address this challenge by creating a fully circular PET tray solution that meets market demands while minimizing environmental impact.

Achieving circular PET trays
The project brings together multiple parties from the PET tray supply chain together with knowledge institutions. Together 100% circular PET trays will be developed. The project includes the creation of tested and validated circular PET trays that can be used as packaging for perishable food. This also includes films, lids, and labels that are easy to be removed or washed off the packaging. By focusing on mono-PET and limiting the presence of other compounds, the redesigned PET trays should be easier to separate and recycle. All of this should happen at a competitive price and quality.

How this will affect food packaging
Achieving a successful redesign of PET trays will contribute to maintaining valuable carbon in the plastic value chain. This results in a decrease of crude oil demand and less CO2 emissions from end of life incineration. The environmental benefit could be substantial, as tray-to-tray recycling is accompanied by the reduction of a waste stream that now amounts to approximately 900 kilotons per year, which is roughly equal to 3.5 million tons of CO2 emissions in Europe.

Stay involved in our quest for circularity
Join us in in our journey to sustainable packaging by staying up-to-date on project highlights on our LinkedIn page. Explore the Circular Plastics Initiative for more projects, and be a part of the movement towards a greener future.'''
irr2 = '''GRIP on Drying
CodeDR-20-09StatusCompletedStart dateJanuary 1, 2017
This project investigates how inline sensors can help improving product quality and efficiency in 5 industrial cases, using a large consortium of end users, technology suppliers. Together with suppliers of inline moisture sensors and experts the possibilities were investigated to get a GRIP on each drying process. Objective is the application of innovative (sensor) technology that meets the specific innovation needs of the company for drying technology.

In short:

GRIP aims to learn how inline sensors can improve product quality
But with wireless technology new risks are being introduced
So incorporating the human factor is essential when it comes to open innovation
Lower costs for wireless measuring systems
In an industry 4.0 inline sensors will be implemented more and more. Wiring and creating connection to the process control is very expensive. Costs of wireless measuring systems are much lower and can offer more flexibility.

However, with wireless technology new interdependencies and risks are being introduced. As a result of risk analyses recommendations are made for a good mitigation of risks in order to manage the risks there are.

Smart strategy to develop a learning community
One of the results of the GRIP on drying project is that there is a need for smart strategies to develop Learning Communities. Learning Communities are an important link to building and spreading knowledge through education, life long learning and other channels. Learning Communities are stronger when they are supported by an online community.

The success of a Learning Community depends on the objectives of the organisation and participants. Reflection on motivation, objectives and involvement of the participants is recommended. Also to set an objective during a certain period for drawing up of a white paper, a presentation, or a training (Marcelis, 2002).

Also intersting to read
The Topsectors introduced Life Long Learning & Development as a result of the Human Capital road map. It it a great instrument for The Netherlands to stay front runner in the field of knowledge and innovation.

Find more information in our article Experts choose interaction as preferred way of learning.

Grip Op Drogen - Graphic Life long learning
Incorporating the human factor is essential when it comes to open innovation. As a result ISPT introduced Learning Communities as an instrument to stimulate and facilitate knowledge exchange between practitioners, increase interaction among stakeholders, and strengthen collaborations.

For the GRIP project 2 workshops were organised for the Learning Community of Drying experts. A workshop regarding energy savings during thin film drying and a workshop regarding drying technology and heat pumps in drying processes. During this latter workshop the results of our project VERA were investigated regarding the opportunities for heat pumps in the Dutch industry.

Cyber Security
The digitization introduces a new vulnerability, for this reason the program televulnerability of Agentschap Telecom joined the consortium. Their booklet ‘Vijfstappenplan’ (Five step strategy) that was developed within the GRIP on drying project, is creating awareness, and how to assess dependencies and execute risk management.'''
irr3='''BRECSIT – Bio-based REsin CompoSIte Technologies
CodeCS-20-08StatusCompletedStart dateJune 1, 2019
The Bio-based Resin Composite Technologies (BRECSIT) project develops production technologies for high volume production of bio-based and recyclable Plantics-GX composites, thereby replacing conventional, fossil-based plastic composites.

In short:

Currently there are no 100% bio-based thermosets available on the market
Plantics-GX is a competitive alternative to plastics
BRECSIT develops the processes and conditions by which the GX-resin applications can be realized
Plantics-GX resins: an alternative to conventional plastics
Currently there are no 100% bio-based thermosets available on the market. This makes Plantics-GX resins unique, as it presents a cost and functionality competitive alternative to plastics, along with a fully circular product bio-life cycle. Sustainable composites can be formed by combining the GX-resins with bio-based fillers such as hemp fibers.

The challenge of this BRECSIT project is to develop the processes and conditions by which the GX-resin applications can be realized. With these new technologies in hand, high-end, high-volume products can be manufactured using Plantics GX-resin in the Netherlands and Europe. This will yield high quality jobs, major CO2 reductions, increased product safety and reduce the amount of plastics that end up in the environment.

Expected results
The targeted results include the properties and physical characteristics of the products and the know-how to produce on a larger scale. Furthermore, targeted results include knowledge about the critical parameters and process conditions that determine the applicability, reproducibility and material characteristics (including re-use/ recycling and biodegradability) with different fibers, fiber mats and fillers. Finally, the generic knowledge will be widely spread which will encourage the implementations of these technologies and the use of recyclable bio-resin composites.'''

In [17]:
import pandas as pd

# Three hand-picked off-topic project descriptions, used as known-irrelevant
# reference rows when choosing the similarity thresholds
new_rows = pd.DataFrame({
    "Title": ["Irrelevant Case Study 1", "Irrelevant Case Study 2", "Irrelevant Case Study 3"],
    "Link": [None, None, None],
    "Content": [irr, irr2, irr3],
    "Cleaned_Content": [irr, irr2, irr3],  # Same text: these examples need no boilerplate removal
})

# Append the example rows using pd.concat. Rows with the same titles left by a
# previous run of this cell are dropped first, so re-running the cell does not
# duplicate the examples.
df = pd.concat(
    [df[~df["Title"].isin(new_rows["Title"])], new_rows],
    ignore_index=True,
)

df


Unnamed: 0,Title,Link,Content,Cleaned_Content
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Skip to content\nISPT\nInstitute for Sustainab...,Flexible Efficient Electrification of Industri...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Skip to content\nISPT\nInstitute for Sustainab...,FUSE – FUll ScalE Industrial Heat Pump Using N...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Skip to content\nISPT\nInstitute for Sustainab...,COMTA – COMpact modular Thermo Acoustic heat p...
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Skip to content\nISPT\nInstitute for Sustainab...,R-ACES – FRamework for Actual Cooperation on E...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Skip to content\nISPT\nInstitute for Sustainab...,FLEXSTEAM – Development of heat storage for in...
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Skip to content\nISPT\nInstitute for Sustainab...,SPOT: Sustainable PrOcess heaTing UH-30-08 Jan...
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Skip to content\nISPT\nInstitute for Sustainab...,StAgglop: Reducing energy use and material los...
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Skip to content\nISPT\nInstitute for Sustainab...,"The Heat Is On HP-50-01 September 1, 2021 The ..."
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Skip to content\nISPT\nInstitute for Sustainab...,"COMPRESORP UH-20-10 Completed April 1, 2015 Up..."
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Skip to content\nISPT\nInstitute for Sustainab...,Steam and condensate quality WP-20-11 January ...


## TF-IDF + Keyword-Based

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Vocabulary describing the topic "decarbonization of industrial heat"
relevant_keywords = [
    "decarbonization", "industrial heat", "carbon capture", "renewable energy",
    "heat recovery", "energy efficiency", "greenhouse gas", "carbon emissions",
    "heat electrification", "sustainable energy", "fossil fuels", "energy transition",
    "biomass energy", "carbon neutrality", "hydrogen fuel", "solar thermal",
    "waste heat", "low-carbon technologies", "clean energy", "climate action",
]

# The whole keyword list is treated as one pseudo-document
keywords_string = " ".join(relevant_keywords)

# TF-IDF is fitted on the articles followed by the keyword document, so both
# share one vocabulary and one set of IDF weights
vectorizer = TfidfVectorizer()
corpus = df['Cleaned_Content'].tolist()
corpus.append(keywords_string)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Last row = keyword document, all preceding rows = articles
cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

# Similarity at or above this value counts as relevant
threshold = 0.1

# Store the raw score and the thresholded label for every article
scores = cosine_sim[0]
df['Cosine_Score_TF-IDF'] = list(scores)

labels = []
for sim in scores:
    if sim >= threshold:
        labels.append("Relevant")
    else:
        labels.append("Irrelevant")
df['Relevance_TF-IDF'] = labels

# Display the updated DataFrame
df[['Title', 'Link', 'Relevance_TF-IDF', 'Cosine_Score_TF-IDF']]



Unnamed: 0,Title,Link,Relevance_TF-IDF,Cosine_Score_TF-IDF
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Relevant,0.157824
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Relevant,0.120838
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Relevant,0.167908
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Irrelevant,0.084429
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Relevant,0.135338
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Relevant,0.162443
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Irrelevant,0.027362
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Relevant,0.157658
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Relevant,0.115061
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Irrelevant,0.017461


- The irrelevant articles score well below 0.1, so it is a good threshold.

## KeyWords Based + PreTrained

In [19]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



import nltk

# Step 1: Define the keywords related to "decarbonization of industrial heat"
relevant_keywords = [
    "decarbonization", "industrial heat", "carbon capture", 
    "renewable energy", "heat recovery", "energy efficiency", 
    "greenhouse gas", "carbon emissions", "heat electrification", 
    "sustainable energy", "fossil fuels", "energy transition", 
    "biomass energy", "carbon neutrality", "hydrogen fuel", 
    "solar thermal", "waste heat", "low-carbon technologies", 
    "clean energy", "climate action"
]

# Join all keywords into a single string for embedding
keywords_string = " ".join(relevant_keywords)

# Step 2: Preprocess the text (remove stopwords, lemmatize)
lemmatizer = WordNetLemmatizer()
try:
    stop_words = set(stopwords.words('english'))
    # NLTK loads corpora lazily; lemmatizing one word forces WordNet to load now,
    # so a missing corpus is detected here rather than in the middle of preprocessing
    lemmatizer.lemmatize('pumps')
except LookupError:
    # The corpora are not installed yet (fresh environment): fetch them once and retry
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    Tokens are obtained by splitting on whitespace. A token is dropped when its
    lower-cased form is in the module-level ``stop_words`` set; every kept token
    is passed, with its original casing, to the module-level ``lemmatizer``.

    Returns:
        The processed tokens joined with single spaces.
    """
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# Preprocess the article texts and the keyword string the same way.
# NOTE(review): this overwrites the shared 'Cleaned_Content' column in place; the
# string-based scoring and the final export read that column afterwards, and
# re-running this cell preprocesses already-preprocessed text — confirm this is intended.
df['Cleaned_Content'] = df['Cleaned_Content'].apply(preprocess)
keywords_string = preprocess(keywords_string)

# Step 3: Pretrained sentence-embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# One embedding per article, plus a single embedding for the combined keywords
article_texts = df['Cleaned_Content'].tolist()
article_embeddings = model.encode(article_texts)
keyword_embedding = model.encode([keywords_string])

# Step 4: Cosine similarity of the keyword embedding against every article
# (result has shape (1, num_articles))
cosine_sim = cosine_similarity(keyword_embedding, article_embeddings)

# Step 5: Keep the raw score for each article
df['Cosine_Score_keywordBased'] = cosine_sim[0]

# Step 6: Threshold the score into a label (adjust the threshold as needed).
# Chosen based on the relevance score obtained for a paper related to the term
# 'decarbonization of industrial heat'.
threshold = 0.3
meets_threshold = df['Cosine_Score_keywordBased'] >= threshold
df['Relevance_keywordBased'] = meets_threshold.map({True: "Relevant", False: "Irrelevant"})

# Display the updated DataFrame with Title, Link, Relevance, and Cosine Score
df[['Title', 'Link', 'Relevance_keywordBased', 'Cosine_Score_keywordBased']]


  from tqdm.autonotebook import tqdm, trange





Unnamed: 0,Title,Link,Relevance_keywordBased,Cosine_Score_keywordBased
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Relevant,0.491019
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Relevant,0.33086
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Relevant,0.426253
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Irrelevant,0.239151
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Relevant,0.438817
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Relevant,0.424657
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Irrelevant,0.221096
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Relevant,0.457207
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Relevant,0.474088
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Relevant,0.320083


- The irrelevant articles score well below 0.3, so it is a good threshold.

## String Based + PreTrained

In [20]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained model (e.g., all-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define relevant keywords as a combined string
keywords_string = "heat pump waste heat recovery electrification renewable energy chemicals pharmaceuticals manufacturing"

# Texts to score: one entry per row of the DataFrame (articles and irrelevant examples)
texts = df.loc[:, 'Cleaned_Content'].values.tolist()

# Step 1: Generate embeddings for case studies and keywords
case_study_embeddings = model.encode(texts)  # One embedding per case study
keyword_embedding = model.encode([keywords_string])  # Embedding for the combined keywords

# Step 2: Compute cosine similarity between keyword embedding and case study embeddings
cosine_sim = cosine_similarity(keyword_embedding, case_study_embeddings)  # Shape is (1, num_case_studies)

# Step 3: Threshold for relevance (you can adjust this threshold)
threshold = 0.25

# Step 4: Classify case studies based on cosine similarity.
# The hand-written irrelevant examples are recognised through the raw 'Content'
# column. 'Cleaned_Content' has been rewritten by the stop-word removal and
# lemmatisation of the keyword-based section, so it no longer equals the original
# example strings and comparing it against them never matches.
is_example = df['Content'].isin({irr, irr2, irr3})

classifications = []
for sim_score, example in zip(cosine_sim[0], is_example):
    if example:
        # Known irrelevant reference rows are labelled explicitly, whatever their score
        classifications.append("Irrelevant (Example)")
    else:
        classifications.append("Relevant" if sim_score >= threshold else "Irrelevant")

# Assign whole columns at once instead of writing cell by cell
df['Cosine_Score_StringBased'] = cosine_sim[0]
df['Relevance_Score_StringBased'] = classifications

df[['Title', 'Link', 'Relevance_Score_StringBased', 'Cosine_Score_StringBased']]



Unnamed: 0,Title,Link,Relevance_Score_StringBased,Cosine_Score_StringBased
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Relevant,0.470276
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Relevant,0.579829
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Relevant,0.47006
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Relevant,0.330126
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Relevant,0.54446
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Relevant,0.507458
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Irrelevant,0.223839
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Relevant,0.551258
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Relevant,0.548997
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Relevant,0.30788


- The irrelevant articles score well below 0.25, so it is a good threshold.

## Final Relevance DF

In [21]:
# Show the frame with the score and relevance columns added by the three approaches
df

Unnamed: 0,Title,Link,Content,Cleaned_Content,Cosine_Score_TF-IDF,Relevance_TF-IDF,Cosine_Score_keywordBased,Relevance_keywordBased,Cosine_Score_StringBased,Relevance_Score_StringBased
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Skip to content\nISPT\nInstitute for Sustainab...,Flexible Efficient Electrification Industrial ...,0.157824,Relevant,0.491019,Relevant,0.470276,Relevant
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,Skip to content\nISPT\nInstitute for Sustainab...,FUSE – FUll ScalE Industrial Heat Pump Using N...,0.120838,Relevant,0.33086,Relevant,0.579829,Relevant
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,Skip to content\nISPT\nInstitute for Sustainab...,COMTA – COMpact modular Thermo Acoustic heat p...,0.167908,Relevant,0.426253,Relevant,0.47006,Relevant
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,Skip to content\nISPT\nInstitute for Sustainab...,R-ACES – FRamework Actual Cooperation Energy S...,0.084429,Irrelevant,0.239151,Irrelevant,0.330126,Relevant
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,Skip to content\nISPT\nInstitute for Sustainab...,FLEXSTEAM – Development heat storage industria...,0.135338,Relevant,0.438817,Relevant,0.54446,Relevant
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,Skip to content\nISPT\nInstitute for Sustainab...,SPOT: Sustainable PrOcess heaTing UH-30-08 Jan...,0.162443,Relevant,0.424657,Relevant,0.507458,Relevant
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,Skip to content\nISPT\nInstitute for Sustainab...,StAgglop: Reducing energy use material loss be...,0.027362,Irrelevant,0.221096,Irrelevant,0.223839,Irrelevant
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,Skip to content\nISPT\nInstitute for Sustainab...,"Heat HP-50-01 September 1, 2021 Heat aim make ...",0.157658,Relevant,0.457207,Relevant,0.551258,Relevant
8,COMPRESORP,https://ispt.eu/projects/compresorp/,Skip to content\nISPT\nInstitute for Sustainab...,"COMPRESORP UH-20-10 Completed April 1, 2015 Up...",0.115061,Relevant,0.474088,Relevant,0.548997,Relevant
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,Skip to content\nISPT\nInstitute for Sustainab...,"Steam condensate quality WP-20-11 January 1, 2...",0.017461,Irrelevant,0.320083,Relevant,0.30788,Relevant


### Perform a majority-vote classification
- If the majority of the methods say 'Relevant' -> 'Relevant', else 'Irrelevant'.

In [22]:
# Collect the per-method label columns: all of them are prefixed with 'Relevance'
Relevance_columns = [column for column in df.columns if column.startswith('Relevance')]
Relevance_columns

['Relevance_TF-IDF', 'Relevance_keywordBased', 'Relevance_Score_StringBased']

In [25]:
# Majority vote: a row is 'Relevant' only when strictly more than half of the
# per-method label columns say exactly "Relevant"
relevant_votes = df[Relevance_columns].eq("Relevant").sum(axis=1)
has_majority = relevant_votes > len(Relevance_columns) / 2
df['Final_Relevance_Classification'] = has_majority.map({True: "Relevant", False: "Irrelevant"})

In [26]:
# Keep only the rows voted 'Relevant', with the three columns needed downstream
is_final_relevant = df['Final_Relevance_Classification'] == 'Relevant'
df_relevant = df.loc[is_final_relevant, ['Title', 'Link', 'Cleaned_Content']]
df_relevant

Unnamed: 0,Title,Link,Cleaned_Content
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,Flexible Efficient Electrification Industrial ...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,FUSE – FUll ScalE Industrial Heat Pump Using N...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,COMTA – COMpact modular Thermo Acoustic heat p...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,FLEXSTEAM – Development heat storage industria...
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,SPOT: Sustainable PrOcess heaTing UH-30-08 Jan...
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,"Heat HP-50-01 September 1, 2021 Heat aim make ..."
8,COMPRESORP,https://ispt.eu/projects/compresorp/,"COMPRESORP UH-20-10 Completed April 1, 2015 Up..."
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,"Steam condensate quality WP-20-11 January 1, 2..."
10,ENCORE – nExt geNeration COmpRession hEat pump,https://ispt.eu/projects/encore/,ENCORE – nExt geNeration COmpRession hEat pump...
11,LESSON – Oil free compressor for ammonia based...,https://ispt.eu/projects/lesson/,LESSON – Oil free compressor ammonia based hig...


In [27]:
# Write the relevant case studies (Title, Link, Cleaned_Content) to CSV without the index column
df_relevant.to_csv("Relevant_caseStudies.csv", index=False)


# End Notebook