# Accelerating Cleantech Advancements through NLP-Powered Text Mining and Knowledge Extraction

Group: Marusa Storman, Vignesh Govindaraj, Pradip Ravichandran

## Stage 2: Advanced Embedding Models Training and Analysis

### Data Preparation for Embeddings

In [1]:
import sys
import os

# Get the directory of the current notebook
notebook_dir = os.getcwd()

# Change current working directory to where the notebook resides
os.chdir(notebook_dir)

# List of required libraries
required_libraries = [
    'pandas'
]

# Check if each library is installed, if not, install it
for lib in required_libraries:
    try:
        __import__(lib)
    except ImportError:
        print(f"Installing {lib}...")
        !"{sys.executable}" -m pip install {lib}

In [2]:
#import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Jupyter config: display the value of every top-level expression in a cell,
# not only the last one (IPython's default is 'last_expr'). Cells further down
# rely on this to show several bare expressions, e.g. a `.head()` preview
# followed by a column summary table.
%config InteractiveShell.ast_node_interactivity = 'all'

# Additional setup for seaborn
# sns.set(color_codes=True)
# sns.set_style("whitegrid")

# Download needed NLTK's resources
# nltk.download('punkt')
# nltk.download('stopwords')

In [3]:
# Load the three preprocessed stage-1 datasets (patents, media articles and the
# media evaluation set) in one pass, in that order
google_patent_original, media_original, media_evaluation_original = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/google_patent_en_preprocessed.csv",
        "Data/ct_media_preprocessed.csv",
        "Data/ct_evaluation_preprocessed.csv",
    )
)

In [4]:
# Per-column overview helper used for every dataset below
def analyze_column(df, has_list=False):
    """Summarise every column of ``df`` in a single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    has_list : bool, optional
        Accepted for compatibility; the function body does not use it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` holding its dtype, the number of
        non-missing entries, the number of missing entries and the number of
        distinct non-missing values.
    """
    column_dtypes = df.dtypes
    non_missing = df.count()
    missing = df.isna().sum()
    distinct = df.nunique()

    return pd.DataFrame({
        'Data Type': column_dtypes,
        'Number of Entries': non_missing,
        'Missing/None Count': missing,
        'Uniqueness': distinct
    })

# Patent dataset: preview, per-column summary and count of fully duplicated rows
print("Google Patent Dataset:")
google_patent_original.head()
analyze_column(google_patent_original)
# Count duplicates on the patent frame itself (this cell previously reported
# the media dataset's duplicate count by mistake)
print("\nNumber of duplicate rows:", google_patent_original.duplicated().sum())

Google Patent Dataset:


Unnamed: 0,publication_number,country_code,publication_date,title_localized_text,abstract_localized_text,title_tokens,abstract_tokens,title_token_count,abstract_token_count
0,US-2022239235-A1,US,2022-07-28,adaptable dcac inverter drive system operation,disclosed adaptable dcac inverter system opera...,"['adapt', 'dcac', 'invert', 'drive', 'system',...","['disclos', 'adapt', 'dcac', 'invert', 'system...",7,64
1,US-2022239251-A1,US,2022-07-28,system providing energy single contiguous sola...,accordance example embodiment solar energy sys...,"['system', 'provid', 'energi', 'singl', 'conti...","['accord', 'exampl', 'embodi', 'solar', 'energ...",18,92
2,US-11396827-B2,US,2022-07-26,control method optimizing solartopower efficie...,control method optimizing solartopower efficie...,"['control', 'method', 'optim', 'solartopow', '...","['control', 'method', 'optim', 'solartopow', '...",15,149
3,CN-114772674-A,CN,2022-07-22,lowcarbon running saline wastewater treatment ...,invention discloses system method treating low...,"['lowcarbon', 'run', 'salin', 'wastewat', 'tre...","['invent', 'disclos', 'system', 'method', 'tre...",15,226
4,CN-217026795-U,CN,2022-07-22,water ecological remediation device convenient...,utility model discloses water ecological resto...,"['water', 'ecolog', 'remedi', 'devic', 'conven...","['util', 'model', 'disclos', 'water', 'ecolog'...",7,252


Unnamed: 0,Data Type,Number of Entries,Missing/None Count,Uniqueness
publication_number,object,13412,0,13351
country_code,object,13412,0,29
publication_date,object,13412,0,158
title_localized_text,object,13412,0,12427
abstract_localized_text,object,13412,0,13237
title_tokens,object,13412,0,12424
abstract_tokens,object,13412,0,13235
title_token_count,int64,13412,0,30
abstract_token_count,int64,13412,0,282



Number of duplicate rows: 0


In [5]:
# Media dataset: preview, per-column summary and count of fully duplicated rows
print("Media Dataset:")
media_original.head(5)
analyze_column(media_original)
media_duplicate_count = media_original.duplicated().sum()
print("\nNumber of duplicate rows:", media_duplicate_count)

Media Dataset:


Unnamed: 0,title,date,content,domain,title_tokens,content_tokens,title_token_count,content_token_count
0,qatar slash emissions lng expansion advances,2021-01-13,qatar petroleum qp targeting aggressive cuts g...,energyintel,"['qatar', 'slash', 'emiss', 'lng', 'expans', '...","['qatar', 'petroleum', 'qp', 'target', 'aggres...",8,442
1,india launches first 700 mw phwr,2021-01-15,nuclear power corp india ltd npcil synchronize...,energyintel,"['india', 'launch', 'first', '700', 'mw', 'phwr']","['nuclear', 'power', 'corp', 'india', 'ltd', '...",7,538
2,new chapter uschina energy trade,2021-01-20,new president joe biden took office week uschi...,energyintel,"['new', 'chapter', 'uschina', 'energi', 'trade']","['new', 'presid', 'joe', 'biden', 'took', 'off...",6,706
3,japan slow restarts cast doubt 2030 energy plan,2021-01-22,slow pace japanese reactor restarts continues ...,energyintel,"['japan', 'slow', 'restart', 'cast', 'doubt', ...","['slow', 'pace', 'japanes', 'reactor', 'restar...",9,687
4,nyc pension funds divest fossil fuel shares,2021-01-25,two new york citys largest pension funds say d...,energyintel,"['nyc', 'pension', 'fund', 'divest', 'fossil',...","['two', 'new', 'york', 'citi', 'largest', 'pen...",8,394


Unnamed: 0,Data Type,Number of Entries,Missing/None Count,Uniqueness
title,object,9593,0,9564
date,object,9593,0,967
content,object,9593,0,9587
domain,object,9593,0,19
title_tokens,object,9593,0,9563
content_tokens,object,9593,0,9587
title_token_count,int64,9593,0,25
content_token_count,int64,9593,0,1782



Number of duplicate rows: 0


In [6]:
# Media evaluation dataset: preview, per-column summary and count of fully
# duplicated rows
print("Media Evaluation Dataset:")
media_evaluation_original.head(5)
analyze_column(media_evaluation_original)
media_evaluation_duplicate_count = media_evaluation_original.duplicated().sum()
print("\nNumber of duplicate rows:", media_evaluation_duplicate_count)

Media Evaluation Dataset:


Unnamed: 0,example_id,question_id,question,relevant_chunk,domain,question_tokens,relevant_chunk_tokens,question_token_count,relevant_chunk_token_count
0,1,1,innovation behind leclanches new method produc...,leclanche said developed environmentally frien...,sgvoice.net,"['innov', 'behind', 'leclanch', 'new', 'method...","['leclanch', 'said', 'develop', 'environment',...",12,36
1,2,2,eus green deal industrial plan,green deal industrial plan bid eu make net zer...,sgvoice.net,"['eu', 'green', 'deal', 'industri', 'plan']","['green', 'deal', 'industri', 'plan', 'bid', '...",8,47
2,3,2,eus green deal industrial plan,european counterpart inflation reduction act i...,pv-magazine.com,"['eu', 'green', 'deal', 'industri', 'plan']","['european', 'counterpart', 'inflat', 'reduct'...",8,35
3,4,3,four focus areas eus green deal industrial plan,new plan fundamentally focused four areas pill...,sgvoice.net,"['four', 'focu', 'area', 'eu', 'green', 'deal'...","['new', 'plan', 'fundament', 'focus', 'four', ...",13,42
4,5,4,cooperation gm honda fuel cell vehicles start,caught eye new hookup gm honda honda hammering...,cleantechnica.com,"['cooper', 'gm', 'honda', 'fuel', 'cell', 'veh...","['caught', 'eye', 'new', 'hookup', 'gm', 'hond...",13,60


Unnamed: 0,Data Type,Number of Entries,Missing/None Count,Uniqueness
example_id,int64,23,0,23
question_id,int64,23,0,21
question,object,23,0,21
relevant_chunk,object,23,0,23
domain,object,23,0,6
question_tokens,object,23,0,21
relevant_chunk_tokens,object,23,0,23
question_token_count,int64,23,0,12
relevant_chunk_token_count,int64,23,0,18



Number of duplicate rows: 0


In [7]:
# Find the country codes that occur on exactly one patent row
class_counts = google_patent_original['country_code'].value_counts()
single_instances = [code for code, occurrences in class_counts.items() if occurrences == 1]

# Relabel those rows as 'OT' (OT = Other) so the one-off codes share one class
single_instance_rows = google_patent_original['country_code'].isin(single_instances)
google_patent_original.loc[single_instance_rows, 'country_code'] = 'OT'

In [8]:
# Hold out 20% of the patents for validation; stratifying on country_code keeps
# each country's share the same in the training and validation sets
patent_train, patent_val = train_test_split(
    google_patent_original,
    test_size=0.2,
    random_state=42,
    stratify=google_patent_original['country_code'],
)

# Same 80/20 split for the media articles, stratified on the source domain
media_train, media_val = train_test_split(
    media_original,
    test_size=0.2,
    random_state=42,
    stratify=media_original['domain'],
)


### Word Embedding Training

### Sentence Embedding Training

### Embedding Model Evaluation