##### Urban Resilience Metric - Term lookup
This notebook finds terms that correspond to the [City Resilience Index](https://www.cityresilienceindex.org/#/city-profiles). Its main method uses Hugging Face sentence-transformers (https://huggingface.co/docs/hub/sentence-transformers). It extracts unigrams and bigrams from the five provided PDFs and searches for the top 100 terms. From those results, we compile the list of words that will be compared against cities to produce an urban resilience metric.

In [1]:
!pip install sentence-transformers
!pip install torch torchvision
!pip install nltk
!pip install pdfplumber













In [2]:
import nltk
import numpy as np
import pdfplumber
import re
import string
from sentence_transformers import SentenceTransformer, util

In [3]:
# Download necessary data for nltk.
# "punkt_tab" is the tokenizer resource that recent NLTK releases (3.9+) load
# inside nltk.word_tokenize; "punkt" is kept so older releases keep working.
nltk.download(["names", "stopwords", "punkt", "punkt_tab"])


[nltk_data] Downloading package names to
[nltk_data]     C:\Users\lifet\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lifet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lifet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import torch

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the sentence-embedding model and place it on the selected device.
model = SentenceTransformer('all-mpnet-base-v2')
model = model.to(device)


In [5]:
# Extract text from the PDF reports
import pdfplumber

# Raw string literal: the Windows path contains backslash sequences
# ("\G", "\E", "\C", "\R") that are invalid escapes in a normal string and
# trigger a warning on newer Python versions. The resulting path is unchanged.
REPORT_DIR = r"D:\Graduate School\Experience Expo\Code\Report PDF"

# Upload required pdfs and add their file names to this list
REPORT_NAMES = [
    "Cali-Resilience-Strategy-English.pdf",
    "Cape-Town-Resilience-Strategy-English.pdf",
    "Pune-Resilience-Strategy-English.pdf",
    "Seattle-Resilience-Strategy-English.pdf",
    "Surat-Resilience-Strategy-English.pdf",
]
files = [f"{REPORT_DIR}/{name}" for name in REPORT_NAMES]

# Collect the text of every page, then join once at the end; repeatedly
# appending to one ever-growing string copies it on each page.
page_texts = []
for filepath in files:
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            # A page without extractable text (e.g. a scanned image) can give
            # a falsy result -- reportedly None in some pdfplumber versions
            # (TODO confirm for the installed version). Fall back to "" so the
            # concatenation cannot raise TypeError.
            page_texts.append(" " + (page.extract_text() or ""))

# Aggregate the text of all PDFs; each page is preceded by a single space.
text = "".join(page_texts)

# Convert the text to lowercase for consistency
text = text.lower()

In [10]:
# Function to clean the text
def clean_text(text):
    """Normalize raw text for tokenization.

    Steps, in order:
      1. remove every run of digits;
      2. drop all characters in ``string.punctuation`` and lowercase the rest
         (note that hyphenated words are fused: "co-ordination" -> "coordination");
      3. collapse each run of whitespace into one space and strip both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string (empty if nothing but digits,
        punctuation and whitespace was present).
    """
    # Remove numbers
    text_nonum = re.sub(r'\d+', '', text)
    # Remove punctuation and convert to lowercase; a set gives constant-time
    # membership checks instead of scanning the punctuation string per character.
    punctuation = set(string.punctuation)
    text_nopunct = "".join(char.lower() for char in text_nonum
                           if char not in punctuation)
    # Remove any multiple spaces. The pattern is a raw string so that "\s" is
    # passed to the regex engine instead of being an invalid string escape.
    text_no_doublespace = re.sub(r'\s+', ' ', text_nopunct).strip()
    return text_no_doublespace

# Clean the aggregated text: `text` is rebound to the output of clean_text,
# so the cells that follow operate on the cleaned string rather than on the
# raw PDF extraction.
text = clean_text(text)

In [13]:
# Indicator phrases that candidate terms will be compared against
indicator_list = ["Effective co-ordination","government authorities", "government co-ordination"]

# Encode each indicator on its own and keep the result as a torch tensor on `device`
indicator_embeddings = []
for indicator in indicator_list:
    indicator_vector = model.encode(indicator)
    indicator_embeddings.append(torch.tensor(indicator_vector).to(device))

In [15]:
# Tokenize the text and generate the candidate unigrams (individual words) and bigrams (pairs of words)
raw_words: list[str] = nltk.word_tokenize(text)
stopwords = nltk.corpus.stopwords.words("english")
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per token

# Unigram candidates: alphabetic tokens, lowercased, not stopwords, longer than 3 characters
words = [
    w
    for w in (token.lower() for token in raw_words if token.isalpha())
    if w not in stopword_set and len(w) > 3
]

text_new = nltk.Text(words)
finder_unigram = text_new.vocab()
# NOTE(review): bigrams are built from the unfiltered tokens, so they can
# contain stopwords and short words -- confirm this is intended.
finder_bigram = nltk.collocations.BigramCollocationFinder.from_words(raw_words)

TOP_N = 100               # number of terms to print
SCORING_CHUNK_SIZE = 4096  # terms embedded and scored per step (bounds memory use)

# Every distinct unigram followed by every distinct bigram (joined with a space)
candidate_terms = list(finder_unigram.keys())
candidate_terms.extend(" ".join(pair) for pair in finder_bigram.ngram_fd.keys())

# List of (score, term) tuples. Each term appears exactly once, scored by its
# best dot product over all indicators, so the top-N listing holds N distinct
# terms instead of up to one copy per indicator.
scored_words = []

if len(indicator_embeddings) > 0:
    # One row per indicator
    indicator_matrix = torch.stack(list(indicator_embeddings)).to(device)

    for start in range(0, len(candidate_terms), SCORING_CHUNK_SIZE):
        chunk = candidate_terms[start:start + SCORING_CHUNK_SIZE]
        # Encode the whole chunk in batches; calling model.encode once per term
        # runs a separate forward pass for every single word/phrase.
        chunk_embeddings = model.encode(chunk, convert_to_tensor=True).to(device)
        # Dot product of every term with every indicator: rows are terms, columns are indicators
        similarity = chunk_embeddings @ indicator_matrix.T
        best_scores = similarity.max(dim=1).values
        scored_words.extend(zip(best_scores.tolist(), chunk))

# Sort the scored words/phrases in descending order of relevance
scored_words.sort(reverse=True)

# Print the top TOP_N most relevant terms
for score, word in scored_words[:TOP_N]:
    print(word, score)
