# Model Application

Our model is now *fine-tuned* and ready to use, but transformers (*word embeddings*) have a drawback we have not mentioned before: **Transformer** performance decreases as the length of the input text grows. This is not critical, but we will try to mitigate it by **reducing the number of words** in the texts (*removing stopwords*) and by applying the model not once per text but once per **subsentence**, defined as the words between '**.**' and '**,**' characters. With this we aim to **reduce the vagueness** of the model for long texts.

## Environment

### Libraries

In [1]:
# Base
import warnings
import numpy as np
import pandas as pd
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)

# Visualization
from IPython.display import Markdown
import ipywidgets as widgets
from plotly.graph_objs import FigureWidget
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm

# Sentence Transformers framework (HuggingFace)
from sentence_transformers import SentenceTransformer

# Cosine similarity
from sklearn.metrics.pairwise  import cosine_similarity

# NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stop_words
sp = spacy.load('en_core_web_sm')

### Load Models & Indicators

In [2]:
# Sentence encoder plus the indicator embeddings stored as CSV (indexed by indicator).
roberta_stsb = SentenceTransformer('stsb-roberta-large')
encoded_indicators = pd.read_csv('Indicators/encoded_indicators.csv').set_index('indicator')
# Cast the column labels to integers.
encoded_indicators.columns = list(map(int, encoded_indicators.columns))

# Pickled artefacts: `pca` (presumably an already fitted PCA, judging by the
# file name) and the commitment texts.
with open('Files/fitted_pca.pickle', 'rb') as pca_file:
    pca = pickle.load(pca_file)

with open('Files/commitment_texts.pickle', 'rb') as texts_file:
    commitment_texts = pickle.load(texts_file)

### Functions

In [3]:
def remove_stopwords(text: str):

    """
    Drops every token that the spaCy pipeline flags as a stopword
    and returns the remaining tokens joined by single spaces
    """

    kept_tokens = []
    for token in sp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)

    return ' '.join(kept_tokens).strip()


def base_encoding(sentences: list):

    """
    Encodes a list of sentences (stopwords removed) and projects
    them with the loaded PCA so they share the dimensions of the
    encoded indicators. Rows are indexed by the original sentences
    """

    cleaned = pd.Series(sentences).apply(remove_stopwords)
    embeddings = roberta_stsb.encode(cleaned)
    reduced = pca.transform(embeddings)
    return pd.DataFrame(reduced, index=sentences)


def match_sentences_indicators(sentences: list):

    """
    Stacks the encoded indicators on top of the encoded sentences
    and returns the square cosine-similarity matrix between all of
    them, labelled on both axes by indicator / sentence
    """

    stacked = pd.concat([encoded_indicators, base_encoding(sentences)])
    labels = stacked.index
    similarity = cosine_similarity(stacked)
    return pd.DataFrame(similarity, index=labels, columns=labels)


def sentence_top_matches(matched_sentences: list,
                         sentences_matrix: pd.DataFrame = None,
                         valid_sentences: list = None,
                         n_top=5):

    """
    Returns, for each given sentence, its n_top most similar
    candidates and the corresponding similarity values

    matched_sentences: sentences (row labels of the matrix) to rank
    sentences_matrix:  precomputed similarity matrix; when None it is
                       built with match_sentences_indicators
    valid_sentences:   column labels allowed as matches; when None the
                       index of encoded_indicators is used
    n_top:             number of matches kept per sentence

    The result is indexed by sentence ("level_1") with columns
    top_match_0..k and similarity_0..k
    """

    # A DataFrame has no unambiguous truth value, so test against None explicitly.
    if sentences_matrix is None:
        sentences_matrix = match_sentences_indicators(matched_sentences)

    # Resolve the default lazily instead of freezing a list at definition time.
    if valid_sentences is None:
        valid_sentences = list(encoded_indicators.index)

    # Rows = requested sentences, columns = candidate matches.
    candidates = sentences_matrix.loc[matched_sentences, valid_sentences].drop_duplicates()

    # Long format: (candidate, sentence) -> similarity, best first.
    ranked = candidates.unstack().sort_values(ascending=False)

    # Discard (near-)perfect similarities (presumably a sentence matched
    # with itself) and keep the n_top best per sentence.
    top_ranked = ranked[ranked < 0.999].groupby(level=1).head(n_top).reset_index()
    final_matches = top_ranked.groupby("level_1").agg(list)

    # Spread the per-sentence lists into one column per rank.
    indexes = pd.DataFrame(final_matches["level_0"].to_list())
    indexes.columns = ['top_match_' + str(c) for c in indexes.columns]
    indexes.index = final_matches.index

    similarities = pd.DataFrame(final_matches[0].to_list())
    similarities.columns = ['similarity_' + str(c) for c in similarities.columns]
    similarities.index = final_matches.index

    return pd.concat([final_matches.drop(columns=["level_0", 0]), pd.concat([indexes, similarities], axis=1)], axis=1)


def remove_subtext(text_list: list, minimum_length: int):

    """
    Strips every string in text_list and keeps only those that
    contain at least minimum_length words

    Words are counted by splitting on any run of whitespace, so
    repeated spaces do not inflate the count and an empty string
    counts as zero words
    """

    stripped_texts = (text.strip() for text in text_list)
    return [text for text in stripped_texts if len(text.split()) >= minimum_length]


def process_sentences(text_list: list, minimum_length: int=3):

    """
    Splits every string of the given list at each dot or comma
    and passes the resulting fragments through remove_subtext
    with the given minimum_length. Returns a Series holding
    one list of fragments per original string
    """

    def keep_long_fragments(fragments):
        return remove_subtext(fragments, minimum_length)

    # Turn commas into dots so a single split handles both separators.
    unified = pd.Series(text_list).str.replace(",", ".")
    fragments_per_text = unified.str.split(".")
    return fragments_per_text.apply(keep_long_fragments)


def subsentences_top_matches(text_list: list,
                             minimum_length: int=3,
                             n_top_calc:int=10):
    
    """
    Splits every text into subsentences with process_sentences,
    keeps the original text next to each subsentence and joins
    the output of sentence_top_matches for those subsentences
    
    text_list:      texts to process
    minimum_length: forwarded to process_sentences
    n_top_calc:     forwarded to sentence_top_matches as n_top
    """
    
    # One row per original text, positional index 0..n-1.
    text_dataframe = pd.DataFrame(pd.Series(text_list, name='text'))
    # One row per subsentence: the subsentence becomes the index and the
    # position of its source text is kept in the 'index' column.
    exploded_df = pd.DataFrame(process_sentences(text_list, minimum_length).explode()).reset_index().set_index(0)
    exploded_df.index.names = ['sentence']
    
    # Attach the source text to every subsentence.
    # NOTE(review): the following lines assume the merge result keeps the
    # subsentence index of exploded_df -- confirm against the pandas version in use.
    final_df = text_dataframe.merge(exploded_df, left_index=True, right_on='index').drop(columns='index')
    # Drop rows without a subsentence label (presumably texts whose fragments
    # were all filtered out).
    final_df = final_df[final_df.index.notnull()]
    
    return final_df.join(sentence_top_matches(list(final_df.index), n_top=n_top_calc))


def soft_voting_classifier(series: pd.Series,
                           n_top:int=3):

    """
    Adds up, per matched label, all the similarities found in the
    given series (entries labelled top_match_* / similarity_* that
    hold lists) and returns the n_top labels with the highest totals
    as a flat dict: top_match_0.., similarity_0..
    """

    # One row per list element; the original labels end up in "index".
    exploded = pd.DataFrame(series).apply(pd.Series.explode).reset_index()

    is_match = exploded["index"].str.contains('top_match')
    is_similarity = exploded["index"].str.contains('similarity')
    labels = exploded[is_match][series.name]
    scores = exploded[is_similarity][series.name]

    # Accumulate the similarity of every label across all its appearances.
    totals = {}
    for label, score in zip(labels, scores):
        totals[label] = totals[label] + score if label in totals else score

    ranking = pd.Series(totals).sort_values(ascending=False)[:n_top].reset_index()
    ranking = ranking.rename(columns={"index": 'top_match', 0: 'similarity'})

    # Flatten (field, rank) pairs into "field_rank" keys.
    flattened = {}
    for key_parts, value in ranking.T.stack().to_dict().items():
        flattened['_'.join(str(part) for part in key_parts)] = value
    return flattened


def robust_sentence_top_matches(text_list: list,
                                minimum_length: int=3,
                                n_top:int=5,
                                n_top_calc:int=10):

    """
    Computes the subsentence matches of every text, groups them
    by original text and lets soft_voting_classifier pick the
    n_top labels, returning one row per text with the same
    columns as sentence_top_matches
    """

    def vote(column):
        return soft_voting_classifier(column, n_top)

    per_subsentence = subsentences_top_matches(text_list, minimum_length, n_top_calc)
    # After the transpose every column is one text holding lists of matches/similarities.
    per_text = per_subsentence.groupby('text').agg(list).T
    return per_text.apply(vote).apply(pd.Series)

## Word Embedding Limitations

As we have mentioned, one of the main limitations of *Contextual Word Embeddings* like **Roberta** (although we are not interested in the embedding of words, but in that of the sentence itself) is that they become weaker the longer the sentences are. There tend to be two main reasons for this:

* Longer sentences dilute the relevance of the words it contains. It can be seen clearly with the following examples:
    * *I eat pizza*
    * *I eat pizza with my family because today is my birthday*
    
    We can easily measure the importance of the words in the first sentence (*subject*: **I**, *action*: **Eat**, *object*: **Pizza**).
    
    But it will not be as easy in the second sentence. In this case the relevant subject is *not so clear*: the ones performing the action are **We**, as they are all eating, but it is due to the **birthday** of one of them, so which is more important? And what about the object? We do eat **pizza**, but it is because of the **birthday**, so which one is more relevant? Would this sentence be closer to "*I eat pizza*" or "*I celebrate my birthday with my family*"?
    
    
* Not only do longer sentences have more **ambiguous components**, they also have **more components**, while the size of the encoded sentence is always the same (around 1000 in this case, but *fine-tuned* to **52**). This means we can reach a point at which we simply run out of dimensions to represent the sentence properly. This can be fixed by increasing the number of dimensions of the model, but then the model becomes dangerously sensitive to **overfitting**; avoiding that is the main reason we *fine-tuned* the model, at the cost of being exposed to this problem of running out of dimensions.
    

### Check Commitments

Even though our model shouldn't be fitted to long sentences we will check how it behaves with a small sample extracted from some commitments.

In [4]:
# Top three matches for five randomly sampled commitments (fixed seed).
display(
    Markdown(
        sentence_top_matches(commitment_texts, n_top=3)
        .sample(n=5, random_state=42)
        .to_markdown()
    )
)

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | top_match_0                                                               | top_match_1                                                                                  | top_match_2                                                                         |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|---------------:|---------------:|---------------:|
| By 2050, the world’s urban population is expected to nearly double, making urbanization one of the twenty-first century’s most transformative trends. Populations, economic activities, social and cultural interactions, as well as environmental and humanitarian impacts, are increasingly concentrated in cities, and this poses massive sustainability challenges in terms of housing, infrastructure, basic services, food security, health, education, decent jobs, safety and natural resources, among others.                                                                                                       | Overcrowding                                                              | Urban sprawl                                                                                 | Percentage of population exposed to noise levels >50 dba                            |       0.540603 |       0.526874 |       0.511104 |
| We will promote planned urban extensions and infill, prioritizing renewal, regeneration and retrofitting of urban areas, as appropriate, including the upgrading of slums and informal settlements, providing high-quality buildings and public spaces, promoting integrated and participatory approaches involving all relevant stakeholders and inhabitants and avoiding spatial and socioeconomic segregation and gentrification, while preserving cultural heritage and preventing and containing urban sprawl.                                                                                                          | design features that allow for social interaction (parking, squares, etc) | Funds for the improvement of the physical urban environment                                  | Effect on area revitalization/development                                           |       0.559296 |       0.546886 |       0.522856 |
| We will include culture as a priority component of urban plans and strategies in the adoption of planning instruments, including master plans, zoning guidelines, building codes, coastal management policies and strategic development policies that safeguard a diverse range of tangible and intangible cultural heritage and landscapes, and will protect them from potential disruptive impacts of urban development.                                                                                                                                                                                                   | Conservation of specific important habitats and cultural sites            | Protecting and enhancing cultural heritage, local identity and assets                        | Health, safety and enivironmental initiatives and innovations in municipality level |       0.614092 |       0.590268 |       0.533319 |
| The New Urban Agenda acknowledges that culture and cultural diversity are sources of enrichment for humankind and provide an important contribution to the sustainable development of cities, human settlements and citizens, empowering them to play an active and unique role in development initiatives. The New Urban Agenda further recognizes that culture should be taken into account in the promotion and implementation of new sustainable consumption and production patterns that contribute to the responsible use of resources and address the adverse impact of climate change.                               | Shift in level of pride in local cultural heritage                        | New identities programmes and spaces. Alternative forms of celebrating contemporary culture. | Innovation and creativity to experience nature and cultural heritage                |       0.595359 |       0.592083 |       0.586385 |
| We commit ourselves to stimulating the supply of a variety of adequate housing options that are safe, affordable and accessible for members of different income groups of society, taking into consideration the socioeconomic and cultural integration of marginalized communities, homeless persons and those in vulnerable situations and preventing segregation. We will take positive measures to improve the living conditions of homeless people, with a view to facilitating their full participation in society, and to prevent and eliminate homelessness, as well as to combat and eliminate its criminalization. | Housing quality (area per capita), informal housing and slum reduction    | Using smart solutions to enhance accessibility to services and amenities                     | Quick response system to municipal Health, safety and enivironmental problems       |       0.630492 |       0.577983 |       0.513161 |

We can see that the top indicator similarity value declines **dramatically** compared with the previous notebook (median of **0.74**), due to the causes we just explained above. Even so, the model is still able to recognise the most important meaning of the sentences overall, even if we find some ambiguous indicators (*Health, safety and enivironmental initiatives and innovations in municipality level*).

### Full vs Batch comparisons

Now we will be comparing one by one the **base model** (*Full*) vs the **soft-voting classifier** (*Batch*).

The **soft-voting classifier** takes every sentence, splits it at every **comma** and **dot**, applies the model to each part separately, and computes a weighted sum of the indicators (*so the similarity no longer has a maximum*). The most repeated (and most relevant) indicators are then selected as the top indicators.

In [5]:
# Five sampled commitments (sample size 5, seed 42) used for the comparison.
trial_list = (
    sentence_top_matches(commitment_texts, n_top=3)
    .sample(n=5, random_state=42)
    .index
    .tolist()
)
# Full-text matches and subsentence ("batch") matches for those commitments.
matched_sentences = sentence_top_matches(trial_list, n_top=3)
robust_matched_sentences = robust_sentence_top_matches(trial_list, n_top=3)

#### Population and Urban challenges

##### Full

In [6]:
# Full-text result, first row.
display(Markdown(matched_sentences.iloc[:1].to_markdown()))

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                | top_match_0   | top_match_1   | top_match_2                                              |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------|:--------------|:---------------------------------------------------------|---------------:|---------------:|---------------:|
| By 2050, the world’s urban population is expected to nearly double, making urbanization one of the twenty-first century’s most transformative trends. Populations, economic activities, social and cultural interactions, as well as environmental and humanitarian impacts, are increasingly concentrated in cities, and this poses massive sustainability challenges in terms of housing, infrastructure, basic services, food security, health, education, decent jobs, safety and natural resources, among others. | Overcrowding  | Urban sprawl  | Percentage of population exposed to noise levels >50 dba |       0.540603 |       0.526874 |       0.511104 |

##### Batches

In [7]:
# Batch (soft-voting) result, first row.
display(Markdown(robust_matched_sentences.iloc[:1].to_markdown()))

| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   | top_match_0   | top_match_1                 | top_match_2       |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------|:----------------------------|:------------------|---------------:|---------------:|---------------:|
| By 2050, the world’s urban population is expected to nearly double, making urbanization one of the twenty-first century’s most transformative trends. Populations, economic activities, social and cultural interactions, as well as environmental and humanitarian impacts, are increasingly concentrated in cities, and this poses massive sustainability challenges in terms of housing, infrastructure, basic services, food security, health, education, decent jobs, safety and natural resources, among others. | Urban sprawl  | Density of urban population | Urbanization Rate |        2.14667 |        2.05877 |        1.92701 |

##### Thoughts

We can see a slight improvement in the batch model, as it points at **density of urban population** (*DUP*) (*overcrowding* is a specific case in which *DUP* is very high) and **Urbanization rate**, while the Full sentence pointed to a *spurious relation* (that is not wrong, just not representative enough) of **noise levels**.

#### Urban Agenda for Cultural Diversity 

##### Full

In [8]:
# Full-text result, second row.
display(Markdown(matched_sentences.iloc[1:2].to_markdown()))

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        | top_match_0                                        | top_match_1                                                                                  | top_match_2                                                          |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------|:---------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|---------------:|---------------:|---------------:|
| The New Urban Agenda acknowledges that culture and cultural diversity are sources of enrichment for humankind and provide an important contribution to the sustainable development of cities, human settlements and citizens, empowering them to play an active and unique role in development initiatives. The New Urban Agenda further recognizes that culture should be taken into account in the promotion and implementation of new sustainable consumption and production patterns that contribute to the responsible use of resources and address the adverse impact of climate change. | Shift in level of pride in local cultural heritage | New identities programmes and spaces. Alternative forms of celebrating contemporary culture. | Innovation and creativity to experience nature and cultural heritage |       0.595359 |       0.592083 |       0.586385 |

##### Batches

In [9]:
# Batch (soft-voting) result, second row.
display(Markdown(robust_matched_sentences.iloc[1:2].to_markdown()))

| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           | top_match_0                                                | top_match_1                                        | top_match_2                 |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------|:---------------------------------------------------|:----------------------------|---------------:|---------------:|---------------:|
| The New Urban Agenda acknowledges that culture and cultural diversity are sources of enrichment for humankind and provide an important contribution to the sustainable development of cities, human settlements and citizens, empowering them to play an active and unique role in development initiatives. The New Urban Agenda further recognizes that culture should be taken into account in the promotion and implementation of new sustainable consumption and production patterns that contribute to the responsible use of resources and address the adverse impact of climate change. | Strong leadership promoting innovation and smart solutions | Shift in level of pride in local cultural heritage | Resident population density |        1.04755 |       0.683924 |       0.683463 |

##### Thoughts

In this case the improvement is not so clear; we could even say the base model performed better. Both of them understood the cultural impact (*Shift in level of pride in local cultural heritage*) but the batch model **gives more importance to the action** and plan than to the **subject itself**, which is not wrong but **may not be as specific**. There is also a curious spurious relation in the batch model: **Resident population density**. It is not the same as the previous one, as it swaps 'Urban' for 'Resident', which makes more sense for this specific text, but, as we said, it is not as specific as the full model.

#### Housing, Marginalized Communities & Inequality

##### Full

In [10]:
# Full-text result, third row.
display(Markdown(matched_sentences.iloc[2:3].to_markdown()))

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | top_match_0                                                            | top_match_1                                                              | top_match_2                                                                   |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------|:-------------------------------------------------------------------------|:------------------------------------------------------------------------------|---------------:|---------------:|---------------:|
| We commit ourselves to stimulating the supply of a variety of adequate housing options that are safe, affordable and accessible for members of different income groups of society, taking into consideration the socioeconomic and cultural integration of marginalized communities, homeless persons and those in vulnerable situations and preventing segregation. We will take positive measures to improve the living conditions of homeless people, with a view to facilitating their full participation in society, and to prevent and eliminate homelessness, as well as to combat and eliminate its criminalization. | Housing quality (area per capita), informal housing and slum reduction | Using smart solutions to enhance accessibility to services and amenities | Quick response system to municipal Health, safety and enivironmental problems |       0.630492 |       0.577984 |       0.513161 |

##### Batches

In [11]:
# Batch (soft-voting) result, third row.
display(Markdown(robust_matched_sentences.iloc[2:3].to_markdown()))

| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         | top_match_0        | top_match_1                     | top_match_2                     |   similarity_0 |   similarity_1 |   similarity_2 |
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-------------------|:--------------------------------|:--------------------------------|---------------:|---------------:|---------------:|
| We commit ourselves to stimulating the supply of a variety of adequate housing options that are safe, affordable and accessible for members of different income groups of society, taking into consideration the socioeconomic and cultural integration of marginalized communities, homeless persons and those in vulnerable situations and preventing segregation. We will take positive measures to improve the living conditions of homeless people, with a view to facilitating their full participation in society, and to prevent and eliminate homelessness, as well as to combat and eliminate its criminalization. | Affordable housing | Accessibility for disadvantaged | Adaptation for social inclusion |        1.64125 |         1.5291 |        1.24735 |

##### Thoughts

This is a clear **win for the batch model**: overall it is far **more specific** than the full one (which does not even mention poverty / inequality, except for its top match, which is indeed pretty good), but the full model still performs very well on the first indicator.

#### Management Policies & Cultural Heritage 

##### Full

In [12]:
display(Markdown(matched_sentences[3:4].to_markdown()))

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                    | top_match_0                                                    | top_match_1                                                           | top_match_2                                                                         |   similarity_0 |   similarity_1 |   similarity_2 |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------|:----------------------------------------------------------------------|:------------------------------------------------------------------------------------|---------------:|---------------:|---------------:|
| We will include culture as a priority component of urban plans and strategies in the adoption of planning instruments, including master plans, zoning guidelines, building codes, coastal management policies and strategic development policies that safeguard a diverse range of tangible and intangible cultural heritage and landscapes, and will protect them from potential disruptive impacts of urban development. | Conservation of specific important habitats and cultural sites | Protecting and enhancing cultural heritage, local identity and assets | Health, safety and enivironmental initiatives and innovations in municipality level |       0.614091 |       0.590268 |       0.533319 |

##### Batches

In [13]:
display(Markdown(robust_matched_sentences[3:4].to_markdown()))

| text                                                                                                                                                                                                                                                                                                                                                                                                                       | top_match_0    | top_match_1          | top_match_2                |   similarity_0 |   similarity_1 |   similarity_2 |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:---------------------|:---------------------------|---------------:|---------------:|---------------:|
| We will include culture as a priority component of urban plans and strategies in the adoption of planning instruments, including master plans, zoning guidelines, building codes, coastal management policies and strategic development policies that safeguard a diverse range of tangible and intangible cultural heritage and landscapes, and will protect them from potential disruptive impacts of urban development. | Clear strategy | Identify the culture | Management and action plan |       0.745057 |       0.738012 |       0.731689 |

##### Thoughts

It is true that this comparison is similar to the one for the second sentence, but in this case the performance of the full model is not so clear. It is true that it has **2 indicators considering culture** and both of them are more specific than the batch ones, but the third indicator is not very related to the overall topic, while *all of the indicators* proposed by the **batch model** are *on point* (not as good as the *first two of the full model*, but **better overall**).

#### Urban Extensions for Regeneration

##### Full

In [14]:
display(Markdown(matched_sentences[4:5].to_markdown()))

| level_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             | top_match_0                                                               | top_match_1                                                 | top_match_2                               |   similarity_0 |   similarity_1 |   similarity_2 |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------|:------------------------------------------------------------|:------------------------------------------|---------------:|---------------:|---------------:|
| We will promote planned urban extensions and infill, prioritizing renewal, regeneration and retrofitting of urban areas, as appropriate, including the upgrading of slums and informal settlements, providing high-quality buildings and public spaces, promoting integrated and participatory approaches involving all relevant stakeholders and inhabitants and avoiding spatial and socioeconomic segregation and gentrification, while preserving cultural heritage and preventing and containing urban sprawl. | design features that allow for social interaction (parking, squares, etc) | Funds for the improvement of the physical urban environment | Effect on area revitalization/development |       0.559296 |       0.546886 |       0.522856 |

##### Batches

In [15]:
display(Markdown(robust_matched_sentences[4:5].to_markdown()))

| text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                | top_match_0                               | top_match_1                                                              | top_match_2                                   |   similarity_0 |   similarity_1 |   similarity_2 |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------|:-------------------------------------------------------------------------|:----------------------------------------------|---------------:|---------------:|---------------:|
| We will promote planned urban extensions and infill, prioritizing renewal, regeneration and retrofitting of urban areas, as appropriate, including the upgrading of slums and informal settlements, providing high-quality buildings and public spaces, promoting integrated and participatory approaches involving all relevant stakeholders and inhabitants and avoiding spatial and socioeconomic segregation and gentrification, while preserving cultural heritage and preventing and containing urban sprawl. | Effect on area revitalization/development | Using smart solutions to enhance accessibility to services and amenities | City participating in Mayors Adapt initiative |        1.63722 |       0.988605 |       0.949309 |

##### Thoughts

This has been by far the worst sentence in both cases, as neither of them is giving any credit to housing and population, but they are not that bad either. In this case it is unclear which of the models performs better.

## Conclusion

The improvement of the **batch model** over the **full model** is not so clear, but it will definitely be worth trying when processing whole papers.

In [16]:
#!jupyter nbconvert "Model Application" --to html_toc --TemplateExporter.exclude_input=True -TagRemovePreprocessor.remove_cell_tags="hide"

## Export Functions and Necessary Components

In [17]:
%%writefile model.py
# Base
import warnings
import numpy as np
import pandas as pd
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)

# Visualization
from IPython.display import Markdown
import ipywidgets as widgets
from plotly.graph_objs import FigureWidget
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm

# Sentence Transformers framework (HuggingFace)
from sentence_transformers import SentenceTransformer

# Cosine similarity
from sklearn.metrics.pairwise  import cosine_similarity

# NLP
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stop_words
sp = spacy.load('en_core_web_sm')

# Sentence-embedding model, loaded by name through sentence_transformers.
roberta_stsb = SentenceTransformer('stsb-roberta-large')
# Indicator table read from CSV and indexed by its 'indicator' column.
encoded_indicators = pd.read_csv('Indicators/encoded_indicators.csv').set_index('indicator')
# Column labels come out of the CSV header as strings; cast them to int
# (presumably so they line up with the integer columns built in base_encoding -- TODO confirm).
encoded_indicators.columns =[int(c) for c in encoded_indicators.columns]

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files here.
# `pca` is the object whose .transform() is applied to the embeddings in base_encoding.
with open('Files/fitted_pca.pickle', "rb") as file:
    pca = pickle.load(file)
    
# `commitment_texts` is loaded for use elsewhere; its structure is not visible in this module.
with open('Files/commitment_texts.pickle', "rb") as file:
    commitment_texts = pickle.load(file)

def remove_stopwords(text: str):

    """
    Drops every token that spaCy flags as a stopword
    and returns the remaining token texts joined by
    single spaces (stripped at both ends)
    """

    kept_tokens = []
    for token in sp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)

    return ' '.join(kept_tokens).strip()


def base_encoding(sentences: list):

    """
    Removes the stopwords of every sentence, encodes the
    result with roberta_stsb and projects it with the fitted
    pca so it matches the dimensions of the encoded indicators.
    The returned DataFrame is indexed by the original sentences
    """

    cleaned_sentences = pd.Series(sentences).apply(remove_stopwords)
    embeddings = roberta_stsb.encode(cleaned_sentences)
    reduced_embeddings = pca.transform(embeddings)
    return pd.DataFrame(reduced_embeddings, index=sentences)


def match_sentences_indicators(sentences: list):

    """
    Stacks the encoded indicators on top of the encoded
    sentences and returns the pairwise cosine similarity of
    all those rows as a square DataFrame labelled with the
    indicators / sentences on both axes
    """

    stacked = pd.concat([encoded_indicators, base_encoding(sentences)])
    labels = stacked.index
    similarity = cosine_similarity(stacked)
    return pd.DataFrame(similarity, index=labels, columns=labels)


def sentence_top_matches(matched_sentences: list,
                         sentences_matrix: pd.DataFrame = None,
                         valid_sentences: list = None,
                         n_top=5):

    """
    Applies the previously defined functions to return
    the top matches of the given list of sentences

    matched_sentences: sentences to rank (row labels of the matrix)
    sentences_matrix:  precomputed similarity matrix; when None it is
                       built with match_sentences_indicators
    valid_sentences:   column labels allowed as matches; when None all
                       the encoded indicators are used
    n_top:             number of matches kept per sentence

    Returns a DataFrame indexed by sentence with the columns
    top_match_0..k and similarity_0..k (k < n_top)
    """

    # `not DataFrame` raises ValueError (ambiguous truth value),
    # so the "not provided" case has to be tested against None
    if sentences_matrix is None:
        sentences_matrix = match_sentences_indicators(matched_sentences)

    # Resolved at call time instead of as a list built once at definition time
    if valid_sentences is None:
        valid_sentences = list(encoded_indicators.index)

    matched_sentences = sentences_matrix.loc[matched_sentences][valid_sentences].drop_duplicates()
    # Long format: (candidate, sentence) -> similarity, best similarities first
    matched_sentences = matched_sentences.unstack().sort_values(ascending=False)
    # Similarities >= 0.999 are discarded before keeping the n_top rows of each sentence
    final_matched_sentences = matched_sentences[matched_sentences < 0.999].groupby(level=1).head(n_top).reset_index()
    final_matches = final_matched_sentences.groupby("level_1").agg(list)

    # Spread the per-sentence lists over numbered columns
    indexes = pd.DataFrame(final_matches["level_0"].to_list())
    indexes.columns = ['top_match_' + str(c) for c in indexes.columns]
    indexes.index = final_matches.index

    similarities = pd.DataFrame(final_matches[0].to_list())
    similarities.columns = ['similarity_' + str(c) for c in similarities.columns]
    similarities.index = final_matches.index

    return pd.concat([final_matches.drop(columns=["level_0", 0]), pd.concat([indexes, similarities], axis=1)], axis=1)


def remove_subtext(text_list: list, minimum_length: int):

    """
    Filters strings with less words than the given number
    (minimum_length) from a given list of strings (text_list).
    Words are delimited by any run of whitespace and the
    strings that are kept are returned stripped
    """

    # split() without arguments collapses consecutive whitespace, so double
    # spaces, tabs or newlines no longer count as extra (empty) words and an
    # empty string counts as zero words instead of one
    return [text.strip() for text in text_list if len(text.split()) >= minimum_length]


def process_sentences(text_list: list, minimum_length: int=3):

    """
    Splits every string of the given list at each dot
    or comma and filters the resulting pieces with the
    remove_subtext function. Returns a Series holding
    one list of pieces per original string
    """

    def keep_long_pieces(pieces):
        return remove_subtext(pieces, minimum_length)

    # Commas are turned into dots so a single split covers both separators
    dotted = pd.Series(text_list).str.replace(",", ".")
    pieces_per_text = dotted.str.split(".")
    return pieces_per_text.apply(keep_long_pieces)


def subsentences_top_matches(text_list: list,
                             minimum_length: int=3,
                             n_top_calc:int=10):

    """
    Splits every text into subsentences (process_sentences)
    and joins each subsentence with its n_top_calc top matches
    (sentence_top_matches). The result is indexed by subsentence
    and keeps the originating text in the 'text' column
    """

    texts = pd.DataFrame(pd.Series(text_list, name='text'))

    # One row per subsentence; the 'index' column keeps the position of its source text
    subsentences = process_sentences(text_list, minimum_length).explode()
    subsentences = pd.DataFrame(subsentences).reset_index().set_index(0)
    subsentences.index.names = ['sentence']

    paired = texts.merge(subsentences, left_index=True, right_on='index')
    paired = paired.drop(columns='index')
    # Discard the rows whose subsentence label is null
    paired = paired[paired.index.notnull()]

    top_matches = sentence_top_matches(list(paired.index), n_top=n_top_calc)
    return paired.join(top_matches)


def soft_voting_classifier(series: pd.Series,
                           n_top:int=3):

    """
    Adds up, for every match found in the given series,
    all of the similarities it obtained and returns the
    n_top matches with the highest totals as a flat dictionary
    with 'top_match_i' / 'similarity_i' keys
    """

    exploded = pd.DataFrame(series).apply(pd.Series.explode).reset_index()
    row_labels = exploded["index"]
    column = series.name

    # Both selections keep the row order, so they can be walked in parallel
    match_values = exploded[row_labels.str.contains('top_match')][column]
    similarity_values = exploded[row_labels.str.contains('similarity')][column]

    score_per_match = {}
    for match, similarity in zip(match_values, similarity_values):
        if match in score_per_match:
            score_per_match[match] += similarity
        else:
            score_per_match[match] = similarity

    ranking = pd.Series(score_per_match).sort_values(ascending=False)[:n_top]
    ranking = ranking.reset_index().rename(columns={"index": 'top_match', 0: 'similarity'})

    # Keys come out as (column name, rank) tuples; flatten them to 'name_rank'
    flat = ranking.T.stack().to_dict()
    return {'_'.join(str(part) for part in key): value for key, value in flat.items()}


def robust_sentence_top_matches(text_list: list,
                                minimum_length: int=3,
                                n_top:int=5,
                                n_top_calc:int=10):

    """
    Computes the top matches of every subsentence, lets the
    soft voting classifier rank them per original text and
    returns the n_top winners of each text in the same
    format as sentence_top_matches
    """

    per_subsentence = subsentences_top_matches(text_list, minimum_length, n_top_calc)
    # One column per text, each cell holding the values of all its subsentences
    per_text = per_subsentence.groupby('text').agg(list).T

    def vote(column):
        return soft_voting_classifier(column, n_top)

    return per_text.apply(vote).apply(pd.Series)

Overwriting model.py
