# Sentence Similarity

**Author:** [Giuseppe Tripodi](https://www.linkedin.com/in/giuseppe-tripodi-unical/)<br>
**Date created:** 2022/11/12<br>
**Description:** Sentence similarity between electoral programs

# Setup

## Install packages

In [None]:
# Install the third-party packages used by this notebook.
!pip install datasets transformers
!pip install sentencepiece
!pip install sacremoses
# NOTE(review): transformers is already installed by the first command of this cell; this line looks redundant.
!pip install transformers
!pip install evaluate
!pip install wandb
!pip install -U sentence-transformers

## Import Libraries

In [None]:
import json
import os
import csv
import re
import wandb
import transformers
from transformers import AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer, \
    EarlyStoppingCallback
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from sklearn import preprocessing
import numpy as np
import evaluate
from transformers.integrations import TensorBoardCallback
import transformers
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
#load dataset
from datasets import load_dataset, load_metric
#tockenizer
from transformers import AutoTokenizer
from transformers import Pipeline, TextClassificationPipeline
import numpy as np
from datasets import load_dataset, load_metric
import pandas as pd
import torch
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from os import listdir
from os.path import isfile, join
from datetime import date
import sklearn
from sklearn.manifold import TSNE
from sklearn import preprocessing
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
import nltk
import re

## Set up Weights & Biases and Variables

In [None]:
# Date stamp in the "%b-%d-%Y" format, appended to the file names of the saved figures.
today = date.today().strftime("%b-%d-%Y")

In [None]:
# Set the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably this switches off the Weights & Biases logging, yet a later cell still calls
# wandb.login() - confirm which of the two is intended.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Set the Weights & Biases environment variables through the IPython %env magic, then log in.
# The values are blank in this copy of the notebook: fill them in before running the cell
# (presumably the API key must not be committed together with the notebook - confirm).
%env WANDB_PROJECT=
%env WANDB_LOG_MODEL=
%env WANDB_API_KEY=

wandb.login()

## Support Functions

In [None]:
def join_csv(input_dir, output_name):
    """
    Merges every file found in a directory into a single dataframe and saves it as a csv file.

    Each file is read with pd.read_csv (first line used as header) and its rows are labelled with
    the file name up to the first ".", which becomes the index of the merged dataframe.

    Parameters
    -------------
    :param input_dir: str
        directory whose regular files are read (sub-directories are ignored)
    :param output_name: str
        path of the csv file where the merged dataframe is written
    :return: pd.DataFrame
        the merged dataframe, indexed by the name of the file each row comes from
    :raises ValueError: if input_dir does not contain any file
    """
    # sort the names so that the order of the rows does not depend on the order returned by the file system
    files = sorted(f for f in listdir(input_dir) if isfile(join(input_dir, f)))
    if not files:
        raise ValueError(f"No files to join in {input_dir!r}")

    frames = []
    indexes = []
    for file in files:
        df = pd.read_csv(join(input_dir, file), index_col=None, header=0)
        # partition() keeps the whole name when the file has no extension, where str.index() would raise
        stem = file.partition(".")[0]
        frames.append(df)
        # one label per row, so that a file with more (or less) than one row does not break the index
        indexes.extend([stem] * len(df))

    frame = pd.concat(frames, axis=0, ignore_index=True)
    frame["indexes"] = indexes
    frame.set_index("indexes", inplace=True)

    # save the output as a csv file
    frame.to_csv(output_name)

    return frame

### Sentence Similarity

In [None]:
class SentenceSimilarity:
    """
    A class used to compute the similarity between two groups of sentences
    """

    def __init__(self, sentence_transformer: str):
        """
        Parameters
        -------------
        :param sentence_transformer: str
            sentence transformer used to compute the similarity
        """
        # load the sentence transformer once, so that every call to similarity() reuses it
        self.model = SentenceTransformer(sentence_transformer)

    def similarity(self, sentences1: list, sentences2: list):
        """
        Encodes the two groups of sentences and returns their cosine similarity

        Parameters
        -------------
        :param sentences1: list
            first group of sentences
        :param sentences2: list
            second group of sentences
        :return:
            the output of util.pytorch_cos_sim computed on the embeddings of the two groups
            (presumably a matrix whose cell [i][j] compares sentences1[i] with sentences2[j] -
            confirm against the sentence-transformers documentation)
        """
        # compute embedding for both texts
        embedding_text1 = self._encode(sentences1)
        embedding_text2 = self._encode(sentences2)

        # compute the similarity
        return util.pytorch_cos_sim(embedding_text1, embedding_text2)

    def _encode(self, sentences: list):
        """
        Returns the embeddings of the sentences as a tensor, without showing the progress bar
        """
        return self.model.encode(sentences, convert_to_tensor=True, show_progress_bar=False)


In [None]:
def programs_similarity(df_programs: pd.DataFrame, ss: "SentenceSimilarity", program1: str, program2: str,
                        arguments: list) -> float:
    """
    Computes the similarity between two programs as the mean of the similarities of their matching arguments.

    Parameters
    -------------
    :param df_programs: pd.DataFrame
        dataframe indexed by program, with one column per argument
    :param ss: SentenceSimilarity
        object used to compute the similarity between the sentences
    :param program1: str
        index label of the first program
    :param program2: str
        index label of the second program
    :param arguments: list
        columns of df_programs taken into account
    :return: float
        1.0 when the two labels are the same, otherwise the mean over the arguments of the similarity
        between the cell of program1 and the cell of program2 for the same argument
    """
    if program1 == program2:
        # a program is identical to itself: skip the encoding and return a float as declared
        return 1.0
    sentence_program1 = df_programs.loc[program1][arguments].values
    sentence_program2 = df_programs.loc[program2][arguments].values
    cosine_scores = ss.similarity(sentence_program1, sentence_program2)
    # only the diagonal is used: cell [i][i] compares the same argument in the two programs
    total = sum(cosine_scores[i][i].item() for i in range(len(sentence_program1)))
    return total / len(arguments)


def similarity_matrix(df_programs: pd.DataFrame, arguments: [], model='all-MiniLM-L6-v2') -> pd.DataFrame:
    """
    Builds the program-by-program similarity table and returns it as a dataframe.

    Every program in the index of df_programs is compared with every program (itself included),
    taking into account only the columns listed in arguments.

    e.g. with arguments = ["Lavoro", "Diritti"] every pair of programs is compared on those two arguments only
    and the two scores are averaged.
    """
    scorer = SentenceSimilarity(model)
    programs = df_programs.index
    # one row per program, one cell per program it is compared with
    rows = [
        [programs_similarity(df_programs, scorer, first, second, arguments) for second in programs]
        for first in programs
    ]
    return pd.DataFrame(rows, index=programs, columns=programs)



In [None]:
def most_common_words(text: str, top_n: int) -> list:
    """
    Returns the top_n most frequent words of an Italian text, ignoring the stop words.

    The text is reduced to letters only (digits, punctuation and bracketed references such as "[12]"
    are dropped), split into words with nltk, and the words are ranked by number of occurrences.

    adapted from: https://stackabuse.com/text-summarization-with-nltk-in-python/

    Parameters
    -------------
    :param text: str
        text to analyse
    :param top_n: int
        maximum number of words to return
    :return: list
        the words ordered from the most to the least frequent;
        it is empty when no word is left after the cleaning
    """
    # remove the bracketed numeric references, e.g. "[12]"
    text = re.sub(r'\[[0-9]*\]', ' ', text)

    # keep the letters only: \W, \d and _ are Unicode aware, so the accented letters (à, è, ì, ò, ù)
    # stay inside their word, while a plain [^a-zA-Z] class would cut them out
    letters_only = re.sub(r'[\W\d_]+', ' ', text)

    # a set gives a constant-time membership test
    italian_stopwords = set(nltk.corpus.stopwords.words('italian'))

    word_frequencies = {}
    for word in nltk.word_tokenize(letters_only):
        # the nltk stop words are lowercase: compare case-insensitively so that "Per" is skipped like "per"
        if word.lower() not in italian_stopwords:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # nlargest returns an empty list when there are no words and keeps the first-seen order on ties
    return heapq.nlargest(top_n, word_frequencies, key=word_frequencies.get)

In [None]:
# Quick check of most_common_words on a sample Italian paragraph: the cell shows its 10 most frequent words.
text = "Piano nazionale per dotare tutti gli edifici pubblici di impianti fotovoltaici e alleanza con mondo agricolo per installazione impianti fotovoltaici ed eolici. Sostegno e promozione dell’economia circolare sui rifiuti quale modello per riutilizzare e riciclare materiali e prodotti trasformando i rifiuti indifferenziati in energia e i differenziati in materia prima. Semplificazione e incentivi strutturali per il potenziamento e l'estensione di tutti gli impianti rinnovabili nazionali. Investimenti per supportare la realizzazione di impianti per le energie rinnovabili (energia eolica, solare, idroelettrica e pelagica, geotermica e bioenergia). Sostegno e incentivi all’innovazione digitale per la tracciabilità dei rifiuti attraverso l’utilizzo dei nuovi sistemi di Blockchain. Potenziamento della semplificazione, di incentivi strutturali e crediti di imposta per le imprese che riconvertono e investono in eco innovazione e nuove tecnologie. SÌ ad una transizione ecologica e energetica giusta, basata su uno sviluppo sostenibile che tuteli l’ambiente attraverso il sostegno alla ricerca e all’innovazione tecnologica. Sì ai termovalorizzatori e agli impianti a biomassa per il recupero totale dei rifiuti indifferenziati e degli scarti agricoli e forestali a fini energetici. Promozione di una gestione produttiva e sostenibile del patrimonio forestale e arboreo urbano, incrementandolo con la piantumazione, rispetto a quanto già previsto, di 1 milione ulteriore di alberi nuovi. Semplificazione per l’installazione di impianti fotovoltaici sugli edifici privati."
most_common_words(text, 10)

In [None]:
def return_most_common_words_by_arguments(df: pd.DataFrame, arguments: list, top_n):
    """
    Computes, for every argument, the most common words used by each program and saves them as "<argument>.csv".

    Parameters
    -------------
    :param df: pd.DataFrame
        dataframe indexed by program, with one column of text per argument
    :param arguments: list
        columns of df to analyse
    :param top_n: int
        number of words kept for every program
    :return: pd.DataFrame
        dataframe indexed by "pol" with a "words" column, relative to the last argument analysed
        (empty if arguments is empty)
    """
    # returned as it is when there are no arguments, instead of failing on an undefined variable
    dataframe = pd.DataFrame(columns=["pol", "words"]).set_index("pol")
    for argument in arguments:
        print(f"Argument: {argument}")
        # the rows are rebuilt for every argument, so that a csv file contains only the words of its own argument
        rows = [{"pol": pol, "words": most_common_words(df.loc[pol][argument], top_n)} for pol in df.index]
        dataframe = pd.DataFrame(rows, columns=["pol", "words"]).set_index("pol")
        dataframe.to_csv(f"{argument}.csv")
    return dataframe
            

## Plot Functions

In [None]:
def plot_similarity_matrix(similarity_matrix, title:str):
    """
    Draws the similarity matrix with ConfusionMatrixDisplay (red colour map)
    and saves the figure as "<title>_<today>.png"
    """
    display = ConfusionMatrixDisplay(
        confusion_matrix=similarity_matrix.to_numpy(),
        display_labels=similarity_matrix.index,
    )
    display.plot(cmap=plt.cm.Reds)

    axes = display.ax_
    figure = display.figure_

    # title and size of the tick labels
    axes.set_title(title, fontsize=17)
    for axis_name in ('x', 'y'):
        axes.tick_params(axis=axis_name, which='major', labelsize=13)

    # size of the figure
    figure.set_figwidth(13)
    figure.set_figheight(10)

    plt.xticks(rotation=30)
    plt.savefig(f"{title}_{today}.png")


In [None]:
def plot_similarity_matrix_by_argument(df_programs:pd.DataFrame, arguments: [], model:str):
    """
    Draws one similarity matrix per argument, each one computed on that argument alone
    """
    for argument in arguments:
        plot_similarity_matrix(
            similarity_matrix(df_programs, [argument], model),
            f"Similarità categoria: {argument}",
        )

In [None]:
def plot_scatter_plot(df_programs:pd.DataFrame, model:str):
    """
    Plots a 2D scatter plot of the embeddings of every program, argument by argument and as a whole.

    Every cell of df_programs, plus the full program obtained by joining all the columns of a row with "_",
    is embedded with the sentence transformer and projected in two dimensions with t-SNE.
    The colour identifies the program (row), the marker identifies the argument (column) and
    the full program is drawn with a bigger marker.
    The figure is saved as "Embedding Scatter Plot_<today>.png" and then shown.

    Parameters
    -------------
    :param df_programs: pd.DataFrame
        dataframe indexed by program, with one column of text per argument; it is not modified
    :param model: str
        sentence transformer used to compute the embeddings
    """
    encoder = SentenceTransformer(model)
    tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3, random_state = 49)
    title = "Embedding Scatter Plot"

    # work on a copy: the extra "program" column must not leak into the dataframe of the caller
    df_programs = df_programs.copy()
    df_programs["program"] = df_programs[df_programs.columns].apply("_".join, axis=1)
    n_programs = len(df_programs.index)

    # create the embedding of every cell, column by column ("program" is the last column)
    vectors = []
    programs = []
    categories = []
    for column in df_programs.columns:
        for index in df_programs.index:
            text = df_programs.loc[index][column]
            vectors.append(encoder.encode(text, convert_to_tensor=False, show_progress_bar=False))
            programs.append(index)
            categories.append(column)

    # project the embeddings in two dimensions and keep track of program and argument of every point
    embedding = tsne.fit_transform(pd.DataFrame(vectors))
    embedding = pd.DataFrame(embedding, columns=["X", "Y"])
    embedding["labels"] = programs
    embedding["category"] = categories

    # one marker per argument; the list is cycled when there are more columns than markers
    marker_cycle = ["o","v", "1", ">","s","+","x","D","X","3", "H", "D"]
    map_markers = {column: marker_cycle[position % len(marker_cycle)]
                   for position, column in enumerate(df_programs.columns)}

    # one colour per program; the list is cycled when there are more programs than colours
    color_cycle = ['tab:blue','tab:orange','tab:green','tab:red','tab:purple','tab:brown']
    map_colors = {program: color_cycle[position % len(color_cycle)]
                  for position, program in enumerate(df_programs.index)}

    x = embedding["X"]
    y = embedding["Y"]
    colors = embedding["labels"].map(map_colors).values
    markers = embedding["category"].map(map_markers).values
    # the full program is drawn ten times bigger than the single arguments
    area = [3000 if category == "program" else 300 for category in embedding["category"]]

    plt.figure(figsize=(15,10))
    handles = []
    for i in range(len(embedding.index)):
        handles.append(plt.scatter(x[i], y[i], s=area[i], c=colors[i], marker=markers[i], alpha=0.5))

    # legend of the arguments: the points are stored column by column, so one handle every n_programs
    # is the first point of an argument; the last block (the full program) is left out
    legend1 = plt.legend(handles[0:-n_programs:n_programs], categories[0:-n_programs:n_programs],
                         markerscale=0.5, loc='upper left', bbox_to_anchor=(1, 1))
    # a second call to plt.legend replaces the first legend, so it is re-added explicitly
    plt.gca().add_artist(legend1)
    # legend of the programs: the last block holds the full program of every row, one colour each
    plt.legend(handles[-n_programs:], df_programs.index ,markerscale=0.2, bbox_to_anchor=(1, 0.5), loc='upper left')
    plt.title(title, fontsize=20)
    plt.savefig(f"{title}_{today}.png")

    plt.show()

# Compute Sentence Similarity

In [None]:
# Define some variables
#input_dir = "/kaggle/input/electoral-programs/summ_by_concept" # alternative input: summaries by online information
input_dir = "/kaggle/input/electoral-programs/it/elec_prog_extr_summ_version_2_top_10" # top 10 electoral programs

# merge the files of input_dir into one dataframe (also saved as programs_by_category.csv)
df_programs = join_csv(input_dir, "programs_by_category.csv")

In [None]:
# Put the programs in a fixed order: the index order is the order of rows and columns of the similarity matrices.
df_programs = df_programs.reindex(index = ['PD','Movimento5Stelle','AzioneItaliaviva', 'ForzaItalia',  'Lega',  'FratellidItalia'])
df_programs.index

In [None]:
# Compare the programs on every column of the dataframe (one column per argument).
arguments = df_programs.columns

In [None]:
#model = 'all-MiniLM-L6-v2' # use on the English text
model = 'efederici/sentence-bert-base' # use on the Italian text
# similarity between every pair of programs, averaged over all the arguments
cm = similarity_matrix(df_programs, arguments, model)

## Plotting

In [None]:
# Plot the similarity matrix computed on all the arguments (the title is also used in the name of the saved file).
plot_similarity_matrix(cm, "General Program Similarity")

### Categories Analysis

In [None]:
# Similarity matrix restricted to the "Diritti" argument.
plot_similarity_matrix_by_argument(df_programs, ["Diritti"], model)

#### Most common words

##### Ambiente 

In [None]:
# 10 most common words of every program for the "Ambiente" argument (also saved as Ambiente.csv).
return_most_common_words_by_arguments(df_programs, ["Ambiente"], 10)

##### Diritti

In [None]:
# Similarity matrix restricted to the "Ambiente" argument.
# NOTE(review): this cell sits under the "Diritti" heading while it plots "Ambiente" - confirm the argument.
plot_similarity_matrix_by_argument(df_programs, ["Ambiente"], model)

In [None]:
# 10 most common words of every program for the "Diritti" argument (also saved as Diritti.csv).
return_most_common_words_by_arguments(df_programs, ["Diritti"], 10)

In [None]:
# Check whether the word "Gay" (case-sensitive) is among the words of the PD text for the "Diritti" category;
# top_n = 10000 is used so that in practice all the words of the text are returned.
with open("/kaggle/input/programs/program_by_index_version_2/PD/Diritti.txt", "r") as f:
    text = " ".join(f.readlines())
"Gay" in most_common_words(text, 10000) 

##### Esteri

In [None]:
# 10 most common words of every program for the "Esteri" argument (also saved as Esteri.csv).
return_most_common_words_by_arguments(df_programs, ["Esteri"], 10)

### Scatter Plot

In [None]:
# Scatter plot of the embeddings of every program, argument by argument and as a whole.
plot_scatter_plot(df_programs, model)