In [1]:
import pandas as pd
from lexicalrichness import LexicalRichness
from detoxify import Detoxify
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
import os
import fasttext
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PolarizationAnalyzer():
    """Score texts by cosine similarity to an averaged polarization-term embedding.

    The rows labelled 'issue' in the polarization dictionary CSV are embedded
    with a SentenceTransformer model and averaged into a single reference
    vector. Input texts are cleaned, embedded with the same model and compared
    against that vector.
    """

    # Ordered (regex pattern, replacement) clean-up steps used by `preprocess`.
    _CLEANUP_STEPS = (
        # URLs, with or without an explicit scheme.
        (r"(?:https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})", ""),
        # Individual HTML entities such as "&amp;" or "&#39;". A greedy "&.*;"
        # would delete everything between the first "&" and the last ";".
        (r"&#?\w+;", ""),
        # Escaped whitespace sequences that appear literally in the text ("\\n").
        # Replaced with a space so that neighbouring words are not glued together.
        (r"\\t|\\n|\\r", " "),
        # Real tab / newline / carriage-return characters.
        ("\t|\n|\r", " "),
        # Collapse any run of whitespace into a single space.
        (r"\s+", " "),
        # Anonymise user mentions.
        (r"\@\w+", "@user"),
    )

    def __init__(self,model = 'sentence-transformers/all-mpnet-base-v2', device=None):
        """Load the embedding model and pre-compute the dictionary embedding.

        Args:
            model: Model name or path handed to ``SentenceTransformer``.
            device: Torch device string. When ``None`` (default) "cuda" is used
                if a GPU is available and "cpu" otherwise, so the class also
                works on machines without CUDA.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # Initialize the model
        self.model = SentenceTransformer(model, device=device)
        self.batch_size = 1024
        # Load the polarization terms and compute their embeddings
        self.load_and_embed_terms()

    def load_and_embed_terms(self):
        """Embed the 'issue' dictionary terms and store their mean in ``self.dict_embeddings``."""
        # Load terms from CSV
        filepath = "civirank/data/polarization_dictionary.csv"
        df = pd.read_csv(filepath, header=0)
        df = df[df['label'] == 'issue']
        # Missing words cannot be encoded, so they are dropped before embedding.
        unique_words = df['word'].dropna().unique()

        # Compute embeddings for the unique words
        term_embeddings = self.model.encode(
            list(unique_words),
            batch_size=self.batch_size,
            show_progress_bar=False,
            convert_to_tensor=True
        )

        # Average the embeddings to create a single dictionary embedding
        self.dict_embeddings = torch.mean(term_embeddings, dim=0)

    def preprocess(self, df):
        """Clean the ``"text"`` column of ``df`` in place.

        Removes URLs and HTML entities, turns tabs/newlines (real or escaped)
        into spaces, collapses whitespace and replaces ``@mentions`` with
        ``@user``.

        Args:
            df: DataFrame with a ``"text"`` column; it is modified in place.
        """
        for pattern, replacement in self._CLEANUP_STEPS:
            df["text"] = df["text"].replace(to_replace=pattern, value=replacement, regex=True)

    def get_embeddings(self, df):
        """Embed every entry of ``df["text"]`` with the sentence-transformer model.

        Args:
            df: DataFrame with a ``"text"`` column.

        Returns:
            The value returned by ``self.model.encode`` (one embedding per row).
        """
        # Encode text in batches; use the configured batch size rather than a
        # second hard-coded constant so the two cannot drift apart.
        corpus_embeddings = self.model.encode(
            list(df["text"]),
            batch_size=self.batch_size,
            show_progress_bar=False,
            convert_to_tensor=True
        )

        assert len(corpus_embeddings) == len(df)
        return corpus_embeddings

    def compute_similarity(self, text_embeddings):
        """Return the cosine similarity between each text embedding and the dictionary embedding."""
        return util.cos_sim(text_embeddings, self.dict_embeddings)

    def get_similarity(self, texts):
        """Clean, embed and score texts against the polarization dictionary.

        Args:
            texts: DataFrame with a ``"text"`` column. It is copied first, so
                the caller's frame is left untouched.

        Returns:
            NumPy array of cosine similarities, as produced by ``util.cos_sim``
            (presumably shape ``(n_texts, 1)`` — callers flatten it).
        """
        df = texts.copy()
        self.preprocess(df)
        text_embeddings = self.get_embeddings(df)
        cos_sim = self.compute_similarity(text_embeddings)
        return cos_sim.cpu().numpy()

class PolarizationAnalyzerGlove():
    """Polarization scorer that uses a local GloVe sentence-transformer model.

    Unlike ``PolarizationAnalyzer`` the dictionary term embeddings are not
    computed at start-up but read from a pre-computed pickle file and averaged
    into a single reference vector.
    """

    # Ordered (regex pattern, replacement) clean-up steps used by `preprocess`.
    _CLEANUP_STEPS = (
        # URLs, with or without an explicit scheme.
        (r"(?:https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})", ""),
        # Individual HTML entities such as "&amp;" or "&#39;". A greedy "&.*;"
        # would delete everything between the first "&" and the last ";".
        (r"&#?\w+;", ""),
        # Escaped whitespace sequences that appear literally in the text ("\\n").
        # Replaced with a space so that neighbouring words are not glued together.
        (r"\\t|\\n|\\r", " "),
        # Real tab / newline / carriage-return characters.
        ("\t|\n|\r", " "),
        # Collapse any run of whitespace into a single space.
        (r"\s+", " "),
        # Anonymise user mentions.
        (r"\@\w+", "@user"),
    )

    def __init__(self, device=None):
        """Load the GloVe model and the pre-computed dictionary embedding.

        Args:
            device: Torch device string. When ``None`` (default) "cuda" is used
                if a GPU is available and "cpu" otherwise.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        # Initialize the model
        model_path = "civirank/data/glove-model-reduced-stopwords"
        self.model = SentenceTransformer(model_path, device=device)
        self.batch_size = 1024
        # Load the polarization terms and compute their embeddings
        self.load_and_embed_terms()

    def load_and_embed_terms(self):
        """Read the pickled term embeddings and store their mean in ``self.dict_embeddings``.

        The pickle is expected to contain a mapping with an ``"embeddings"``
        entry that supports ``.to(device)``.
        """
        # Load pre-computed term embeddings from the pickle file
        pkl_path = "civirank/data/issue_polarization_embeddings.pkl"
        # NOTE(security): pickle.load can execute arbitrary code; only use this
        # with a trusted, locally generated file.
        # NOTE(review): if the tensors were pickled on a GPU, unpickling on a
        # CPU-only machine may still fail before `.to(...)` is reached — confirm.
        with open(pkl_path, "rb") as fin:
            stored = pickle.load(fin)
        term_embeddings = stored["embeddings"].to(self.device)
        # Average the embeddings to create a single dictionary embedding
        self.dict_embeddings = torch.mean(term_embeddings, dim=0)

    def preprocess(self, df):
        """Clean the ``"text"`` column of ``df`` in place.

        Removes URLs and HTML entities, turns tabs/newlines (real or escaped)
        into spaces, collapses whitespace and replaces ``@mentions`` with
        ``@user``.

        Args:
            df: DataFrame with a ``"text"`` column; it is modified in place.
        """
        for pattern, replacement in self._CLEANUP_STEPS:
            df["text"] = df["text"].replace(to_replace=pattern, value=replacement, regex=True)

    def get_embeddings(self, df):
        """Embed every entry of ``df["text"]`` with the GloVe model.

        Args:
            df: DataFrame with a ``"text"`` column.

        Returns:
            The value returned by ``self.model.encode`` (one embedding per row).
        """
        # Encode text in batches, honouring the configured batch size.
        corpus_embeddings = self.model.encode(
            list(df["text"]),
            batch_size=self.batch_size,
            show_progress_bar=False,
            convert_to_tensor=True
        )

        assert len(corpus_embeddings) == len(df)
        return corpus_embeddings

    def compute_similarity(self, text_embeddings):
        """Return the cosine similarity between each text embedding and the dictionary embedding."""
        return util.cos_sim(text_embeddings, self.dict_embeddings)

    def get_similarity(self, texts):
        """Clean, embed and score texts against the polarization dictionary.

        Args:
            texts: DataFrame with a ``"text"`` column. It is copied first, so
                the caller's frame is left untouched.

        Returns:
            NumPy array of cosine similarities, as produced by ``util.cos_sim``.
        """
        df = texts.copy()
        self.preprocess(df)
        text_embeddings = self.get_embeddings(df)
        cos_sim = self.compute_similarity(text_embeddings)
        return cos_sim.cpu().numpy()

def compute_similarities(sentences_dict, analyzer, labels=None):
    """Score every language's sentences and tabulate the results.

    Args:
        sentences_dict: Mapping ``language -> list of sentences``. Every list
            must have as many sentences as there are ``labels``.
        analyzer: Object exposing ``get_similarity(DataFrame)`` that returns an
            array with one similarity per row of the frame's ``"text"`` column.
        labels: Row labels for the resulting table. Defaults to the four labels
            ``polarized1, polarized2, unpolarized1, unpolarized2``.

    Returns:
        DataFrame with one column per language and one row per label.

    Raises:
        ValueError: If a language yields a number of scores different from
            ``len(labels)``.
    """
    if labels is None:
        labels = ['polarized1', 'polarized2', 'unpolarized1', 'unpolarized2']
    labels = list(labels)

    results = {}
    for language, texts in sentences_dict.items():
        similarities = analyzer.get_similarity(pd.DataFrame({"text": texts}))
        scores = similarities.flatten().tolist()
        # Fail early with a readable message instead of pandas' generic
        # length-mismatch error when building the frame.
        if len(scores) != len(labels):
            raise ValueError(
                f"{language!r}: got {len(scores)} similarity scores "
                f"but {len(labels)} row labels"
            )
        results[language] = scores
    return pd.DataFrame(results, index=labels)

from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

class PolarizationAnalyzerTF():
    """Polarization scorer built directly on Hugging Face ``transformers``.

    Texts are tokenized with ``AutoTokenizer``, run through ``AutoModel`` and
    mean-pooled over their non-padding tokens. The 'issue' terms of the
    polarization dictionary are embedded the same way and averaged into a
    single reference vector that input texts are compared against.
    """

    # Ordered (regex pattern, replacement) clean-up steps used by `preprocess`.
    _CLEANUP_STEPS = (
        # URLs, with or without an explicit scheme.
        (r"(?:https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})", ""),
        # Individual HTML entities such as "&amp;" or "&#39;". A greedy "&.*;"
        # would delete everything between the first "&" and the last ";".
        (r"&#?\w+;", ""),
        # Escaped whitespace sequences that appear literally in the text ("\\n").
        # Replaced with a space so that neighbouring words are not glued together.
        (r"\\t|\\n|\\r", " "),
        # Real tab / newline / carriage-return characters.
        ("\t|\n|\r", " "),
        # Collapse any run of whitespace into a single space.
        (r"\s+", " "),
        # Anonymise user mentions.
        (r"\@\w+", "@user"),
    )

    def __init__(self, model='sentence-transformers/all-mpnet-base-v2'):
        """Load tokenizer and model, then pre-compute the dictionary embedding.

        Args:
            model: Model name or path handed to ``AutoTokenizer`` / ``AutoModel``.
        """
        # Initialize the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModel.from_pretrained(model)
        self.batch_size = 1024
        # Load the polarization terms and compute their embeddings
        self.load_and_embed_terms()

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions enabled in ``attention_mask``.

        Args:
            model_output: Model output whose first element holds the token
                embeddings.
            attention_mask: Mask with one entry per token; masked-out (0)
                positions do not contribute to the mean.

        Returns:
            Tensor with one pooled embedding per input sequence.
        """
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp avoids a division by zero for sequences with an all-zero mask.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def _encode(self, texts):
        """Tokenize, run and mean-pool ``texts`` in chunks of ``self.batch_size``.

        Processing in chunks keeps one forward pass from having to hold every
        text at once.
        """
        pooled_chunks = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            pooled_chunks.append(self.mean_pooling(model_output, encoded_input['attention_mask']))
        if not pooled_chunks:
            # No texts: return an empty (0, hidden_size) tensor.
            # NOTE(review): relies on the model config exposing `hidden_size` — confirm for non-standard models.
            return torch.empty((0, self.model.config.hidden_size))
        return torch.cat(pooled_chunks, dim=0)

    def load_and_embed_terms(self):
        """Embed the 'issue' dictionary terms and store their mean in ``self.dict_embeddings``."""
        # Load terms from CSV
        filepath = "civirank/data/polarization_dictionary.csv"
        df = pd.read_csv(filepath, header=0)
        df = df[df['label'] == 'issue']
        # Missing words cannot be tokenized, so they are dropped before embedding.
        unique_words = df['word'].dropna().unique()

        # Tokenize, embed and pool the terms
        term_embeddings = self._encode(list(unique_words))

        # Average the embeddings to create a single dictionary embedding
        self.dict_embeddings = torch.mean(term_embeddings, dim=0)

    def preprocess(self, df):
        """Clean the ``"text"`` column of ``df`` in place.

        Removes URLs and HTML entities, turns tabs/newlines (real or escaped)
        into spaces, collapses whitespace and replaces ``@mentions`` with
        ``@user``.

        Args:
            df: DataFrame with a ``"text"`` column; it is modified in place.
        """
        for pattern, replacement in self._CLEANUP_STEPS:
            df["text"] = df["text"].replace(to_replace=pattern, value=replacement, regex=True)

    def get_embeddings(self, df):
        """Embed every entry of ``df["text"]``.

        Args:
            df: DataFrame with a ``"text"`` column.

        Returns:
            Tensor with one mean-pooled embedding per row, in row order.
        """
        # Encode text in batches of self.batch_size
        return self._encode(list(df["text"]))

    def compute_similarity(self, text_embeddings):
        """Return the cosine similarity between each text embedding and the dictionary embedding."""
        return util.cos_sim(text_embeddings, self.dict_embeddings)

    def get_similarity(self, texts):
        """Clean, embed and score texts against the polarization dictionary.

        Args:
            texts: DataFrame with a ``"text"`` column. It is copied first, so
                the caller's frame is left untouched.

        Returns:
            NumPy array of cosine similarities, as produced by ``util.cos_sim``.
        """
        df = texts.copy()
        self.preprocess(df)
        text_embeddings = self.get_embeddings(df)
        cos_sim = self.compute_similarity(text_embeddings)
        return cos_sim.cpu().numpy()


In [34]:
# Preview the prosocial dictionary. The file is read with header=None, so no
# header row is inferred and the single column is named 'word'.
# NOTE(review): `filepath` and `df` are not used by any other cell visible in
# this notebook.
filepath = "civirank/data/prosocial_dictionary.csv"
df = pd.read_csv(filepath, header=None, names = ['word'])
df.head()

Unnamed: 0,word
0,accommodate
1,admire
2,advise
3,affable
4,affection


In [3]:
# Test sentences keyed by language. Each list holds four sentences in a fixed
# order that matches the row labels used by compute_similarities:
# polarized1, polarized2, unpolarized1, unpolarized2.
# The non-English lists appear to be translations of the English one.
sentences = {
    "English": [
        "The corrupt politicians are ruining our democracy with their lies and deceit.",
        "These criminals are a threat to our peaceful society and must be stopped.",
        "The local community held a meeting to discuss the upcoming elections.",
        "People from different backgrounds came together to celebrate the festival."
    ],
    "Italian": [
        "I politici corrotti stanno rovinando la nostra democrazia con le loro bugie e inganni.",
        "Questi criminali sono una minaccia per la nostra società pacifica e devono essere fermati.",
        "La comunità locale ha tenuto una riunione per discutere le prossime elezioni.",
        "Persone di diversi background si sono riunite per celebrare il festival."
    ],
    "French": [
        "Les politiciens corrompus ruinent notre démocratie avec leurs mensonges et tromperies.",
        "Ces criminels sont une menace pour notre société paisible et doivent être arrêtés.",
        "La communauté locale a tenu une réunion pour discuter des prochaines élections.",
        "Des personnes de différents horizons se sont réunies pour célébrer le festival."
    ],
    "Russian": [
        "Коррумпированные политики разрушают нашу демократию своими ложью и обманом.",
        "Эти преступники угрожают нашему мирному обществу и должны быть остановлены.",
        "Местное сообщество провело встречу для обсуждения предстоящих выборов.",
        "Люди из разных слоев общества собрались, чтобы отпраздновать фестиваль."
    ],
    "Portuguese": [
        "Os políticos corruptos estão arruinando nossa democracia com suas mentiras e enganos.",
        "Esses criminosos são uma ameaça para nossa sociedade pacífica e devem ser detidos.",
        "A comunidade local realizou uma reunião para discutir as próximas eleições.",
        "Pessoas de diferentes origens se reuniram para celebrar o festival."
    ],
    "Spanish": [
        "Los políticos corruptos están arruinando nuestra democracia con sus mentiras y engaños.",
        "Estos criminales son una amenaza para nuestra sociedad pacífica y deben ser detenidos.",
        "La comunidad local celebró una reunión para discutir las próximas elecciones.",
        "Personas de diferentes orígenes se reunieron para celebrar el festival."
    ],
    "Turkish": [
        "Yolsuz politikacılar yalanları ve aldatmacalarıyla demokrasimizi mahvediyorlar.",
        "Bu suçlular barışçıl toplumumuza tehdit oluşturuyor ve durdurulmaları gerekiyor.",
        "Yerel topluluk, yaklaşan seçimleri tartışmak için bir toplantı yaptı.",
        "Farklı geçmişlerden insanlar festivali kutlamak için bir araya geldi."
    ]
}

In [4]:
# TF, all-mpnet-base-v2, fixed dict
# Raw Hugging Face transformers pipeline (PolarizationAnalyzerTF) with manual
# mean pooling of the token embeddings.
V = PolarizationAnalyzerTF("sentence-transformers/all-mpnet-base-v2")
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)

Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.322196,0.260662,0.214817,0.077519,0.238514,0.258684,0.392205
polarized2,0.272224,0.12575,0.190544,0.06905,0.214022,0.205365,0.150611
unpolarized1,0.277194,0.162375,0.214758,0.061415,0.261384,0.23692,0.205943
unpolarized2,0.185894,0.071506,0.019552,0.047946,0.088326,0.125417,0.163404


In [29]:
# all-mpnet-base-v2, fixed dict
# Same checkpoint as the TF cell, but encoded through SentenceTransformer
# (PolarizationAnalyzer).
V = PolarizationAnalyzer("sentence-transformers/all-mpnet-base-v2")
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)




Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.322082,0.259304,0.214406,0.077373,0.238186,0.257892,0.390958
polarized2,0.270462,0.123833,0.188338,0.068888,0.212405,0.203302,0.151397
unpolarized1,0.276402,0.160782,0.214057,0.061325,0.260947,0.236816,0.20655
unpolarized2,0.185092,0.070614,0.019575,0.048472,0.088947,0.126057,0.163701


In [30]:
# sentence-transformers/paraphrase-multilingual-mpnet-base-v2, fixed dict
# Multilingual checkpoint scored with the SentenceTransformer-based analyzer.
V = PolarizationAnalyzer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)



Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.306575,0.348605,0.331212,0.334709,0.338811,0.338522,0.362182
polarized2,0.416198,0.418934,0.414645,0.444557,0.419443,0.411341,0.424486
unpolarized1,0.370046,0.388773,0.392882,0.403784,0.381662,0.3773,0.40782
unpolarized2,0.183952,0.202088,0.227249,0.233623,0.21756,0.207749,0.23487


In [19]:
# all-mpnet-base-v2
# NOTE(review): this is the same code as the "all-mpnet-base-v2, fixed dict"
# cell, yet the stored output differs. The lower execution count suggests it
# was run with an earlier version of the dictionary or class — confirm, or
# re-run / remove this cell.
V = PolarizationAnalyzer("sentence-transformers/all-mpnet-base-v2")
similarity_df = compute_similarities(sentences, V)
similarity_df.head(10)



Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.03239,0.138811,0.048786,0.093928,0.051559,0.091762,0.234064
polarized2,0.04942,0.061838,0.051024,0.127765,0.068337,0.101919,0.204548
unpolarized1,0.089062,0.14977,0.088429,0.102644,0.145071,0.212202,0.223983
unpolarized2,0.109869,0.107426,0.051646,0.057767,0.107292,0.179788,0.207831


In [22]:
# paraphrase-multilingual-mpnet-base-v2
# NOTE(review): this is the same code as the "paraphrase-multilingual-mpnet-base-v2,
# fixed dict" cell, yet the stored output differs. The lower execution count
# suggests it was run with an earlier version of the dictionary or class —
# confirm, or re-run / remove this cell.
V = PolarizationAnalyzer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
similarity_df = compute_similarities(sentences, V)
similarity_df.head(10)



Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.06441,0.093183,0.090366,0.088261,0.083287,0.08346,0.082133
polarized2,0.151675,0.152305,0.158747,0.166214,0.160681,0.154786,0.159406
unpolarized1,0.171224,0.177547,0.174647,0.182736,0.171821,0.168611,0.186019
unpolarized2,0.129319,0.14606,0.14504,0.137523,0.145897,0.142713,0.154925


: 

In [9]:
# Glove, csv
# NOTE(review): PolarizationAnalyzerGlove as defined in this notebook reads the
# pickled embeddings, not the CSV. The stored output differs from the
# "Glove, pkl" cell, so it was presumably produced by an earlier definition of
# the class — confirm, or re-run / remove this cell.
V = PolarizationAnalyzerGlove()
similarity_df = compute_similarities(sentences, V)
similarity_df.head(10)

Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.405083,-0.02523,-0.05853,-0.102554,-0.16712,-0.089512,0.130725
polarized2,0.456295,-0.129897,0.003199,-0.102554,-0.114428,-0.099527,0.030498
unpolarized1,0.416074,-0.096301,0.03045,-0.11401,0.014081,-0.046252,-0.047
unpolarized2,0.445963,0.085589,0.097456,0.0,-0.010606,0.056041,-0.174001


In [18]:
# Glove, pkl
# GloVe model with the dictionary embedding loaded from the pre-computed pickle
# (PolarizationAnalyzerGlove).
V = PolarizationAnalyzerGlove()
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)

Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.612219,0.069996,-0.03013,0.046636,0.052965,0.076015,0.089972
polarized2,0.512247,0.060953,0.117587,0.046636,0.009452,0.00399,0.02571
unpolarized1,0.383135,0.095668,0.099378,0.131592,0.19584,0.102686,0.119306
unpolarized2,0.290264,0.078435,0.08784,0.0,0.075617,0.051701,0.114827


In [32]:
# glove, csv
# Generic PolarizationAnalyzer pointed at the local GloVe model, so the
# dictionary terms are read from the CSV and embedded at start-up.
V = PolarizationAnalyzer(model='civirank/data/glove-model-reduced-stopwords')
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)

Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.612219,0.069996,-0.03013,0.046636,0.052965,0.076015,0.089972
polarized2,0.512247,0.060953,0.117587,0.046636,0.009452,0.00399,0.02571
unpolarized1,0.383135,0.095668,0.099378,0.131592,0.19584,0.102686,0.119306
unpolarized2,0.290264,0.078435,0.08784,0.0,0.075617,0.051701,0.114827


In [12]:
# NOTE(review): `parsers` and `rankers` are imported but not used in any cell
# visible here; consider moving this import to the notebook's import cell.
from civirank import analyzers, parsers, rankers
# glove codebase, csv
# Reference run using the analyzer shipped in the civirank package rather than
# the classes defined in this notebook.
V = analyzers.PolarizationAnalyzer()
similarity_df = compute_similarities(sentences, V)
# One row per test sentence, one column per language.
similarity_df.head(10)

Unnamed: 0,English,Italian,French,Russian,Portuguese,Spanish,Turkish
polarized1,0.612219,0.069996,-0.03013,0.046636,0.052965,0.076015,0.089972
polarized2,0.512247,0.060953,0.117587,0.046636,0.009452,0.00399,0.02571
unpolarized1,0.383135,0.095668,0.099378,0.131592,0.19584,0.102686,0.119306
unpolarized2,0.290264,0.078435,0.08784,0.0,0.075617,0.051701,0.114827
