In [None]:
!pip install pandas numpy torch matplotlib hdbscan umap-learn scikit-learn sentence-transformers
!pip install bertopic==0.16.2
!pip install demoji
!pip install advertools

In [2]:
!pip install -qq accelerate
!pip install -qq datasets
!pip install -qq trl
!pip install -qq transformers
!pip install -qq bitsandbytes

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import demoji
from statistics import mean
from torch import cuda
from transformers import (AutoModelForCausalLM,
                          AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          GenerationConfig,
                          BitsAndBytesConfig,
                          pipeline,
                          logging)
from bertopic import BERTopic
from statistics import mean
import torch
import matplotlib.pyplot as plt
import hdbscan
import os
import io
import re
import string
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from umap import UMAP
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import seaborn as sns
import advertools as adv
from bertopic.representation import TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'



In [4]:
def remove_emojis(text):
    """Return ``text`` with emojis replaced by the empty string.

    Thin wrapper around ``demoji.replace``.
    """
    emoji_free = demoji.replace(text, '')
    return emoji_free

def preprocess_text(text):
    """Clean one raw argument before it is embedded and clustered.

    Applied in order: drop a leading retweet marker ("RT "), @-mentions,
    URLs, stand-alone URL placeholder tokens, stand-alone numbers and emojis;
    finally collapse runs of whitespace.

    Args:
        text: The raw text (str).

    Returns:
        The cleaned text, single-spaced and without leading/trailing
        whitespace.
    """
    # Remove a leading retweet marker
    text = re.sub(r'^RT[\s]+', '', text, flags=re.IGNORECASE)
    # Remove usernames @
    text = re.sub(r'@[^\s]+', '', text)
    # Remove URLs. The "s" of the scheme is optional so that plain http://
    # links are removed as a whole; matching only "https" left "://host/path"
    # fragments behind once the bare "http" was deleted below.
    text = re.sub(r'https?\S*', '', text)
    # Remove URL placeholder tokens such as "url", "HTML" or "HTTPURL".
    # Only whole tokens are removed: deleting the substrings anywhere used to
    # corrupt ordinary words (e.g. "curl" became "c").
    text = re.sub(r'\b(?:url|URL|html|HTML|http|HTTP)+\b', '', text)
    # Remove stand-alone numbers
    text = re.sub(r'\b\d+\b', '', text)
    text = remove_emojis(text)
    # Remove extra spaces that may have been introduced
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
def load_dataset(input_file, stance, topic) -> pd.DataFrame:
    '''
    Load and preprocess the arguments of a single topic and stance.

    Args:
        input_file: Path to a CSV file with at least the columns "topic",
            "stance" and "argument".
        stance: 1 keeps the rows whose stance is 1; any other value keeps the
            rows whose stance is -1.
        topic: Value of the "topic" column to keep (exact match).

    Returns:
        The matching rows with a fresh 0..n-1 index and an additional
        "preprocessed_arguments" column, or an empty DataFrame (no columns)
        when nothing matches.
    '''
    input_corpus = pd.read_csv(input_file)

    wanted_stance = 1 if stance == 1 else -1
    selection = (input_corpus["topic"] == topic) & (input_corpus["stance"] == wanted_stance)
    # reset_index (not in place) returns a new frame, so the column added
    # below is not written onto a slice of the full corpus.
    input_corpus = input_corpus[selection].reset_index(drop=True)

    if input_corpus.empty:
        print("No data available for the specified topic and stance.")
        return pd.DataFrame()  # Return an empty DataFrame

    # One cleaned string per argument, stored next to the original text.
    # (The former "preprocessed data is empty" check was unreachable: an
    # empty selection has already returned above.)
    input_corpus['preprocessed_arguments'] = [
        preprocess_text(arg) for arg in input_corpus['argument']
    ]

    # Debug output
    print("Loaded dataset:")
    print(input_corpus.head())
    print(f"Total records: {len(input_corpus)}")

    return input_corpus


In [None]:
# Greek seq2seq model used as the first keypoint generator
model_id = 'IMISLab/GreekWiki-umt5-base'

# Prompt template shared by all generators; [DOCUMENTS] is the placeholder
# that receives the representative documents of a topic.
summarization_prompt = """Παρακάτω θα δεις μερικά επιχειρήματα υπέρ ή κατά για ένα συγκεκριμένο θέμα:
[DOCUMENTS]

Με βάση τα παραπάνω, γράψε μια σύντομη πρόταση που να συνοψίζει αυτά τα επιχειρήματα σε ένα keypoint, ακολουθώντας το μοτίβο:
θέμα: <keypoint>"""


tokenizer = AutoTokenizer.from_pretrained(model_id)
# `model` stays bound as an alias of `t5_base`, exactly as before.
t5_base = model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Create your representation model
generator = pipeline(task='text2text-generation',
                     model=t5_base,
                     tokenizer=tokenizer,
                     max_new_tokens=100,
                     length_penalty=2.0,
                     num_beams=3,
                     early_stopping=True,
                     # Use the device detected in the import cell instead of a
                     # hard-coded "cuda", which fails on CPU-only machines.
                     device=device)
greek_t5_base = TextGeneration(generator, prompt=summarization_prompt)


tokenizer_config.json:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.55M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/7.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/860 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

# Meltemi base

In [7]:
# SECURITY: a literal Hugging Face access token used to be written on this line
# (inside a commented-out login command). It has been removed from the notebook;
# revoke that token, since it remains in earlier copies of this file.
# The token is read from Kaggle secrets and passed to the CLI login below.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
!huggingface-cli login --token $secret_hf

  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
model_id = "ilsp/Meltemi-7B-v1.5"
# 4-bit NF4 quantisation with double quantisation; computations run in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)
# Tokenizer of the Meltemi base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Meltemi base model, loaded quantised; device placement is left to device_map='auto'
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
base_model.eval()


# Chat template assigned to the tokenizer below: each message is rendered as
# a role tag (<|user|>, <|system|> or <|assistant|>), a newline, the message
# content and the EOS token; a trailing <|assistant|> tag is appended after
# the last message when add_generation_prompt is set.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
# System message followed by the shared summarisation prompt as the user turn
messages = [
    {"role": "system", "content": "Είσαι το Μελτέμι, ένα γλωσσικό μοντέλο για την ελληνική γλώσσα. Είσαι ιδιαίτερα βοηθητικό προς την χρήστρια ή τον χρήστη και δίνεις σύντομες αλλά επαρκώς περιεκτικές απαντήσεις. Απάντα με προσοχή, ευγένεια, αμεροληψία, ειλικρίνεια και σεβασμό προς την χρήστρια ή τον χρήστη."},
    {"role": "user", "content": summarization_prompt},
]

# Render the chat to a plain string (tokenize=False); the [DOCUMENTS]
# placeholder inside summarization_prompt is kept as-is in the result.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Text-generation pipeline around the quantised model (beam search, at most 100 new tokens)
generator = pipeline(
    model=base_model,
    tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=100,
    num_beams=3,
    early_stopping=True,
    length_penalty= 2.0)
# BERTopic representation model backed by Meltemi base
meltemi_base = TextGeneration(generator, prompt=prompt)

# Meltemi Instruct

In [None]:
model_id = "ilsp/Meltemi-7B-Instruct-v1.5"
# 4-bit NF4 quantisation with double quantisation; computations run in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)
# Tokenizer of the Meltemi instruct checkpoint (rebinds the global `tokenizer`)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Meltemi instruct model, loaded quantised; device placement is left to device_map='auto'
instruct_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
instruct_model.eval()

# System message followed by the shared summarisation prompt as the user turn
messages = [
    {"role": "system", "content": "Είσαι το Μελτέμι, ένα γλωσσικό μοντέλο για την ελληνική γλώσσα. Είσαι ιδιαίτερα βοηθητικό προς την χρήστρια ή τον χρήστη και δίνεις σύντομες αλλά επαρκώς περιεκτικές απαντήσεις. Απάντα με προσοχή, ευγένεια, αμεροληψία, ειλικρίνεια και σεβασμό προς την χρήστρια ή τον χρήστη."},
    {"role": "user", "content": summarization_prompt},
]


# No chat template is assigned in this cell, so the rendering relies on the
# template already present on this tokenizer (presumably the one shipped
# with the instruct checkpoint — TODO confirm).
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Text-generation pipeline around the quantised model (beam search, at most 100 new tokens)
generator = pipeline(
    model=instruct_model,
    tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=100,
    num_beams=3,
    early_stopping=True,
    length_penalty= 2.0)
# BERTopic representation model backed by Meltemi instruct
meltemi_instruct = TextGeneration(generator, prompt=prompt)

tokenizer_config.json:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.97M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/713 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/503M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

By default, four of the most representative documents will be passed to [DOCUMENTS]. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected.

To increase the number of documents passed to `[DOCUMENTS]`, we can use the `nr_docs` parameter, which BERTopic's LLM-based representation models (including `TextGeneration`, used above) accept. Setting it selects the top n most representative documents instead of the default four. If the model's context length is long enough, you could even give the LLM dozens of documents.

However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the diversity parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders!

In [10]:
# Show the rendered chat prompt held in `prompt` for inspection.
# NOTE(review): the saved output below contains a different user instruction
# than the current `summarization_prompt`, so it looks stale — re-run to refresh.
print(prompt)

<|system|>
Είσαι το Μελτέμι, ένα γλωσσικό μοντέλο για την ελληνική γλώσσα. Είσαι ιδιαίτερα βοηθητικό προς την χρήστρια ή τον χρήστη και δίνεις σύντομες αλλά επαρκώς περιεκτικές απαντήσεις. Απάντα με προσοχή, ευγένεια, αμεροληψία, ειλικρίνεια και σεβασμό προς την χρήστρια ή τον χρήστη.</s>
<|user|>
Γράψε μια σύντομη πρόταση ως περίληψη για το παρακάτω κείμενο:
[DOCUMENTS]

Περίληψη:</s>
<|assistant|>



In [None]:
# Topic modelling configuration (read as globals by topic_modeling below)
# --------------------------------------------------------------
n_neighbors = 10  # UMAP n_neighbors
n_components = 2  # UMAP n_components (dimensions of the reduced embedding)
min_samples_fraction = 1.0  # HDBSCAN min_samples, as a fraction of the minimum cluster size
selected_arguments_path = '/kaggle/input/meltemi-data/'  # Directory containing the input arguments CSV
output_topic_data = './output_topic_data/'  # Output directory for topic data
output_arguments_data = './output_arguments_data/'  # Output directory for arguments data (not used by the cells shown here)
stopwords = list(adv.stopwords['greek'])  # Greek stop words passed to the CountVectorizer

# Set environment variable to avoid parallelism issues with tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Set device for torch (rebinds the `device` computed in the import cell, now without a device index)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def topic_modeling(df, output_prefix, topic, stance):
    """Fit BERTopic on one topic/stance subset and save its topic table.

    Args:
        df: DataFrame with a 'preprocessed_arguments' text column.
        output_prefix: Prefix of the CSV file written to ``output_topic_data``.
        topic: Value stored in the 'topic' column of the saved table.
        stance: Value stored in the 'stance' column of the saved table.

    Returns:
        The topic-info DataFrame that was written to disk, or ``None`` when
        ``df`` contains no documents and modelling is skipped.
    """
    # load_dataset returns a column-less empty frame when nothing matches;
    # skip such subsets instead of failing with a KeyError further down.
    if df is None or df.empty or 'preprocessed_arguments' not in df.columns:
        print(f"Skipping topic modeling for {output_prefix}: no documents.")
        return None

    # Minimum cluster size: one fiftieth of the documents, but at least 3
    cluster_size = max(int(len(df) / 50), 3)
    print(f"Cluster size: {cluster_size}")

    # HDBSCAN min_samples derived from the cluster size, but at least 2
    min_samples = max(int(min_samples_fraction * cluster_size), 2)
    print(f"Min samples: {min_samples}")

    text_list = df['preprocessed_arguments'].tolist()
    embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", trust_remote_code=True)
    # Embeddings are computed once here and handed to fit_transform below
    embeddings = embedding_model.encode(text_list)
    umap_model = UMAP(random_state=42, n_neighbors=n_neighbors, n_components=n_components, min_dist=0.00, metric='cosine')
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=cluster_size,
                                    metric='euclidean',
                                    cluster_selection_method='leaf',
                                    min_samples=min_samples,
                                    prediction_data=True)
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords, lowercase=True)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    # One generated label per model; the keys become column names of the topic table
    representation_model = {"Greek_t5": greek_t5_base,"Meltemi_base": meltemi_base,"Meltemi_Instruct": meltemi_instruct}
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        low_memory=True,
        nr_topics=10,
        verbose=True,
        calculate_probabilities=True,
        language="multilingual"
    )

    # The per-document assignments returned by fit_transform are not needed here
    topic_model.fit_transform(documents=text_list, embeddings=embeddings)

    # Ensure output directory exists
    os.makedirs(output_topic_data, exist_ok=True)

    topic_info = topic_model.get_topic_info()

    # Add 'topic' and 'stance' columns to the topic information
    topic_info['topic'] = topic
    topic_info['stance'] = stance

    # Save the updated topic information to CSV
    output_file = os.path.join(output_topic_data, f"{output_prefix}_topic_info.csv")
    topic_info.to_csv(output_file, index=False)
    print(f"Topic information saved to {output_file}")

    return topic_info

# Read the arguments file once to find out which (topic, stance) pairs it contains
arguments_file_path = os.path.join(selected_arguments_path, 'arguments_human_translated_dev.csv')
arguments_df = pd.read_csv(arguments_file_path)

unique_combinations = arguments_df[['topic', 'stance']].drop_duplicates()

# One load + topic-modelling run per distinct pair; results are written to disk by topic_modeling
for _, row in unique_combinations.iterrows():
    topic, stance = row['topic'], row['stance']
    filtered_df = load_dataset(arguments_file_path, stance, topic)
    topic_modeling(
        filtered_df,
        output_prefix=f"{topic}_{stance}",
        topic=topic,
        stance=stance,
    )

In [None]:
# Directory holding the per-(topic, stance) topic tables, and the merged output path
csv_directory = './output_topic_data/'  # Replace with the path to your CSV files
merged_output_file = './merged_topic_data.csv'  # Path for the merged output file

# os.listdir returns names in arbitrary order; sorting makes the row order of
# the merged file reproducible between runs and machines.
csv_files = sorted(name for name in os.listdir(csv_directory) if name.endswith('.csv'))
if not csv_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate"
    raise ValueError(f"No CSV files found in {csv_directory}; run the topic modelling cell first.")

# One DataFrame per CSV file, in sorted file-name order
dataframes = [pd.read_csv(os.path.join(csv_directory, name)) for name in csv_files]

# Concatenate all DataFrames in the list into a single DataFrame
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv(merged_output_file, index=False)


In [None]:
def preprocess_t5(text):
    """Reduce a raw Greek-T5 generation to one clean sentence.

    Args:
        text: A generated string (possibly the text form of a list such as
            "['sentence', '', ...]"), a list of strings, or a missing value.

    Returns:
        * list input: the first non-empty string element, stripped, or ''
          when the list has no such element;
        * str input: the text before the first '.' or newline followed by a
          '.', with list brackets, surrounding single quotes and a leading
          hyphen removed ('' when nothing precedes the first '.'/newline);
        * any other value (None, NaN, ...): returned unchanged.
    """
    if isinstance(text, list):
        for item in text:
            if isinstance(item, str) and item.strip():
                return item.strip()
        # No usable element. This used to fall through to list.strip() and
        # raise AttributeError; an empty keypoint is returned instead.
        return ''

    if not isinstance(text, str):
        # Missing values (NaN/None) carry no text to clean
        return text

    # Turn a literal backslash-n sequence into a real newline
    text = text.replace("\\n", "\n")

    # Keep only what precedes the first period or newline
    first_part = re.split(r"[.\n]", text, maxsplit=1)[0]
    text = first_part.strip() + '.' if first_part else ''

    if text.startswith('['):
        # Text form of a list: take the first single-quoted item if there is
        # a complete one, otherwise just remove the brackets
        match = re.search(r"'(.*?)'", text)
        if match:
            text = match.group(1)
        else:
            text = text.strip("[]")

    # Remove leading and trailing single quotes, then surrounding whitespace
    text = text.strip("'").strip()

    # Remove a leading hyphen (bullet marker)
    if text.startswith('-'):
        text = text[1:].strip()

    return text

# Clean the Greek T5 keypoints; the raw generations in this column are overwritten
column = 'Greek_t5'
merged_df[column] = merged_df[column].apply(preprocess_t5)

In [None]:
def preprocess_meltemi_base1(text):
    """First cleaning pass over a raw Meltemi-base generation.

    Removes surrounding list brackets (only when the text starts with '['),
    surrounding single/double quotes, leading newlines and one leading
    "summarize: " / "assist: " / "assistant: " prefix.

    Args:
        text: The generated text; non-string values (e.g. NaN) are accepted.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a string
        (previously such values raised AttributeError).
    """
    if not isinstance(text, str):
        return text

    # Text form of a list: remove the enclosing brackets
    if text.startswith('['):
        text = text.strip("[]")

    # Remove leading and trailing single or double quotes
    text = text.strip("'\"")

    # Remove leading newline characters
    text = text.lstrip("\n")

    # Remove the first matching prefix, ignoring leading whitespace
    prefixes = ("summarize: ", "assist: ", "assistant: ")
    candidate = text.lstrip()
    for prefix in prefixes:
        if candidate.startswith(prefix):
            return candidate[len(prefix):].strip()

    return text

# First cleaning pass over the Meltemi base keypoints; the column is overwritten
column = 'Meltemi_base'
merged_df[column] = merged_df[column].apply(preprocess_meltemi_base1)

In [None]:
def preprocess_meltemi_base2(text):
    """Second cleaning pass over a Meltemi-base generation.

    For a list, the first non-empty string element (stripped) is returned;
    a list without one, and any other non-string value, is returned as is.
    For a string: leading newlines are removed, literal backslash-n
    sequences become real newlines, an echoed instruction (one of the known
    prompt openings) is cut away together with everything after the next
    full stop, a leading hyphen is dropped, and the text is finally
    truncated at the first remaining hyphen.
    """
    if isinstance(text, list):
        for item in text:
            if isinstance(item, str) and item.strip():
                return item.strip()
        return text

    if not isinstance(text, str):
        return text

    cleaned = text.lstrip('\n').replace("\\n", "\n").strip()

    # Known openings of the prompt that the model sometimes repeats
    echoed_openings = (
        "Παρακάτω θα δεις μερικά επιχειρήματα υπέρ ή κατά για ένα συγκεκριμένο θέμα:",
        "Γράψε μια σύντομη πρόταση ως περίληψη για το παρακάτω κείμενο:",
        "Τα παρακάτω επιχειρήματα συνοψίζουν ένα συγκεκριμένο θέμα:",
        "Το παρακάτω επιχείρημα συνοψίζει ένα συγκεκριμένο θέμα:",
    )
    for opening in echoed_openings:
        if opening in cleaned:
            # Keep what follows the opening, then only its first sentence
            after_opening = cleaned.split(opening, 1)[1].strip()
            cleaned = after_opening.partition('.')[0].strip()

    # Leading hyphen (bullet marker)
    if cleaned.startswith('-'):
        cleaned = cleaned[1:].strip()

    # Everything from the first hyphen onwards is discarded (any hyphen,
    # not only one preceded by a space)
    return cleaned.partition('-')[0].strip()

# Second cleaning pass over the Meltemi base keypoints; the column is overwritten
column = 'Meltemi_base'
merged_df[column] = merged_df[column].apply(preprocess_meltemi_base2)


In [None]:
def preprocess_meltemi_base3(text):
    """Third cleaning pass over a Meltemi-base generation.

    For a list, the first non-empty string element (stripped) is returned;
    a list without one, and any other non-string value, is returned as is.
    For a string: leading newlines are removed, literal backslash-n
    sequences become real newlines, a leading hyphen is dropped, the text is
    truncated at the first remaining hyphen and then at the first full stop.
    """
    if isinstance(text, list):
        for item in text:
            if isinstance(item, str) and item.strip():
                return item.strip()
        return text

    if not isinstance(text, str):
        return text

    cleaned = text.lstrip('\n').replace("\\n", "\n").strip()

    # Leading hyphen (bullet marker)
    if cleaned.startswith('-'):
        cleaned = cleaned[1:].strip()

    # Cut at the first hyphen (any hyphen, not only one preceded by a space) ...
    cleaned = cleaned.partition('-')[0].strip()

    # ... and then at the first full stop
    return cleaned.partition('.')[0].strip()



# Third cleaning pass over the Meltemi base keypoints; the column is overwritten
column = 'Meltemi_base'
merged_df[column] = merged_df[column].apply(preprocess_meltemi_base3)

In [None]:
def preprocess_meltemi_instruct1(text):
    """First cleaning pass over a raw Meltemi-instruct generation.

    Args:
        text: The generated text; non-string values (e.g. NaN) are accepted.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a string
        (previously such values raised TypeError).
    """
    if not isinstance(text, str):
        return text

    # Step 1: Ignore everything before the first colon
    if ':' in text:
        text = text.split(':', 1)[1]

    # Step 2: Drop leading newline characters
    text = text.lstrip('\n')

    # Step 3: Remove a leading list number such as "1. ", together with any
    # whitespace or literal backslash-n sequences in front of it. The former
    # pattern required a literal "\n\n" before the number, so "1. Foo." was
    # left alone and then cut down to "1." by step 5. (?!\d) keeps decimals
    # such as "3.5" intact.
    text = re.sub(r'^(?:\s|\\n)*\d+\.(?!\d)\s*', '', text)

    # Step 4: Replace remaining newlines with a space and trim
    text = text.replace('\n', ' ').strip()

    # Step 5: If the text ends with a full stop, keep only the first sentence
    if text.endswith('.'):
        text = text.split('.')[0] + '.'

    return text

def preprocess_dataframe(df, column_name):
    """Run ``preprocess_meltemi_instruct1`` over one column of ``df``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    cleaned_values = df[column_name].apply(preprocess_meltemi_instruct1)
    df[column_name] = cleaned_values
    return df

# First cleaning pass over the Meltemi_Instruct column.
# preprocess_dataframe returns the frame it was given, so processed_df and
# merged_df refer to the same (now modified) DataFrame.
processed_df = preprocess_dataframe(merged_df, 'Meltemi_Instruct')

In [None]:
def preprocess_meltemi_instruct2(text):
    """Second cleaning pass: reduce a Meltemi-instruct generation to one sentence.

    Args:
        text: A generated string (possibly the text form of a list such as
            "['sentence', '', ...]"), a list of strings, or a missing value.

    Returns:
        * list input: the first non-empty string element, stripped, or ''
          when the list has no such element;
        * str input: the first sentence/line followed by a '.', with a
          leading list number, list brackets, surrounding single quotes and
          a leading hyphen removed;
        * any other value (None, NaN, ...): returned unchanged.
    """
    if isinstance(text, list):
        for item in text:
            if isinstance(item, str) and item.strip():
                return item.strip()
        # No usable element. This used to fall through to list.strip() and
        # raise AttributeError; an empty keypoint is returned instead.
        return ''

    if not isinstance(text, str):
        # Missing values (NaN/None) carry no text to clean
        return text

    # Turn a literal backslash-n sequence into a real newline
    text = text.replace("\\n", "\n")

    # Remove leading whitespace/newlines and an optional list marker such as
    # "1. ". The former pattern had lost its digit part (it only matched
    # "\n\n. "), and any text starting with a newline ended up empty because
    # the split below cut at that very first newline.
    text = re.sub(r'^\s*(?:\d*\.(?!\d)\s*)?', '', text)

    # Keep only what precedes the first period or newline
    first_part = re.split(r"[.\n]", text, maxsplit=1)[0]
    text = first_part.strip() + '.' if first_part else ''

    if text.startswith('['):
        # Text form of a list: take the first single-quoted item if there is
        # a complete one, otherwise just remove the brackets
        match = re.search(r"'(.*?)'", text)
        if match:
            text = match.group(1)
        else:
            text = text.strip("[]")

    # Remove leading and trailing single quotes, then surrounding whitespace
    text = text.strip("'").strip()

    # Remove a leading hyphen (bullet marker)
    if text.startswith('-'):
        text = text[1:].strip()

    return text


# Second cleaning pass over the Meltemi instruct keypoints; the column is overwritten
column = 'Meltemi_Instruct'
processed_df[column] = processed_df[column].apply(preprocess_meltemi_instruct2)

In [None]:
# Treat empty Meltemi base keypoints as missing: either an empty string or the
# leftover of a list that contained only empty strings.
# The result is assigned back to the column; calling replace(..., inplace=True)
# on processed_df['Meltemi_base'] acts on a selected Series and is not
# guaranteed to update the DataFrame (pandas warns about this chained pattern).
processed_df['Meltemi_base'] = processed_df['Meltemi_base'].replace(
    ["', '', '', '', '', '', '', '', '',", ''], np.nan)

# Drop rows where 'Meltemi_base' is NaN
processed_df = processed_df.dropna(subset=['Meltemi_base'])
len(processed_df)

# Similarity-based merging of the generated keypoints

In [None]:
def get_embeddings(texts):
    """Get embeddings for a list of texts.

    Args:
        texts: List of strings to encode.

    Returns:
        The value returned by ``SentenceTransformer.encode`` called with
        ``convert_to_tensor=True``.
    """
    # Load the sentence-transformer once and keep it on the function object;
    # it was previously re-created on every call (once per group and column).
    model = getattr(get_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", trust_remote_code=True)
        get_embeddings._model = model
    return model.encode(texts, convert_to_tensor=True)

def merge_similar_rows(df, column, threshold=0.9):
    """Drop rows whose text in ``column`` nearly duplicates an earlier row.

    Rows are compared pairwise by cosine similarity of their sentence
    embeddings. A row is dropped when any row above it has a similarity
    greater than ``threshold``, so of two similar rows the earlier one is
    the one that can survive.

    Args:
        df: DataFrame to de-duplicate; its index is reset first.
        column: Name of the text column to compare.
        threshold: Similarity above which two rows count as duplicates
            (default 0.9, the value that was previously hard-coded).

    Returns:
        ``(cleaned_df, dropped_rows)``: the rows that were kept and a copy of
        the rows that were removed, the latter in their original order.
    """
    # Reset index so that positions and index labels coincide
    df = df.reset_index(drop=True)

    # With fewer than two rows nothing can be a duplicate; this also avoids
    # encoding an empty list.
    if len(df) < 2:
        return df, df.iloc[0:0].copy()

    embeddings = get_embeddings(df[column].tolist())
    similarity_matrix = cosine_similarity(embeddings.cpu().numpy())

    # is_earlier[i, j] is True for i < j, i.e. row i comes before row j.
    # NOTE(review): as before, a row that is itself dropped still causes the
    # later rows similar to it to be dropped — confirm this is intended.
    is_earlier = np.triu(np.ones(similarity_matrix.shape, dtype=bool), k=1)
    drop_mask = ((similarity_matrix > threshold) & is_earlier).any(axis=0)
    drop_positions = np.flatnonzero(drop_mask)

    dropped_rows = df.iloc[drop_positions].copy()
    cleaned_df = df.drop(index=drop_positions)

    return cleaned_df, dropped_rows

def process_by_topic_and_stance(df, text_columns):
    """Group by 'topic' and 'stance' and merge similar rows for specified text columns.

    Args:
        df: DataFrame with 'topic' and 'stance' columns plus the text columns.
        text_columns: Column names that are de-duplicated one after another
            inside every (topic, stance) group via ``merge_similar_rows``.

    Returns:
        ``(result_df, all_dropped_rows)``: the rows kept and the rows removed
        across all groups, each with a fresh 0..n-1 index.
    """
    kept_parts = []
    dropped_parts = []

    # Process each (topic, stance) group independently
    for _, group in df.groupby(['topic', 'stance']):
        merged_group = group.copy()
        for column in text_columns:
            merged_group, dropped_rows = merge_similar_rows(merged_group, column)
            dropped_parts.append(dropped_rows)
        kept_parts.append(merged_group)

    def _combine(parts):
        # Concatenate once (not inside the loop) and leave out empty frames;
        # fall back to an empty frame with df's columns when nothing is left.
        parts = [part for part in parts if not part.empty]
        if not parts:
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat(parts, ignore_index=True)

    return _combine(kept_parts), _combine(dropped_parts)

# Specify the text columns that need similarity checks.
# NOTE(review): this list used to name "Meltemi_Instruct_16shot", a column that
# none of the cells in this notebook create (the representation models are
# "Greek_t5", "Meltemi_base" and "Meltemi_Instruct"), so a fresh run failed
# with a KeyError here. Confirm that "Meltemi_Instruct" is the intended column.
text_columns = ["Meltemi_Instruct"]

# Apply the merging function for each topic-stance group and collect dropped rows
processed_df, dropped_rows_df = process_by_topic_and_stance(processed_df, text_columns)

# Reset index for the final DataFrames
processed_df = processed_df.reset_index(drop=True)
dropped_rows_df = dropped_rows_df.reset_index(drop=True)

# Text preprocessing for ROUGE computation (stemming)

In [None]:
# Irregular endings handled by step 1 of stemWord, grouped by replacement:
# each form on the right is replaced by the stem on the left.
__cases = {
    form: stem
    for stem, forms in (
        ("ΦΑ", ("ΦΑΓΙΑ", "ΦΑΓΙΟΥ", "ΦΑΓΙΩΝ")),
        ("ΣΚΑ", ("ΣΚΑΓΙΑ", "ΣΚΑΓΙΟΥ", "ΣΚΑΓΙΩΝ")),
        ("ΟΛΟ", ("ΟΛΟΓΙΟΥ", "ΟΛΟΓΙΑ", "ΟΛΟΓΙΩΝ")),
        ("ΣΟ", ("ΣΟΓΙΟΥ", "ΣΟΓΙΑ", "ΣΟΓΙΩΝ")),
        ("ΤΑΤΟ", ("ΤΑΤΟΓΙΑ", "ΤΑΤΟΓΙΟΥ", "ΤΑΤΟΓΙΩΝ")),
        ("ΚΡΕ", ("ΚΡΕΑΣ", "ΚΡΕΑΤΟΣ", "ΚΡΕΑΤΑ", "ΚΡΕΑΤΩΝ")),
        ("ΠΕΡ", ("ΠΕΡΑΣ", "ΠΕΡΑΤΟΣ", "ΠΕΡΑΤΑ", "ΠΕΡΑΤΩΝ")),
        ("ΤΕΡ", ("ΤΕΡΑΣ", "ΤΕΡΑΤΟΣ", "ΤΕΡΑΤΑ", "ΤΕΡΑΤΩΝ")),
        ("ΦΩ", ("ΦΩΣ", "ΦΩΤΟΣ", "ΦΩΤΑ", "ΦΩΤΩΝ")),
        ("ΚΑΘΕΣΤ", ("ΚΑΘΕΣΤΩΣ", "ΚΑΘΕΣΤΩΤΟΣ", "ΚΑΘΕΣΤΩΤΑ", "ΚΑΘΕΣΤΩΤΩΝ")),
        ("ΓΕΓΟΝ", ("ΓΕΓΟΝΟΣ", "ΓΕΓΟΝΟΤΟΣ", "ΓΕΓΟΝΟΤΑ", "ΓΕΓΟΝΟΤΩΝ")),
    )
    for form in forms
}
# Regex character classes of upper-case Greek vowels (the second one without Υ)
__vowels = "[ΑΕΗΙΟΥΩ]"
__refinedVowels = "[ΑΕΗΙΟΩ]"


def stemWord(w: str, exceptions: dict = None):
    stem = None
    suffix = None
    test1 = True

    if exceptions is not None and w in exceptions.keys():
        return exceptions[w]

    if len(w) < 4:
        return w

    pattern = None
    pattern2 = None
    pattern3 = None
    pattern4 = None

    # Step1
    pattern = re.compile(
        r"(.*)(ΦΑΓΙΑ|ΦΑΓΙΟΥ|ΦΑΓΙΩΝ|ΣΚΑΓΙΑ|ΣΚΑΓΙΟΥ|ΣΚΑΓΙΩΝ|ΟΛΟΓΙΟΥ|ΟΛΟΓΙΑ|ΟΛΟΓΙΩΝ|ΣΟΓΙΟΥ|ΣΟΓΙΑ|ΣΟΓΙΩΝ|ΤΑΤΟΓΙΑ|ΤΑΤΟΓΙΟΥ|ΤΑΤΟΓΙΩΝ|ΚΡΕΑΣ|ΚΡΕΑΤΟΣ|ΚΡΕΑΤΑ|ΚΡΕΑΤΩΝ|ΠΕΡΑΣ|ΠΕΡΑΤΟΣ|ΠΕΡΑΤΑ|ΠΕΡΑΤΩΝ|ΤΕΡΑΣ|ΤΕΡΑΤΟΣ|ΤΕΡΑΤΑ|ΤΕΡΑΤΩΝ|ΦΩΣ|ΦΩΤΟΣ|ΦΩΤΑ|ΦΩΤΩΝ|ΚΑΘΕΣΤΩΣ|ΚΑΘΕΣΤΩΤΟΣ|ΚΑΘΕΣΤΩΤΑ|ΚΑΘΕΣΤΩΤΩΝ|ΓΕΓΟΝΟΣ|ΓΕΓΟΝΟΤΟΣ|ΓΕΓΟΝΟΤΑ|ΓΕΓΟΝΟΤΩΝ)$")

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        suffix = fp[1]
        w = stem + __cases[suffix]
        test1 = False

    # Step 2a
    pattern = re.compile(r"^(.+?)(ΑΔΕΣ|ΑΔΩΝ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        reg1 = re.compile(r"(ΟΚ|ΜΑΜ|ΜΑΝ|ΜΠΑΜΠ|ΠΑΤΕΡ|ΓΙΑΓΙ|ΝΤΑΝΤ|ΚΥΡ|ΘΕΙ|ΠΕΘΕΡ)$")

        if not reg1.match(w):
            w = w + "ΑΔ"

    # Step 2b
    pattern2 = re.compile(r"^(.+?)(ΕΔΕΣ|ΕΔΩΝ)$")
    if pattern2.match(w):
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem
        except2 = re.compile(r"(ΟΠ|ΙΠ|ΕΜΠ|ΥΠ|ΓΗΠ|ΔΑΠ|ΚΡΑΣΠ|ΜΙΛ)$")
        if except2.match(w):
            w = w + "ΕΔ"

    # Step 2c
    pattern3 = re.compile(r"^(.+?)(ΟΥΔΕΣ|ΟΥΔΩΝ)$")
    if pattern3.match(w):
        fp = pattern3.match(w).groups()
        stem = fp[0]
        w = stem
        except3 = re.compile(r"(ΑΡΚ|ΚΑΛΙΑΚ|ΠΕΤΑΛ|ΛΙΧ|ΠΛΕΞ|ΣΚ|Σ|ΦΛ|ΦΡ|ΒΕΛ|ΛΟΥΛ|ΧΝ|ΣΠ|ΤΡΑΓ|ΦΕ)$")
        if except3.match(w):
            w = w + "ΟΥΔ"

    # Step 2d
    pattern4 = re.compile("^(.+?)(ΕΩΣ|ΕΩΝ)$")
    if pattern4.match(w):
        fp = pattern4.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except4 = re.compile(r"^(Θ|Δ|ΕΛ|ΓΑΛ|Ν|Π|ΙΔ|ΠΑΡ)$")
        if except4.match(w):
            w = w + "Ε"

    # Step 3
    pattern = re.compile(r"^(.+?)(ΙΑ|ΙΟΥ|ΙΩΝ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        pattern2 = re.compile(__vowels + "$")
        test1 = False
        if pattern2.match(w):
            w = stem + "Ι"

    # Step 4
    pattern = re.compile(r"^(.+?)(ΙΚΑ|ΙΚΟ|ΙΚΟΥ|ΙΚΩΝ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        pattern2 = re.compile(__vowels + "$")
        except5 = re.compile(
            r"^(ΑΛ|ΑΔ|ΕΝΔ|ΑΜΑΝ|ΑΜΜΟΧΑΛ|ΗΘ|ΑΝΗΘ|ΑΝΤΙΔ|ΦΥΣ|ΒΡΩΜ|ΓΕΡ|ΕΞΩΔ|ΚΑΛΠ|ΚΑΛΛΙΝ|ΚΑΤΑΔ|ΜΟΥΛ|ΜΠΑΝ|ΜΠΑΓΙΑΤ|ΜΠΟΛ|ΜΠΟΣ|ΝΙΤ|ΞΙΚ|ΣΥΝΟΜΗΛ|ΠΕΤΣ|ΠΙΤΣ|ΠΙΚΑΝΤ|ΠΛΙΑΤΣ|ΠΟΣΤΕΛΝ|ΠΡΩΤΟΔ|ΣΕΡΤ|ΣΥΝΑΔ|ΤΣΑΜ|ΥΠΟΔ|ΦΙΛΟΝ|ΦΥΛΟΔ|ΧΑΣ)$")
        if except5.match(w) or pattern2.match(w):
            w = w + "ΙΚ"

    # step 5a
    pattern = re.compile(r"^(.+?)(ΑΜΕ)$")
    pattern2 = re.compile(r"^(.+?)(ΑΓΑΜΕ|ΗΣΑΜΕ|ΟΥΣΑΜΕ|ΗΚΑΜΕ|ΗΘΗΚΑΜΕ)$")
    if w == "ΑΓΑΜΕ":
        w = "ΑΓΑΜ"

    if pattern2.match(w):
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except6 = re.compile(r"^(ΑΝΑΠ|ΑΠΟΘ|ΑΠΟΚ|ΑΠΟΣΤ|ΒΟΥΒ|ΞΕΘ|ΟΥΛ|ΠΕΘ|ΠΙΚΡ|ΠΟΤ|ΣΙΧ|Χ)$")
        if except6.match(w):
            w = w + "ΑΜ"

    # Step 5b
    pattern2 = re.compile(r"^(.+?)(ΑΝΕ)$")
    pattern3 = re.compile(r"^(.+?)(ΑΓΑΝΕ|ΗΣΑΝΕ|ΟΥΣΑΝΕ|ΙΟΝΤΑΝΕ|ΙΟΤΑΝΕ|ΙΟΥΝΤΑΝΕ|ΟΝΤΑΝΕ|ΟΤΑΝΕ|ΟΥΝΤΑΝΕ|ΗΚΑΝΕ|ΗΘΗΚΑΝΕ)$")
    if pattern3.match(w):
        fp = pattern3.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        pattern3 = re.compile(r"^(ΤΡ|ΤΣ)$")
        if pattern3.match(w):
            w = w + "ΑΓΑΝ"

    if pattern2.match(w):
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        pattern2 = re.compile(__refinedVowels + "$")
        except7 = re.compile(
            r"^(ΒΕΤΕΡ|ΒΟΥΛΚ|ΒΡΑΧΜ|Γ|ΔΡΑΔΟΥΜ|Θ|ΚΑΛΠΟΥΖ|ΚΑΣΤΕΛ|ΚΟΡΜΟΡ|ΛΑΟΠΛ|ΜΩΑΜΕΘ|Μ|ΜΟΥΣΟΥΛΜ|Ν|ΟΥΛ|Π|ΠΕΛΕΚ|ΠΛ|ΠΟΛΙΣ|ΠΟΡΤΟΛ|ΣΑΡΑΚΑΤΣ|ΣΟΥΛΤ|ΤΣΑΡΛΑΤ|ΟΡΦ|ΤΣΙΓΓ|ΤΣΟΠ|ΦΩΤΟΣΤΕΦ|Χ|ΨΥΧΟΠΛ|ΑΓ|ΟΡΦ|ΓΑΛ|ΓΕΡ|ΔΕΚ|ΔΙΠΛ|ΑΜΕΡΙΚΑΝ|ΟΥΡ|ΠΙΘ|ΠΟΥΡΙΤ|Σ|ΖΩΝΤ|ΙΚ|ΚΑΣΤ|ΚΟΠ|ΛΙΧ|ΛΟΥΘΗΡ|ΜΑΙΝΤ|ΜΕΛ|ΣΙΓ|ΣΠ|ΣΤΕΓ|ΤΡΑΓ|ΤΣΑΓ|Φ|ΕΡ|ΑΔΑΠ|ΑΘΙΓΓ|ΑΜΗΧ|ΑΝΙΚ|ΑΝΟΡΓ|ΑΠΗΓ|ΑΠΙΘ|ΑΤΣΙΓΓ|ΒΑΣ|ΒΑΣΚ|ΒΑΘΥΓΑΛ|ΒΙΟΜΗΧ|ΒΡΑΧΥΚ|ΔΙΑΤ|ΔΙΑΦ|ΕΝΟΡΓ|ΘΥΣ|ΚΑΠΝΟΒΙΟΜΗΧ|ΚΑΤΑΓΑΛ|ΚΛΙΒ|ΚΟΙΛΑΡΦ|ΛΙΒ|ΜΕΓΛΟΒΙΟΜΗΧ|ΜΙΚΡΟΒΙΟΜΗΧ|ΝΤΑΒ|ΞΗΡΟΚΛΙΒ|ΟΛΙΓΟΔΑΜ|ΟΛΟΓΑΛ|ΠΕΝΤΑΡΦ|ΠΕΡΗΦ|ΠΕΡΙΤΡ|ΠΛΑΤ|ΠΟΛΥΔΑΠ|ΠΟΛΥΜΗΧ|ΣΤΕΦ|ΤΑΒ|ΤΕΤ|ΥΠΕΡΗΦ|ΥΠΟΚΟΠ|ΧΑΜΗΛΟΔΑΠ|ΨΗΛΟΤΑΒ)$")
        if (pattern2.match(w)) or (except7.match(w)):
            w = w + "ΑΝ"

    # //Step 5c
    pattern3 = re.compile(r"^(.+?)(ΕΤΕ)$")
    pattern4 = re.compile(r"^(.+?)(ΗΣΕΤΕ)$")
    if pattern4.match(w):
        fp = pattern4.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False

    if pattern3.match(w):
        fp = pattern3.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        pattern3 = re.compile(__refinedVowels + "$")
        except8 = re.compile(
            r"(ΟΔ|ΑΙΡ|ΦΟΡ|ΤΑΘ|ΔΙΑΘ|ΣΧ|ΕΝΔ|ΕΥΡ|ΤΙΘ|ΥΠΕΡΘ|ΡΑΘ|ΕΝΘ|ΡΟΘ|ΣΘ|ΠΥΡ|ΑΙΝ|ΣΥΝΔ|ΣΥΝ|ΣΥΝΘ|ΧΩΡ|ΠΟΝ|ΒΡ|ΚΑΘ|ΕΥΘ|ΕΚΘ|ΝΕΤ|ΡΟΝ|ΑΡΚ|ΒΑΡ|ΒΟΛ|ΩΦΕΛ)$")
        except9 = re.compile(
            r"^(ΑΒΑΡ|ΒΕΝ|ΕΝΑΡ|ΑΒΡ|ΑΔ|ΑΘ|ΑΝ|ΑΠΛ|ΒΑΡΟΝ|ΝΤΡ|ΣΚ|ΚΟΠ|ΜΠΟΡ|ΝΙΦ|ΠΑΓ|ΠΑΡΑΚΑΛ|ΣΕΡΠ|ΣΚΕΛ|ΣΥΡΦ|ΤΟΚ|Υ|Δ|ΕΜ|ΘΑΡΡ|Θ)$")
        if (pattern3.match(w)) or (except8.match(w)) or (except9.match(w)):
            w = w + "ΕΤ"

    # Step 5d
    pattern = re.compile(r"^(.+?)(ΟΝΤΑΣ|ΩΝΤΑΣ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except10 = re.compile(r"^(ΑΡΧ)$")
        except11 = re.compile(r"(ΚΡΕ)$")
        if except10.match(w):
            w = w + "ΟΝΤ"
        if except11.match(w):
            w = w + "ΩΝΤ"

    # Step 5e
    pattern = re.compile(r"^(.+?)(ΟΜΑΣΤΕ|ΙΟΜΑΣΤΕ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except11 = re.compile("^(ΟΝ)$")
        if except11.match(w):
            w = w + "ΟΜΑΣΤ"

    # Step 5f
    pattern = re.compile(r"^(.+?)(ΕΣΤΕ)$")
    pattern2 = re.compile(r"^(.+?)(ΙΕΣΤΕ)$")
    if pattern2.match(w):
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        pattern2 = re.compile(r"^(Π|ΑΠ|ΣΥΜΠ|ΑΣΥΜΠ|ΑΚΑΤΑΠ|ΑΜΕΤΑΜΦ)$")
        if pattern2.match(w):
            w = w + "ΙΕΣΤ"

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except12 = re.compile(r"^(ΑΛ|ΑΡ|ΕΚΤΕΛ|Ζ|Μ|Ξ|ΠΑΡΑΚΑΛ|ΑΡ|ΠΡΟ|ΝΙΣ)$")
        if except12.match(w):
            w = w + "ΕΣΤ"

    # Step 5g
    pattern = re.compile(r"^(.+?)(ΗΚΑ|ΗΚΕΣ|ΗΚΕ)$")
    pattern2 = re.compile(r"^(.+?)(ΗΘΗΚΑ|ΗΘΗΚΕΣ|ΗΘΗΚΕ)$")
    if pattern2.match(w):
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except13 = re.compile(r"(ΣΚΩΛ|ΣΚΟΥΛ|ΝΑΡΘ|ΣΦ|ΟΘ|ΠΙΘ)$")
        except14 = re.compile(r"^(ΔΙΑΘ|Θ|ΠΑΡΑΚΑΤΑΘ|ΠΡΟΣΘ|ΣΥΝΘ|)$")
        if (except13.match(w)) or (except14.match(w)):
            w = w + "ΗΚ"

    # Step 5h
    pattern = re.compile(r"^(.+?)(ΟΥΣΑ|ΟΥΣΕΣ|ΟΥΣΕ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except15 = re.compile(
            r"^(ΦΑΡΜΑΚ|ΧΑΔ|ΑΓΚ|ΑΝΑΡΡ|ΒΡΟΜ|ΕΚΛΙΠ|ΛΑΜΠΙΔ|ΛΕΧ|Μ|ΠΑΤ|Ρ|Λ|ΜΕΔ|ΜΕΣΑΖ|ΥΠΟΤΕΙΝ|ΑΜ|ΑΙΘ|ΑΝΗΚ|ΔΕΣΠΟΖ|ΕΝΔΙΑΦΕΡ|ΔΕ|ΔΕΥΤΕΡΕΥ|ΚΑΘΑΡΕΥ|ΠΛΕ|ΤΣΑ)$")
        except16 = re.compile(r"(ΠΟΔΑΡ|ΒΛΕΠ|ΠΑΝΤΑΧ|ΦΡΥΔ|ΜΑΝΤΙΛ|ΜΑΛΛ|ΚΥΜΑΤ|ΛΑΧ|ΛΗΓ|ΦΑΓ|ΟΜ|ΠΡΩΤ)$")
        if (except15.match(w)) or (except16.match(w)):
            w = w + "ΟΥΣ"

    # Step 5i
    pattern = re.compile(r"^(.+?)(ΑΓΑ|ΑΓΕΣ|ΑΓΕ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except17 = re.compile(r"^(ΨΟΦ|ΝΑΥΛΟΧ)$")
        except20 = re.compile(r"(ΚΟΛΛ)$")
        except18 = re.compile(
            r"^(ΑΒΑΣΤ|ΠΟΛΥΦ|ΑΔΗΦ|ΠΑΜΦ|Ρ|ΑΣΠ|ΑΦ|ΑΜΑΛ|ΑΜΑΛΛΙ|ΑΝΥΣΤ|ΑΠΕΡ|ΑΣΠΑΡ|ΑΧΑΡ|ΔΕΡΒΕΝ|ΔΡΟΣΟΠ|ΞΕΦ|ΝΕΟΠ|ΝΟΜΟΤ|ΟΛΟΠ|ΟΜΟΤ|ΠΡΟΣΤ|ΠΡΟΣΩΠΟΠ|ΣΥΜΠ|ΣΥΝΤ|Τ|ΥΠΟΤ|ΧΑΡ|ΑΕΙΠ|ΑΙΜΟΣΤ|ΑΝΥΠ|ΑΠΟΤ|ΑΡΤΙΠ|ΔΙΑΤ|ΕΝ|ΕΠΙΤ|ΚΡΟΚΑΛΟΠ|ΣΙΔΗΡΟΠ|Λ|ΝΑΥ|ΟΥΛΑΜ|ΟΥΡ|Π|ΤΡ|Μ)$")
        except19 = re.compile(r"(ΟΦ|ΠΕΛ|ΧΟΡΤ|ΛΛ|ΣΦ|ΡΠ|ΦΡ|ΠΡ|ΛΟΧ|ΣΜΗΝ)$")
        if (except18.match(w) and except19.match(w)) and not ((except17.match(w)) or (except20.match(w))):
            w = w + "ΑΓ"

    # Step 5j
    pattern = re.compile("^(.+?)(ΗΣΕ|ΗΣΟΥ|ΗΣΑ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except21 = re.compile(r"^(Ν|ΧΕΡΣΟΝ|ΔΩΔΕΚΑΝ|ΕΡΗΜΟΝ|ΜΕΓΑΛΟΝ|ΕΠΤΑΝ)$")
        if except21.match(w):
            w = w + "ΗΣ"

    # Step 5k
    pattern = re.compile(r"^(.+?)(ΗΣΤΕ)$")

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except22 = re.compile(r"^(ΑΣΒ|ΣΒ|ΑΧΡ|ΧΡ|ΑΠΛ|ΑΕΙΜΝ|ΔΥΣΧΡ|ΕΥΧΡ|ΚΟΙΝΟΧΡ|ΠΑΛΙΜΨ)$")
        if except22.match(w):
            w = w + "ΗΣΤ"

    # Step 5l
    pattern = re.compile("^(.+?)(ΟΥΝΕ|ΗΣΟΥΝΕ|ΗΘΟΥΝΕ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except23 = re.compile("^(Ν|Ρ|ΣΠΙ|ΣΤΡΑΒΟΜΟΥΤΣ|ΚΑΚΟΜΟΥΤΣ|ΕΞΩΝ)$")
        if except23.match(w):
            w = w + "ΟΥΝ"

    # Step 5l
    pattern = re.compile(r"^(.+?)(ΟΥΜΕ|ΗΣΟΥΜΕ|ΗΘΟΥΜΕ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem
        test1 = False
        except24 = re.compile(r"^(ΠΑΡΑΣΟΥΣ|Φ|Χ|ΩΡΙΟΠΛ|ΑΖ|ΑΛΛΟΣΟΥΣ|ΑΣΟΥΣ)$")
        if except24.match(w):
            w = w + "ΟΥΜ"

    # Step 6
    pattern = re.compile(r"^(.+?)(ΜΑΤΑ|ΜΑΤΩΝ|ΜΑΤΟΣ)$")
    pattern2 = re.compile(
        r"^(.+?)(Α|ΑΓΑΤΕ|ΑΓΑΝ|ΑΕΙ|ΑΜΑΙ|ΑΝ|ΑΣ|ΑΣΑΙ|ΑΤΑΙ|ΑΩ|Ε|ΕΙ|ΕΙΣ|ΕΙΤΕ|ΕΣΑΙ|ΕΣ|ΕΤΑΙ|Ι|ΙΕΜΑΙ|ΙΕΜΑΣΤΕ|ΙΕΤΑΙ|ΙΕΣΑΙ|ΙΕΣΑΣΤΕ|ΙΟΜΑΣΤΑΝ|ΙΟΜΟΥΝ|ΙΟΜΟΥΝΑ|ΙΟΝΤΑΝ|ΙΟΝΤΟΥΣΑΝ|ΙΟΣΑΣΤΑΝ|ΙΟΣΑΣΤΕ|ΙΟΣΟΥΝ|ΙΟΣΟΥΝΑ|ΙΟΤΑΝ|ΙΟΥΜΑ|ΙΟΥΜΑΣΤΕ|ΙΟΥΝΤΑΙ|ΙΟΥΝΤΑΝ|Η|ΗΔΕΣ|ΗΔΩΝ|ΗΘΕΙ|ΗΘΕΙΣ|ΗΘΕΙΤΕ|ΗΘΗΚΑΤΕ|ΗΘΗΚΑΝ|ΗΘΟΥΝ|ΗΘΩ|ΗΚΑΤΕ|ΗΚΑΝ|ΗΣ|ΗΣΑΝ|ΗΣΑΤΕ|ΗΣΕΙ|ΗΣΕΣ|ΗΣΟΥΝ|ΗΣΩ|Ο|ΟΙ|ΟΜΑΙ|ΟΜΑΣΤΑΝ|ΟΜΟΥΝ|ΟΜΟΥΝΑ|ΟΝΤΑΙ|ΟΝΤΑΝ|ΟΝΤΟΥΣΑΝ|ΟΣ|ΟΣΑΣΤΑΝ|ΟΣΑΣΤΕ|ΟΣΟΥΝ|ΟΣΟΥΝΑ|ΟΤΑΝ|ΟΥ|ΟΥΜΑΙ|ΟΥΜΑΣΤΕ|ΟΥΝ|ΟΥΝΤΑΙ|ΟΥΝΤΑΝ|ΟΥΣ|ΟΥΣΑΝ|ΟΥΣΑΤΕ|Υ|ΥΣ|Ω|ΩΝ)$")

    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem + "ΜΑ"

    if pattern2.match(w) and test1:
        fp = pattern2.match(w).groups()
        stem = fp[0]
        w = stem

    # Step 7 (ΠΑΡΑΘΕΤΙΚΑ)
    pattern = re.compile(r"^(.+?)(ΕΣΤΕΡ|ΕΣΤΑΤ|ΟΤΕΡ|ΟΤΑΤ|ΥΤΕΡ|ΥΤΑΤ|ΩΤΕΡ|ΩΤΑΤ)$")
    if pattern.match(w):
        fp = pattern.match(w).groups()
        stem = fp[0]
        w = stem

    return w

# Notes:
# (1) The Greek word to be stemmed must be in capital letters and without accents, e.g. ΨΩΜΙ and not Ψωμί.
# (2) Words of length 1-3 are returned unchanged (they are not stemmed).
# (3) The exceptions argument is a dictionary that maps a word to the custom stem to return for it.

In [None]:
def replaceMultiple(main, replacements, new):
    """Return `main` with every entry of `replacements` replaced by `new`.

    The entries are processed one after another in the order given, each on
    the result of the previous replacement.
    """
    for target in replacements:
        if target not in main:
            continue
        main = main.replace(target, new)
    return main

def normalize(x):
    """Strip accents and other diacritics from lowercase Greek vowels in `x`.

    Any character whose canonical decomposition starts with one of
    α ε η ι ο υ ω is replaced by that bare vowel. This covers every
    precomposed character the previous hand-written lists mapped, plus the
    ones they missed (for example ϊ, while ϋ and ΐ were handled). Combining
    marks that directly follow such a vowel (decomposed / NFD input) are
    dropped as well.

    All other characters, including uppercase accented Greek letters and
    accented Latin letters, are returned unchanged, so callers should
    lower-case the text first.
    """
    import unicodedata  # stdlib; imported locally so this cell is self-contained

    greek_vowels = 'αεηιουω'
    out = []
    # True while we are right after a Greek vowel, so that stray combining
    # marks belonging to it can be skipped.
    after_greek_vowel = False
    for ch in x:
        if after_greek_vowel and unicodedata.combining(ch):
            continue
        base = unicodedata.normalize('NFD', ch)[0]
        if base in greek_vowels:
            out.append(base)
            after_greek_vowel = True
        else:
            out.append(ch)
            after_greek_vowel = False
    return ''.join(out)

def remove_emojis(text):
    """Return `text` with the emojis found by `demoji` replaced by an empty string."""
    without_emojis = demoji.replace(text, '')
    return without_emojis

def sep_punc(x):
    """Return `x` with every punctuation character wrapped in single spaces.

    A character counts as punctuation when it occurs in the `punc` string
    below; all other characters are copied through unchanged.
    """
    punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~؛،؟؛.»«”'
    return ''.join((' ' + ch + ' ') if ch in punc else ch for ch in x)

def load_stopwords(file_path):
    """Read a UTF-8 text file and return its lines as a set of stopwords."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return set(handle.read().splitlines())

def preprocess_text(text):
    """Clean a text and return its tokens, stemmed and joined by single spaces.

    This definition replaces the `preprocess_text` defined earlier in the
    notebook. The steps run in this order: strip a leading "RT", @mentions and
    https links; remove URL leftovers; lower-case and strip Greek accents;
    remove punctuation, Latin letters and stand-alone numbers; drop
    one-character words and emojis; upper-case; tokenize with NLTK; stem each
    token with `stemWord`.
    """
    # Retweet marker, usernames and URLs
    text = re.sub(r'^RT[\s]+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'https\S+', '', text)
    for leftover in ('url', 'URL', 'html', 'HTML', 'http', 'HTTP'):
        text = re.sub(leftover, '', text)

    # Lower-case first: normalize() only handles lowercase accented letters
    text = sep_punc(normalize(text.lower()))

    # Blank out ASCII punctuation (the underscore is deliberately excluded from this class)
    punct_class = ''.join(re.escape(c) for c in string.punctuation if c != '_')
    text = re.sub(f"[{punct_class}]", ' ', text)

    # Latin letters, then stand-alone numbers
    text = re.sub(r'[a-zA-Z]+', '', text)
    text = re.sub(r'\b\d+\b', '', text)

    # Keep only words longer than one character
    text = ' '.join(word for word in text.split() if len(word) > 1)

    # The stemmer expects upper-case input
    text = str(remove_emojis(text)).upper().strip()

    return ' '.join(stemWord(token) for token in word_tokenize(text))

In [None]:
# Reference key points (judging by the file name: human-translated, test split).
# NOTE(review): absolute Google Drive path - presumably requires Drive to be mounted at /content/drive.
REF_KEY_POINTS_CSV = '/content/drive/MyDrive/Πτυχιακή/Code/Experiments_Meltemi/train_dev_test_dataset/key_points_human_translated_test.csv'
ref_df = pd.read_csv(REF_KEY_POINTS_CSV)

In [None]:
# Show full cell contents instead of truncating long text columns
pd.options.display.max_colwidth = None

def preprocess_dataset(pred_df, ref_df, pred_col='Meltemi_Instruct_16shot', ref_col='key_point'):
    """Group predicted and reference key points by (topic, stance).

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Must contain the columns 'topic', 'stance' and `pred_col`.
    ref_df : pandas.DataFrame
        Must contain the columns 'topic', 'stance' and `ref_col`.
    pred_col : str, optional
        Column of `pred_df` holding the generated key points.
    ref_col : str, optional
        Column of `ref_df` holding the reference key points.

    Returns
    -------
    (predictions, references) : tuple of two lists of lists
        One entry per combination of the unique topics and unique stances
        found in `pred_df` (topics in the outer loop, stances in the inner
        one). `predictions[i]` and `references[i]` belong to the same
        combination; either may be an empty list when that combination has no
        rows in the corresponding frame.
    """
    # Topics and stances are taken from the predictions only.
    topics = pred_df['topic'].unique()
    stances = pred_df['stance'].unique()

    predictions, references = [], []

    for topic in topics:
        for stance in stances:
            pred_rows = (pred_df['topic'] == topic) & (pred_df['stance'] == stance)
            ref_rows = (ref_df['topic'] == topic) & (ref_df['stance'] == stance)

            predictions.append(pred_df.loc[pred_rows, pred_col].tolist())
            references.append(ref_df.loc[ref_rows, ref_col].tolist())

    return predictions, references

In [None]:
predictions, references = preprocess_dataset(processed_df, ref_df)

# Spot-check the group at index 5: generated key points first, then the references
for grouped in (predictions, references):
    print(grouped[5])

In [None]:
def compute_rouge(predictions, references):
    """Print ROUGE-1, ROUGE-2 and ROUGE-L averaged over all groups.

    `predictions` and `references` are parallel lists of lists of strings (one
    inner list per group). Every text is passed through `preprocess_text`,
    every (prediction, reference) pair of a group is scored, the pair scores
    are averaged per group and the group averages are averaged again. Values
    are rounded to 3 decimals at each of these stages.

    Groups that have no predictions or no references are skipped with a
    message; averaging their empty score lists would raise
    `statistics.StatisticsError`. Nothing is returned.
    """
    from itertools import product  # stdlib; local so the function does not depend on another cell's import

    # Load the Rouge metric from the evaluate library.
    # NOTE(review): `load` is not imported in the notebook's main import cell -
    # confirm an earlier cell runs `from evaluate import load`.
    rouge = load('rouge')

    def whitespace_tokenizer(sentence):
        # The texts are already tokenized and stemmed, so a plain split is enough.
        return sentence.split()

    rouge1_scores, rouge2_scores, rougel_scores = [], [], []

    for preds, refs in zip(predictions, references):
        # Preprocess the predicted keypoints and reference keypoints
        preds = [preprocess_text(pred) for pred in preds]
        print(preds)
        refs = [preprocess_text(ref) for ref in refs]
        print(refs)

        if not preds or not refs:
            print("Skipping group: no predictions or no references to compare.")
            continue

        # Per-group scores over all combinations of generated and ground-truth keypoints
        r_1, r_2, r_l = [], [], []
        for a, b in product(preds, refs):
            rouge_scores = rouge.compute(predictions=[a], references=[b], tokenizer=whitespace_tokenizer)

            # Since the values are floats, no need to access 'fmeasure'
            r_1.append(round(rouge_scores['rouge1'], 3))
            r_2.append(round(rouge_scores['rouge2'], 3))
            r_l.append(round(rouge_scores['rougeL'], 3))

        # Save per-group scores to compute the average over all groups as the final score
        rouge1_scores.append(round(mean(r_1), 3))
        rouge2_scores.append(round(mean(r_2), 3))
        rougel_scores.append(round(mean(r_l), 3))

    if not rouge1_scores:
        print("No group had both predictions and references; nothing to score.")
        return

    # Print the average scores across all groups
    print("Rouge 1: {}".format(round(mean(rouge1_scores), 3)))
    print("Rouge 2: {}".format(round(mean(rouge2_scores), 3)))
    print("Rouge L: {}".format(round(mean(rougel_scores), 3)))

def compute_bertscore(predictions, references):
    """Print BERTScore precision, recall and F1 averaged over all groups.

    `predictions` and `references` are parallel lists of lists of strings (one
    inner list per group). The raw texts are scored without preprocessing:
    every (prediction, reference) pair of a group is scored with lang="el",
    the pair scores are averaged per group and the group averages are averaged
    again. Values are rounded to 3 decimals at each of these stages.

    Groups that have no predictions or no references are skipped with a
    message; averaging their empty score lists would raise
    `statistics.StatisticsError`. Nothing is returned.
    """
    from itertools import product  # stdlib; local so the function does not depend on another cell's import

    # NOTE(review): `load` is not imported in the notebook's main import cell -
    # confirm an earlier cell runs `from evaluate import load`.
    bertscore_metric = load('bertscore')

    bertscore_precisions, bertscore_recalls, bertscore_f1s = [], [], []

    for preds, refs in zip(predictions, references):
        pairs = list(product(preds, refs))
        if not pairs:
            print("Skipping group: no predictions or no references to compare.")
            continue

        # Score all pairs of the group in one call; the metric returns one
        # precision / recall / f1 value per pair, in input order.
        candidates, golds = zip(*pairs)
        bertscore_result = bertscore_metric.compute(
            predictions=list(candidates), references=list(golds), lang="el")
        bert_prec = [round(value, 3) for value in bertscore_result['precision']]
        bert_recall = [round(value, 3) for value in bertscore_result['recall']]
        bert_f1 = [round(value, 3) for value in bertscore_result['f1']]

        # Save per-group BERTScore to compute the average over all groups as the final score
        bertscore_precisions.append(round(mean(bert_prec), 3))
        bertscore_recalls.append(round(mean(bert_recall), 3))
        bertscore_f1s.append(round(mean(bert_f1), 3))

    if not bertscore_f1s:
        print("No group had both predictions and references; nothing to score.")
        return

    # Print the average BERTScore scores across all groups
    print("BERTScore Precision: {}".format(round(mean(bertscore_precisions), 3)))
    print("BERTScore Recall: {}".format(round(mean(bertscore_recalls), 3)))
    print("BERTScore F1: {}".format(round(mean(bertscore_f1s), 3)))


In [None]:
# Print the averaged ROUGE-1/2/L scores of the generated key points against the references
compute_rouge(predictions, references)

In [None]:
# Print the averaged BERTScore precision/recall/F1 of the generated key points against the references
compute_bertscore(predictions, references)