In [None]:
!pip install pandas numpy torch matplotlib hdbscan umap-learn scikit-learn sentence-transformers
!pip install bertopic==0.16.2
!pip install optuna
!pip install demoji
!pip install advertools

In [None]:
# System-level setup for an apt-based (Debian/Ubuntu) runtime.
!apt-get update -q
# libgomp1 is the GNU OpenMP runtime library.
!apt-get install -y -q libgomp1
# Download and install the Google Chrome .deb package.
# NOTE(review): presumably required by kaleido for static image export — confirm.
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
# Resolve any dependencies the dpkg install above left unmet.
!apt --fix-broken install -y -q

In [None]:
!pip install -q kaleido

In [None]:
import os
# Expose the Chrome binary location through the BROWSER_PATH environment variable.
# NOTE(review): presumably consumed by kaleido when exporting plots — confirm.
os.environ["BROWSER_PATH"] = "/usr/bin/google-chrome-stable"

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import optuna
import demoji
from statistics import mean
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from optuna import create_study
from bertopic import BERTopic
from statistics import mean
import torch
import matplotlib.pyplot as plt
import hdbscan
import os
import io
import re
import string
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from umap import UMAP
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_param_importances
import seaborn as sns
import advertools as adv

In [None]:
def remove_emojis(text):
    '''Return ``text`` with every emoji replaced by the empty string (via demoji).'''
    replacement = ''
    without_emojis = demoji.replace(text, replacement)
    return without_emojis

def preprocess_text(text):
    '''
    Clean one tweet-style argument before it is embedded.

    Applied in order: strip a leading "RT" retweet marker, drop @mentions,
    drop URLs (http, https and www. forms), drop the stand-alone placeholder
    words url / html / http / https, drop stand-alone numbers, remove emojis
    and collapse whitespace.

    Args:
        text: Raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    '''
    # Missing values must not reach re.sub, which only accepts strings.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove retweet marker at the start of the text
    text = re.sub(r'^RT\s+', '', text, flags=re.IGNORECASE)
    # Remove usernames @
    text = re.sub(r'@\S+', '', text)
    # Remove whole URLs, including plain http:// and www. links, so that no
    # "://host/path" debris is left behind.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    # Remove leftover URL placeholder tokens, but only as whole words so that
    # ordinary words containing these letters (e.g. "curl") stay intact.
    text = re.sub(r'\b(?:url|html|https?)\b', '', text, flags=re.IGNORECASE)
    # Remove stand-alone numbers
    text = re.sub(r'\b\d+\b', '', text)
    text = remove_emojis(text)
    # Remove extra spaces that may have been introduced
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [None]:
def load_dataset(input_file, stance, topic) -> pd.DataFrame:
    '''
    Load the arguments of a single topic and stance and attach cleaned text.

    Args:
        input_file: Path (or buffer) of a CSV with at least the columns
            ``topic``, ``stance`` and ``argument``.
        stance: ``1`` selects rows whose stance is 1; any other value selects
            rows whose stance is -1.
        topic: Exact value of the ``topic`` column to keep.

    Returns:
        The matching rows with a fresh 0..n-1 index and an extra
        ``preprocessed_arguments`` column produced by ``preprocess_text``.
        An empty DataFrame (no columns) when nothing matches.
    '''
    input_corpus = pd.read_csv(input_file)

    target_stance = 1 if stance == 1 else -1
    selected = (input_corpus["topic"] == topic) & (input_corpus["stance"] == target_stance)
    # Work on an independent copy so adding a column below never writes
    # through to (or warns about) a slice of the full corpus.
    input_corpus = input_corpus.loc[selected].copy().reset_index(drop=True)

    if input_corpus.empty:
        print("No data available for the specified topic and stance.")
        return pd.DataFrame()  # Return an empty DataFrame

    # Missing arguments are cleaned as empty text; the original 'argument'
    # column itself is left untouched.
    raw_arguments = input_corpus['argument'].fillna('').astype(str)
    input_corpus['preprocessed_arguments'] = [preprocess_text(arg) for arg in raw_arguments]

    # Debug output
    print("Loaded dataset:")
    print(input_corpus.head())
    print(f"Total records: {len(input_corpus)}")

    return input_corpus


# 1st brute-force approach with defaults

In [None]:
# Settings for the default-parameter baseline run; topic_modeling below reads
# n_neighbors, n_components, stopwords and output_topic_data as globals.
n_neighbors = 15  # UMAP n_neighbors
n_components = 5  # UMAP n_components
min_samples_fraction = 1.0  # NOTE(review): not read by the baseline topic_modeling defined in this cell
# Directory holding the dataset CSVs. NOTE(review): looks like a Google Drive mount path — confirm it exists in the target runtime.
selected_arguments_path = '/content/drive/MyDrive/Πτυχιακή/Code/Experiments_Meltemi/train_dev_test_dataset/'
output_topic_data = './output_topic_data/'  # Output directory for cluster plots and topic summaries
output_arguments_data = './output_arguments_data/'  # NOTE(review): not referenced by the visible code
# Greek stop-word list from advertools, passed to CountVectorizer.
stopwords = list(adv.stopwords['greek'])

# Set environment variable to avoid parallelism issues with tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# 'cuda' when a GPU is available, else 'cpu'.
# NOTE(review): this value is not passed to any model in the visible code.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def topic_modeling(df, output_prefix):
    '''
    Cluster one topic/stance subset with BERTopic using the baseline settings.

    Embeds ``df['preprocessed_arguments']`` with a multilingual
    sentence-transformer, fits BERTopic (UMAP, HDBSCAN, CountVectorizer) on the
    precomputed embeddings, scores the clustering with DBCV, and writes an
    interactive document plot plus a per-topic summary CSV into
    ``output_topic_data``.

    Reads the module-level names ``n_neighbors``, ``n_components``,
    ``stopwords`` and ``output_topic_data``.

    NOTE(review): a later cell defines another ``topic_modeling`` with a
    different signature; once that cell has run, this definition is shadowed.

    Args:
        df: DataFrame with a ``preprocessed_arguments`` column. It is modified
            in place: a ``topic_assignment`` column is added.
        output_prefix: String used to build the output file names.

    Returns:
        Tuple ``(df, embeddings, dbcv)``: the same ``df`` object, the sentence
        embeddings of its texts, and the DBCV validity score.
    '''
    text_list = df['preprocessed_arguments'].tolist()
    embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", trust_remote_code=True)
    embeddings = embedding_model.encode(text_list)
    umap_model = UMAP(random_state=42, n_neighbors=n_neighbors, n_components=n_components, min_dist=0.00, metric='cosine')
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10,  #default
                                    metric='euclidean',
                                    cluster_selection_method='eom',
                                    prediction_data=True)
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords, lowercase=True)
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        low_memory=True,
        nr_topics=10,
        verbose=True,
        calculate_probabilities=True,
        language="multilingual"
    )

    # The per-document probabilities are not used by this baseline.
    topics, _ = topic_model.fit_transform(documents=text_list, embeddings=embeddings)

    topic_array = np.array(topics)
    # validity_index is fed double-precision embeddings.
    embeddings_double_t = embeddings.astype('double')

    # DBCV is computed on the original embeddings, not on the UMAP projection.
    dbcv = hdbscan.validity_index(embeddings_double_t, labels=topic_array, metric='cosine')

    df['topic_assignment'] = topics

    # Label -1 marks outliers, so it is not counted as a topic.
    is_outlier = df['topic_assignment'] == -1
    print(f"Number of topics: {df.loc[~is_outlier, 'topic_assignment'].nunique()}")
    print(f"Number of outliers: {int(is_outlier.sum())}")
    print(f"DBCV: {dbcv}")

    # Label each topic with its 3 most representative words
    topic_labels_fig = topic_model.generate_topic_labels(nr_words=3,
                                                         topic_prefix=False,
                                                         word_length=50,
                                                         separator=', ')
    topic_model.set_topic_labels(topic_labels_fig)

    # Create the figure for document visualization
    figure = topic_model.visualize_documents(docs=text_list,
                                             embeddings=embeddings,
                                             hide_annotations=False,
                                             custom_labels=True)

    os.makedirs(output_topic_data, exist_ok=True)
    # Save each plot with a unique filename based on the output_prefix
    html_filename = os.path.join(output_topic_data, f'{output_prefix}_cluster_plot.html')
    figure.write_html(html_filename)

    print(f"Cluster plot saved to:  {html_filename}")

    # Generate the topic counts and labels for the summary DataFrame
    topic_counts = df['topic_assignment'].value_counts().reset_index()
    topic_counts.columns = ['topic', 'count']

    # Map topic IDs to a comma-separated list of BERTopic's representative words
    topic_labels_dict = {topic_id: ', '.join([word for word, _ in topic_model.get_topic(topic_id)])
                         for topic_id in topic_counts['topic'].unique() if topic_id != -1}

    # Handle outliers (-1) separately
    topic_labels_dict[-1] = "Outliers"

    topic_counts['topic_label'] = topic_counts['topic'].map(topic_labels_dict)

    # Save the topic counts and labels
    topic_summary_df = topic_counts[['topic', 'count', 'topic_label']]
    summary_output_path = os.path.join(output_topic_data, f"topic_summary_{output_prefix}.csv")
    topic_summary_df.to_csv(summary_output_path, index=False)

    print(f"Topic summary saved to: {summary_output_path}")

    return df, embeddings, dbcv

In [None]:
# Load the main dataset
arguments_file_path = '/content/drive/MyDrive/Πτυχιακή/Code/Experiments_Meltemi/train_dev_test_dataset/arguments_human_translated_dev.csv'
arguments_df = pd.read_csv(arguments_file_path)

# Get unique combinations of topic and stance
unique_combinations = arguments_df[['topic', 'stance']].drop_duplicates()

# Per-combination outputs: the annotated DataFrames and their DBCV scores
results = []
dbcv_scores = []

seed_value = 42

# Process each unique topic and stance combination
for _, row in unique_combinations.iterrows():
    topic = row['topic']
    stance = row['stance']
    print(f"Processing topic: {topic}, stance: {stance}")

    # Load dataset based on the topic and stance
    filtered_df = load_dataset(arguments_file_path, stance, topic)

    if not filtered_df.empty:
        # Apply topic modeling
        output_prefix = f"{topic}_{stance}".replace(" ", "_")
        processed_df, embeddings, dbcv = topic_modeling(filtered_df, output_prefix)
        results.append(processed_df)
        dbcv_scores.append(dbcv)
    else:
        print(f"No data found for topic: {topic}, stance: {stance}")

# Average only the finite scores: an empty list or a single NaN score would
# otherwise turn the whole average into NaN.
finite_dbcv_scores = [score for score in dbcv_scores if np.isfinite(score)]
if finite_dbcv_scores:
    average_dbcv = np.mean(finite_dbcv_scores)
    print(f"Average DBCV score across all topics: {average_dbcv}")
else:
    average_dbcv = float('nan')
    print("No DBCV scores were obtained.")

# Optuna hyperparameter tuning

In [None]:
_EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# Holds the loaded model and the embeddings of the most recently encoded texts.
# Only one set of embeddings is kept, so memory use stays bounded.
_EMBEDDING_CACHE = {}


def _embed_texts(text_list):
    '''Return ``(model, embeddings)`` for ``text_list``, reusing the loaded model and the last embeddings when the texts are unchanged.'''
    if "model" not in _EMBEDDING_CACHE:
        _EMBEDDING_CACHE["model"] = SentenceTransformer(_EMBEDDING_MODEL_NAME, trust_remote_code=True)
    texts_key = tuple(text_list)
    if _EMBEDDING_CACHE.get("texts") != texts_key:
        _EMBEDDING_CACHE["embeddings"] = _EMBEDDING_CACHE["model"].encode(text_list)
        _EMBEDDING_CACHE["texts"] = texts_key
    return _EMBEDDING_CACHE["model"], _EMBEDDING_CACHE["embeddings"]


def topic_modeling(df, output_prefix, n_neighbors, n_components, min_samples_fraction, cluster_selection_method):
    '''
    Fit BERTopic with the given hyperparameters and return the DBCV score.

    The embeddings do not depend on any tuned hyperparameter, so the model and
    the embeddings of the current texts are cached and reused across calls
    instead of being recomputed for every trial.

    Args:
        df: DataFrame of one topic/stance subset. The ``preprocessed_arguments``
            column is used when present (as in the baseline run); otherwise
            the raw ``argument`` column is used.
        output_prefix: Accepted for interface compatibility; not used here.
        n_neighbors: UMAP ``n_neighbors``.
        n_components: UMAP ``n_components``.
        min_samples_fraction: HDBSCAN ``min_samples`` expressed as a fraction
            of the minimum cluster size.
        cluster_selection_method: HDBSCAN ``cluster_selection_method``.

    Returns:
        The DBCV validity score of the resulting topic assignment.
    '''
    # Minimum cluster size is 2% of the documents, but never below 3.
    cluster_size = max(3, int(len(df) / 50))
    print(f"Cluster size: {cluster_size}")

    min_samples = max(2, int(min_samples_fraction * cluster_size))
    print(f"Min samples: {min_samples}")

    text_column = 'preprocessed_arguments' if 'preprocessed_arguments' in df.columns else 'argument'
    text_list = df[text_column].tolist()
    embedding_model, embeddings = _embed_texts(text_list)

    # Adjust hyperparameters passed to UMAP and HDBSCAN
    umap_model = UMAP(random_state=42, n_neighbors=n_neighbors, n_components=n_components, min_dist=0.00, metric='cosine')
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=cluster_size,
                                    metric='euclidean',
                                    cluster_selection_method=cluster_selection_method,
                                    min_samples=min_samples,
                                    prediction_data=True)
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords, lowercase=True)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=10,
        verbose=True,
        calculate_probabilities=True
    )

    # The per-document probabilities are not used for scoring.
    topics, _ = topic_model.fit_transform(documents=text_list, embeddings=embeddings)

    topic_array = np.array(topics)
    # validity_index is fed double-precision embeddings.
    embeddings_double_t = embeddings.astype('double')

    # DBCV is computed on the original embeddings, not on the UMAP projection.
    dbcv = hdbscan.validity_index(embeddings_double_t, labels=topic_array, metric='cosine')

    print(f"DBCV: {dbcv}")
    return dbcv


def objective(trial, df, csv_file):
    '''
    Optuna objective: sample hyperparameters, run topic modeling, return DBCV.

    The score only exists once the whole pipeline has run, so there is no
    intermediate value worth reporting: pruning at that point cannot save any
    computation. Reporting under ``step=trial.number`` also gave every trial a
    different step, leaving the pruner nothing comparable. The report/prune
    calls are therefore omitted. A NaN score is turned into a pruned trial so
    it can never be selected as the best one.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        df: DataFrame of one topic/stance subset, passed to ``topic_modeling``.
        csv_file: Name identifying the subset; its extension (if any) is
            stripped and the rest is forwarded as ``output_prefix``.

    Returns:
        The DBCV score to maximize.

    Raises:
        optuna.exceptions.TrialPruned: If the score is NaN.
    '''
    # Suggest values for each hyperparameter
    n_neighbors = trial.suggest_categorical('n_neighbors', [3,5,10,15])
    n_components = trial.suggest_categorical('n_components', [2,5,7])
    min_samples_fraction = trial.suggest_categorical('min_samples_fraction', [0.5, 1.0])
    cluster_selection_method = trial.suggest_categorical('cluster_selection_method', ['eom', 'leaf'])

    # Perform topic modeling and get the DBCV score
    dbcv_score = topic_modeling(
        df,
        output_prefix=os.path.splitext(csv_file)[0],
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_samples_fraction=min_samples_fraction,
        cluster_selection_method=cluster_selection_method
    )

    if np.isnan(dbcv_score):
        raise optuna.exceptions.TrialPruned()

    return dbcv_score


In [None]:
selected_arguments_path = '/kaggle/input/meltemi-data/'
# Output directories for topic and arguments data
output_topic_data = './'
output_arguments_data = './'

# Set environment variable to avoid parallelism issues with tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Set device for torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the main CSV file that contains topics and stances
arguments_file_path = os.path.join(selected_arguments_path, 'arguments_human_translated_dev.csv')
arguments_df = pd.read_csv(arguments_file_path)

# Get unique combinations of topic and stance
unique_combinations = arguments_df[['topic', 'stance']].drop_duplicates()

# One entry per tuned topic/stance combination
results = []
dbcv_scores = []  # List to store DBCV scores for averaging
seed_value = 42

# (plot function, file-name suffix, label used in the "saved to" message)
study_plots = [
    (plot_optimization_history, "optimization_history", "Optimization history"),
    (plot_parallel_coordinate, "parallel_coordinate", "Parallel coordinate plot"),
    (plot_param_importances, "param_importances", "Hyperparameter importances plot"),
]

# Iterate through each unique combination of topic and stance
for _, row in unique_combinations.iterrows():
    topic = row['topic']
    stance = row['stance']
    print(f"Processing topic: {topic}, stance: {stance}")

    # Load dataset based on the topic and stance
    filtered_df = load_dataset(arguments_file_path, stance, topic)
    if filtered_df.empty:
        continue

    # Create an Optuna study for each topic-stance combination
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed_value), pruner=MedianPruner())

    # Optimize the objective function for the current dataset
    study.optimize(lambda trial: objective(trial, filtered_df, f"{topic}_{stance}"), n_trials=100)

    # study.best_trial raises when no trial completed (for example when every
    # trial was pruned or failed), which would abort the remaining combinations.
    completed_trials = study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))
    if not completed_trials:
        print(f"No completed trials for topic: {topic}, stance: {stance}")
        continue

    # Report the best trial for this combination
    print(f"Best trial for topic: {topic}, stance: {stance}")
    best_trial = study.best_trial
    print(f"DBCV Score: {best_trial.value}")
    print(f"Best hyperparameters: {best_trial.params}")

    # Save the optimization history, parallel coordinate and importance plots
    for plot_fn, suffix, label in study_plots:
        # Parameter importances cannot be evaluated from a single completed trial.
        if plot_fn is plot_param_importances and len(completed_trials) < 2:
            print(f"Skipping hyperparameter importances plot for topic: {topic}, stance: {stance}")
            continue
        image_path = os.path.join(output_topic_data, f"{topic}_{stance}_{suffix}.png")
        plot_fn(study).write_image(image_path)
        print(f"{label} saved to: {image_path}")

    # Store the DBCV score for averaging
    dbcv_scores.append(best_trial.value)

    # Store results
    results.append({
        'topic': topic,
        'stance': stance,
        'dbcv_score': best_trial.value,
        'best_hyperparameters': best_trial.params
    })

# Output all results
for result in results:
    print(f"Topic: {result['topic']}, Stance: {result['stance']}")
    print(f"Best DBCV Score: {result['dbcv_score']}")
    print(f"Best Hyperparameters: {result['best_hyperparameters']}")
    print("-" * 50)

# Calculate and display the average DBCV score across all topic-stance combinations
if dbcv_scores:
    average_dbcv_score = sum(dbcv_scores) / len(dbcv_scores)
    print(f"\nAverage DBCV Score across all topic-stance combinations: {average_dbcv_score:.4f}")
else:
    print("No DBCV scores were obtained.")
