In [None]:
import sys
import os
import pathlib
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sklearn.manifold import TSNE
import pandas as pd
import hdbscan
from sqlalchemy import func, Date
from db import models
import numpy as np
import plotly.express as px
import random

# Database configuration
# The connection URL can be overridden through the BAROMETRS_DATABASE_URL
# environment variable so that real credentials never have to be committed
# with the notebook. The fallback is the original local development URL.
SQLALCHEMY_DATABASE_URL = os.environ.get(
    "BAROMETRS_DATABASE_URL",
    "postgresql://barometrs:password@127.0.0.1:5433/barometrs",
)

engine = create_engine(SQLALCHEMY_DATABASE_URL)
Session = sessionmaker(bind=engine)
# A single session object shared by the query cells below.
session = Session()

In [None]:
def get_article_clusters(
    start_date='2020-01-01',
    article_language='lv',
    min_comments=10,
    min_cluster_size=100,
    min_samples=100,
    random_state=42,
):
    """Load commented articles from the database and cluster them with HDBSCAN.

    Articles are selected when they were published on or after ``start_date``,
    have a stored embedding, have a headline in ``article_language`` and have
    at least ``min_comments`` comments in that same language. Their embeddings
    are projected to 2D with t-SNE and HDBSCAN is run on that projection.

    Parameters
    ----------
    start_date : str
        Earliest publication date (compared against ``pub_timestamp`` cast to a date).
    article_language : str
        Language code required for both the headline and the counted comments.
    min_comments : int
        Minimum number of matching comments an article must have.
    min_cluster_size, min_samples : int
        Passed straight to ``hdbscan.HDBSCAN``.
    random_state : int
        Seed for the t-SNE projection.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``id``, ``article_title``, ``embedding`` and ``cluster``
        (``-1`` is HDBSCAN's label for noise). ``None`` when no article
        matches the filters.
    """
    # Presumably clears a failed transaction left on the shared session by an
    # earlier cell run, so that this query can execute.
    session.rollback()

    query = session.query(
        models.RawArticle.article_id.label('id'),
        models.RawArticle.headline.label('article_title'),
        models.RawArticle.embedding.label('embedding'),
    ).join(
        models.RawArticle.predicted_comments  # Join with comments
    ).filter(
        func.cast(models.RawArticle.pub_timestamp, Date) >= start_date,
        models.RawArticle.embedding != None,  # noqa: E711 - SQLAlchemy renders this as IS NOT NULL
        models.RawArticle.headline_lang == article_language,
        models.PredictedComment.text_lang == article_language,
    ).group_by(
        models.RawArticle.article_id,
        models.RawArticle.headline,
        models.RawArticle.embedding
    ).having(
        func.count(models.PredictedComment.id) >= min_comments,
    )

    results = query.all()

    print('Number of articles: ' + str(len(results)))

    if not results:
        return None

    articles_df = pd.DataFrame(
        results, columns=['id', 'article_title', 'embedding']
    ).drop_duplicates(subset=['id'])
    embeddings = np.array(articles_df['embedding'].tolist())

    # Reduce dimensionality for more efficient clustering
    tsne = TSNE(
        n_components=2,
        random_state=random_state,
        method='barnes_hut',
        n_jobs=-1
    )
    embeddings_2d = tsne.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        algorithm='boruvka_kdtree',
        core_dist_n_jobs=-1,
        cluster_selection_method='leaf'
    )
    clusters = clusterer.fit_predict(embeddings_2d)  # Cluster on the 2D projection

    # Return a new frame carrying the cluster label instead of writing a
    # column into a frame derived from another one.
    return articles_df.assign(cluster=clusters)

# get_article_clusters() returns None when no article matches its filters;
# the cells below call DataFrame methods on this value.
clustered_articles = get_article_clusters()
clustered_articles

In [None]:
# Work on a copy so the clustering result itself is never modified and this
# cell can be re-run safely.
articles_all = clustered_articles.copy()

# Extract embeddings from dataframe - they should already be stored as lists or arrays
embeddings = np.array(articles_all['embedding'].tolist())

# Generate t-SNE embeddings with the same parameters AND the same input rows
# that were used for clustering. t-SNE does not learn a reusable mapping:
# fitting it on a different subset (for example after removing noise points)
# produces a different layout, and the plotted clusters would then no longer
# match the space HDBSCAN actually worked in.
tsne = TSNE(
    n_components=2,
    random_state=42,
    method='barnes_hut',
    n_jobs=-1
)
embeddings_2d = tsne.fit_transform(embeddings)

# Add the t-SNE coordinates to the dataframe
articles_all['tsne_1'] = embeddings_2d[:, 0]
articles_all['tsne_2'] = embeddings_2d[:, 1]

# drop -1 cluster (noise) only after the projection has been computed;
# .copy() gives an independent frame so adding columns below is safe.
df = articles_all[articles_all['cluster'] != -1].copy()

# Convert cluster to string for better visualization
df['cluster_str'] = df['cluster'].astype(str)

# Interactive scatter of the 2D projection: one colour per cluster, with the
# headline and article id shown on hover.
fig_tsne = px.scatter(
    data_frame=df,
    x='tsne_1',
    y='tsne_2',
    color='cluster_str',
    hover_name='article_title',
    hover_data=['id'],
    labels={'tsne_1': 't-SNE Component 1', 'tsne_2': 't-SNE Component 2'},
    title='Article Clusters with t-SNE',
    size_max=10,
    opacity=0.7,
)

# Fixed canvas size and a readable legend heading
fig_tsne.update_layout(width=1000, height=800, legend_title_text='Cluster')

# Number of articles per cluster, largest cluster first
cluster_stats = (
    df.groupby('cluster')
    .agg(article_count=('id', 'count'))
    .sort_values('article_count', ascending=False)
)

# Bar chart of the cluster sizes, with the count printed above each bar
fig_bar = px.bar(
    data_frame=cluster_stats.reset_index(),
    x='cluster',
    y='article_count',
    color='cluster',
    text='article_count',
    labels={'cluster': 'Cluster', 'article_count': 'Number of Articles'},
    title='Articles per Cluster',
)

fig_bar.update_traces(textposition='outside')

# Function to get a random sample of articles from each cluster
def get_cluster_samples(df, n=5, random_state=42):
    """Return up to ``n`` randomly chosen rows from every cluster in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column.
    n : int
        Maximum number of rows to draw per cluster; smaller clusters
        contribute all of their rows.
    random_state : int or None
        Seed passed to ``DataFrame.sample`` so the selection is reproducible
        between runs. Pass ``None`` for a different draw on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped in ascending cluster order. An empty frame
        with the columns of ``df`` when ``df`` has no rows.
    """
    # Collect the per-cluster samples in a list and concatenate once, instead
    # of growing a DataFrame inside the loop.
    picks = [
        members.sample(min(n, len(members)), random_state=random_state)
        for _, members in df.groupby('cluster', sort=True)
    ]
    if not picks:
        # Keep the column layout so callers can still select columns.
        return df.iloc[0:0].copy()
    return pd.concat(picks)

# A few example articles per cluster (default n=5), displayed at the end of this cell.
cluster_samples = get_cluster_samples(df)

# Helper for drilling into one cluster interactively
def explore_cluster(df, cluster_id):
    """Print a short report for one cluster and return its scatter plot.

    Prints the number of articles in the cluster, displays the first ten
    (id, title) rows, and returns a plotly figure of the cluster's points in
    the t-SNE plane. ``df`` needs the columns ``cluster``, ``tsne_1``,
    ``tsne_2``, ``id`` and ``article_title``.
    """
    members = df.loc[df['cluster'] == cluster_id]
    print(f"Cluster {cluster_id} has {len(members)} articles")

    # Scatter plot restricted to this cluster's points
    cluster_fig = px.scatter(
        data_frame=members,
        x='tsne_1',
        y='tsne_2',
        hover_name='article_title',
        hover_data=['id'],
        labels={'tsne_1': 't-SNE Component 1', 'tsne_2': 't-SNE Component 2'},
        title=f'Articles in Cluster {cluster_id}',
    )

    # Table with the first rows of this cluster
    print("\nSample articles from this cluster:")
    preview = members[['id', 'article_title']].head(10)
    display(preview)

    return cluster_fig

# Filled, labelled contour plot of point density over the t-SNE plane
fig_density = px.density_contour(
    data_frame=df,
    x='tsne_1',
    y='tsne_2',
    title='Density Map of Articles in Embedding Space',
)
fig_density.update_traces(contours_showlabels=True, contours_coloring="fill")

# 3D view: the two t-SNE axes plus the numeric cluster label as height
fig_3d = px.scatter_3d(
    data_frame=df,
    x='tsne_1',
    y='tsne_2',
    z='cluster',
    color='cluster_str',
    hover_name='article_title',
    opacity=0.7,
    title='3D Visualization of Clusters',
)

def identify_cluster_boundaries(df):
    """Label every clustered article as a 'core' or 'boundary' point.

    For each cluster (noise label ``-1`` is skipped) the centroid of the
    t-SNE coordinates is computed, then each article's Euclidean distance to
    that centroid. Articles whose distance is at or below the cluster's 25th
    percentile are labelled ``'core'``; all others ``'boundary'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id``, ``article_title``, ``cluster``, ``tsne_1``
        and ``tsne_2``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``article_title``, ``cluster``,
        ``distance_to_centroid`` and ``point_type``, ordered by cluster with
        a fresh 0..n-1 index. Empty (same columns) when ``df`` contains no
        clustered rows.
    """
    columns = ['id', 'article_title', 'cluster', 'distance_to_centroid', 'point_type']

    # Skip noise points; order rows by cluster while keeping the original
    # order inside each cluster (mergesort is stable).
    keep = df['cluster'].notna() & (df['cluster'] != -1)
    members = (
        df[keep]
        .sort_values('cluster', kind='mergesort')
        .reset_index(drop=True)
    )
    if members.empty:
        return pd.DataFrame(columns=columns)

    # Distance of every point to the centroid of its own cluster, computed
    # for all rows at once instead of looping over rows.
    by_cluster = members.groupby('cluster')
    dx = members['tsne_1'] - by_cluster['tsne_1'].transform('mean')
    dy = members['tsne_2'] - by_cluster['tsne_2'].transform('mean')

    boundary_df = members[['id', 'article_title', 'cluster']].copy()
    boundary_df['distance_to_centroid'] = np.sqrt(dx ** 2 + dy ** 2)

    # Determine core vs boundary points (simplified approach): points within
    # the 25th percentile of their cluster's distances are core.
    thresholds = boundary_df.groupby('cluster')['distance_to_centroid'].quantile(0.25)
    is_core = boundary_df['distance_to_centroid'] <= boundary_df['cluster'].map(thresholds)
    boundary_df['point_type'] = np.where(is_core, 'core', 'boundary')

    return boundary_df

# Core/boundary label for every clustered article
boundary_df = identify_cluster_boundaries(df)

# One row of points per cluster: distance to the centroid on the x axis,
# coloured by whether the article is a core or a boundary point.
fig_boundary = px.scatter(
    data_frame=boundary_df,
    x='distance_to_centroid',
    y='cluster',
    color='point_type',
    hover_name='article_title',
    labels={
        'distance_to_centroid': 'Distance to Cluster Center',
        'cluster': 'Cluster ID',
        'point_type': 'Article Position',
    },
    title='Core vs Boundary Articles by Cluster',
    opacity=0.7,
)

# Show the summary tables first, then every figure built above
print("Distribution of articles across clusters:")
display(cluster_stats)

print("\nSample articles from each cluster:")
display(cluster_samples[['cluster', 'article_title']])

# Render the figures in the same order they were created
for figure in (fig_tsne, fig_bar, fig_density, fig_3d, fig_boundary):
    figure.show()

In [None]:
# Load every non-neutral 'lv' comment that belongs to an 'lv' article which
# has an embedding and was published on or after 2020-01-01.
raw_comments = session.query(
    models.PredictedComment.id.label('comment_id'),
    models.PredictedComment.article_id.label('article_id'),
    models.PredictedComment.text,
    models.PredictedComment.text_lang,
    models.PredictedComment.ekman_prediction_emotion.label('emotion'),
    models.PredictedComment.ekman_prediction_score.label('confidence'),
    models.PredictedComment.comment_timestamp,
    # PredictedComment.id is already exposed as 'comment_id' above, so this
    # column gets its own label to keep the result keys unambiguous.
    # NOTE(review): presumably the comment's identifier at the source - confirm.
    models.PredictedComment.comment_id.label('source_comment_id'),
).join(
    models.RawArticle,
    models.PredictedComment.article_id == models.RawArticle.article_id
).filter(
    func.cast(models.RawArticle.pub_timestamp, Date) >= '2020-01-01',
    models.RawArticle.embedding != None,  # noqa: E711 - SQLAlchemy renders this as IS NOT NULL
    models.RawArticle.headline_lang == 'lv',
    models.PredictedComment.text_lang == 'lv',
    models.PredictedComment.ekman_prediction_emotion != 'neutral',
).all()

In [None]:
# convert to dataframe
# The query returns two id columns (PredictedComment.id and
# PredictedComment.comment_id). They get distinct names here so that
# df_comments['comment_id'] is a single Series rather than a two-column frame.
df_comments = pd.DataFrame(
    raw_comments,
    columns=['comment_id', 'article_id', 'text', 'text_lang', 'emotion',
             'confidence', 'timestamp', 'source_comment_id'],
)

# assign cluster_id to comments (NaN when the article was not clustered)
df_comments['cluster_id'] = df_comments['article_id'].map(clustered_articles.set_index('id')['cluster'])

# drop comments without cluster_id or with cluster_id -1 (noise); .copy()
# yields an independent frame so the column assignments below are safe.
df_comments = df_comments[
    df_comments['cluster_id'].notna() & (df_comments['cluster_id'] != -1)
].copy()
# No NaN is left, so the labels can be stored as integers again.
df_comments['cluster_id'] = df_comments['cluster_id'].astype(int)

# add length of comment in whitespace-separated words (missing text counts as 0)
df_comments['length'] = df_comments['text'].fillna('').str.split().str.len()

# include_lowest keeps a value of exactly 0 in the first bin, and the open
# upper edge keeps comments longer than 200 words in 'very_long' instead of
# turning them into NaN (which would drop them from the stratified sampling).
df_comments['confidence_cat'] = pd.cut(
    df_comments['confidence'],
    bins=[0, 0.33, 0.66, 1],
    labels=['low', 'medium', 'high'],
    include_lowest=True,
)
df_comments['length_cat'] = pd.cut(
    df_comments['length'],
    bins=[0, 20, 50, 100, np.inf],
    labels=['short', 'medium', 'long', 'very_long'],
    include_lowest=True,
)


df_comments

In [None]:
def _stratified_take(emotion_df, take_count, random_state):
    """Draw ``take_count`` rows spread proportionally over (confidence_cat, length_cat) strata."""
    strata = emotion_df.groupby(['confidence_cat', 'length_cat'], observed=True)
    groups = [group for _, group in strata if len(group) > 0]
    total_in_strata = sum(len(group) for group in groups)

    # Proportional allocation, rounded up so every non-empty stratum is represented
    parts = []
    for group in groups:
        stratum_count = min(int(np.ceil(take_count * len(group) / total_in_strata)), len(group))
        parts.append(group.sample(stratum_count, random_state=random_state))
    sampled = pd.concat(parts) if parts else emotion_df.iloc[0:0]

    # Adjust to exactly take_count
    if len(sampled) > take_count:
        # Rounding up overshoots; trim back down at random
        sampled = sampled.sample(take_count, random_state=random_state)
    elif len(sampled) < take_count:
        # Rows whose categories are NaN belong to no stratum; use them to top up
        leftover = emotion_df[~emotion_df.index.isin(sampled.index)]
        if len(leftover) > 0:
            extra = leftover.sample(
                min(take_count - len(sampled), len(leftover)), random_state=random_state
            )
            sampled = pd.concat([sampled, extra]) if len(sampled) else extra
    return sampled


def sample_with_quota_by_cluster(df_comments, total_samples=2000, max_per_cluster=6, random_state=42):
    """
    Sample comments using a straightforward quota approach, working through clusters
    until we have the desired number of comments for each emotion.

    Clusters are visited in a seeded random order. From every cluster at most
    ``max_per_cluster`` comments are taken per emotion, stratified by
    ``confidence_cat`` and ``length_cat`` when both columns are present.

    Parameters:
    -----------
    df_comments : pandas DataFrame
        Dataframe containing comments with emotion, cluster_id, etc.
    total_samples : int
        Total number of comments to sample (default: 2000)
    max_per_cluster : int
        Maximum number of comments of one emotion taken from one cluster (default: 6)
    random_state : int or None
        Seed for the cluster order and every random draw (default: 42)

    Returns:
    --------
    pandas DataFrame
        Sampled comments with balanced emotion distribution, shuffled and
        re-indexed. Empty (same columns) when nothing could be sampled.
    """
    # Get unique emotions and set targets
    emotions = df_comments['emotion'].dropna().unique()
    target_per_emotion = total_samples // len(emotions) if len(emotions) else 0

    # Initialize tracking
    collected = {emotion: 0 for emotion in emotions}
    final_sample = []

    # Sorting by size gives a well-defined starting order; the seeded shuffle
    # then fixes the visiting order. A private RandomState yields the same
    # permutation as np.random.seed + np.random.shuffle but leaves the global
    # NumPy random state untouched.
    cluster_sizes = df_comments.groupby('cluster_id').size().sort_values(ascending=False)
    cluster_ids = cluster_sizes.index.tolist()
    np.random.RandomState(random_state).shuffle(cluster_ids)

    # Stratification needs both category columns
    can_stratify = {'confidence_cat', 'length_cat'}.issubset(df_comments.columns)

    # Iterate through clusters
    for cluster_id in cluster_ids:
        cluster_df = df_comments[df_comments['cluster_id'] == cluster_id]

        # For each emotion, take a small sample from this cluster
        for emotion in emotions:
            # Check if we still need more of this emotion
            remaining = target_per_emotion - collected[emotion]
            if remaining <= 0:
                continue

            # Get comments for this emotion in this cluster
            emotion_df = cluster_df[cluster_df['emotion'] == emotion]

            # Determine how many to take
            take_count = min(
                remaining,          # Don't exceed what we need
                len(emotion_df),    # Don't exceed what's available
                max_per_cluster     # Don't take too many from one cluster
            )
            if take_count <= 0:
                continue

            if can_stratify and len(emotion_df) > take_count:
                sampled = _stratified_take(emotion_df, take_count, random_state)
            else:
                # Simple random sampling (here this takes every available row)
                sampled = emotion_df.sample(take_count, random_state=random_state)

            # Add to our collection
            final_sample.append(sampled)
            collected[emotion] += len(sampled)

        # Check if we've hit all our targets
        if all(collected[emotion] >= target_per_emotion for emotion in emotions):
            break

    # Combine all samples and shuffle
    if final_sample:
        final_df = pd.concat(final_sample, ignore_index=True)
        final_df = final_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    else:
        # Nothing to sample: return an empty frame with the input's columns
        final_df = df_comments.iloc[0:0].reset_index(drop=True)

    # Print summary
    print("Sample summary:")
    print(f"Total comments: {len(final_df)}")
    for emotion in emotions:
        print(f"{emotion}: {collected[emotion]} comments")
    print(f"From {final_df['cluster_id'].nunique()} unique clusters")

    return final_df

# Draw the emotion-balanced sample with the default settings; a copy is passed in.
result = sample_with_quota_by_cluster(df_comments.copy())

In [None]:
# Display the sampled comments
result

In [None]:
# save to csv using absolute path

In [None]:
# Convert each comment timestamp to its calendar date (on a copy of the sample)
copy_result = result.assign(timestamp=pd.to_datetime(result['timestamp']).dt.date)

# Count comments per (date, emotion) pair
grouped = (
    copy_result
    .groupby(['timestamp', 'emotion'])
    .size()
    .rename('count')
    .reset_index()
)

# One line per emotion showing its daily comment count
fig = px.line(
    data_frame=grouped,
    x='timestamp',
    y='count',
    color='emotion',
    markers=True,
    labels={'timestamp': 'Date', 'count': 'Number of Comments'},
    title='Number of Comments per Day by Emotion',
)
fig.show()