In [None]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
def clean_text(text):
    """Normalize a raw tweet before it is embedded.

    Removes URLs, @mentions, '#' characters, the standalone retweet marker
    ``RT`` and punctuation, then lower-cases and trims the result.

    Args:
        text: Tweet text. Missing values (None / NaN) yield ``""``; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r"http\S+|www\S+", '', text)   # URLs ("http\S+" also covers https)
    text = re.sub(r"@\w+", '', text)              # mentions
    text = re.sub(r"#", '', text)                 # drop '#', keep the hashtag word
    # Word boundaries keep the letters "RT" inside ordinary upper-case words
    # (e.g. "SUERTE", "START") from being deleted along with the retweet marker.
    text = re.sub(r"\bRT\b", '', text)
    text = re.sub(r"[^\w\s]", '', text)           # punctuation
    return text.lower().strip()

In [None]:
def get_top_10_recent_tweets(tweets_list):
    """Return at most the ten newest tweets from ``tweets_list``.

    A list holding ten items or fewer is handed back unchanged (same object,
    original order). A longer list is ordered by each item's ``'created_at'``
    value, newest first, and cut to its first ten entries.
    """
    limit = 10
    if len(tweets_list) > limit:
        newest_first = sorted(
            tweets_list,
            key=lambda tweet: tweet['created_at'],
            reverse=True,
        )
        return newest_first[:limit]
    return tweets_list

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from numpy.fft import rfft, rfftfreq

# Per-user temporal feature extraction
def extract_user_features(user_df):
    """Summarise one user's posting times as a flat dict of features.

    Args:
        user_df: Sized iterable of tweet records. Each record is indexed with
            ``['created_at']`` and that value is parsed by ``pd.to_datetime``.

    Returns:
        dict mapping feature name to value, always with the same keys in the
        same order. Hour/day/burst features are NaN when there are no tweets;
        gap features are NaN when there are too few gaps to compute them
        (fewer than 1 for the mean, 2 for spread/autocorrelation, 6 for the
        dominant frequency).
    """
    stamps = [record['created_at'] for record in user_df]
    times = pd.Series(pd.to_datetime(stamps)).sort_values()
    hours = times.dt.hour.values
    days = times.dt.dayofweek.values

    # Gaps between consecutive tweets, in minutes
    deltas = times.diff().dt.total_seconds().dropna() / 60.0

    has_tweets = len(hours) > 0
    n_gaps = len(deltas)

    features = {"n_tweets": len(user_df)}

    if has_tweets:
        # 1. mean and spread of the posting hour
        features["mean_hour"] = np.mean(hours)
        features["std_hour"] = np.std(hours)

        # 2. day of week
        features["mean_dayofweek"] = np.mean(days)
        features["std_dayofweek"] = np.std(days)

        # 3. weekend vs weekday (dayofweek 5 and 6)
        features["pct_weekend"] = np.mean(np.isin(days, [5, 6]))

        # 4. part of the day, in four 6-hour bands
        features["pct_night"] = np.mean((hours >= 0) & (hours < 6))
        features["pct_morning"] = np.mean((hours >= 6) & (hours < 12))
        features["pct_afternoon"] = np.mean((hours >= 12) & (hours < 18))
        features["pct_evening"] = np.mean((hours >= 18) & (hours < 24))

        # 5. entropy of the hour-of-day distribution
        hour_hist = np.bincount(hours, minlength=24)
        hist_total = hour_hist.sum()
        probs = hour_hist / hist_total if hist_total > 0 else np.zeros(24)
        features["entropy_hours"] = entropy(probs)

        # 6. largest number of tweets falling in the same clock hour (1h burst)
        per_hour = times.dt.floor("h").value_counts()
        features["max_tweets_per_hour"] = per_hour.max() if len(per_hour) > 0 else 0
    else:
        for name in (
            "mean_hour", "std_hour",
            "mean_dayofweek", "std_dayofweek",
            "pct_weekend",
            "pct_night", "pct_morning", "pct_afternoon", "pct_evening",
            "entropy_hours",
            "max_tweets_per_hour",
        ):
            features[name] = np.nan

    # 7. average pause between tweets, its spread and coefficient of variation
    mean_gap = std_gap = cv_gap = np.nan
    if n_gaps > 0:
        mean_gap = np.mean(deltas)
        if n_gaps > 1:
            std_gap = np.std(deltas)
            cv_gap = std_gap / (mean_gap + 1e-8)  # epsilon guards a zero mean
    features["mean_gap_min"] = mean_gap
    features["std_gap_min"] = std_gap
    features["cv_gap"] = cv_gap

    # 8. lag-1 autocorrelation of the gaps (left NaN when the gaps are constant)
    autocorr = np.nan
    if n_gaps > 1:
        centered = deltas - deltas.mean()
        if np.std(centered) > 0:
            autocorr = np.corrcoef(centered[:-1], centered[1:])[0, 1]
    features["gap_autocorr"] = autocorr

    # 9. Fourier transform of the gaps: frequency of the strongest non-DC bin
    dominant_freq = np.nan
    if n_gaps > 5:
        spectrum = np.abs(rfft(deltas - np.mean(deltas)))
        freqs = rfftfreq(n_gaps, 1)  # arbitrary unit (sample spacing of 1)
        dominant_freq = freqs[np.argmax(spectrum[1:]) + 1] if len(spectrum) > 1 else 0
    features["dominant_gap_freq"] = dominant_freq

    if has_tweets:
        # 10. circular encoding of the posting hour
        angles = 2 * np.pi * hours / 24
        features["mean_hour_sin"] = np.mean(np.sin(angles))
        features["mean_hour_cos"] = np.mean(np.cos(angles))

        # 13. burst activity: maximum number of tweets inside a 10-minute rolling window
        ones = pd.Series(1, index=times)  # dummy value of 1 per tweet
        features["max_tweets_10min"] = ones.rolling("10min").sum().max()
    else:
        features["mean_hour_sin"] = np.nan
        features["mean_hour_cos"] = np.nan
        features["max_tweets_10min"] = np.nan

    return features

In [None]:
# Sentence-transformer checkpoint used to embed the cleaned tweets
MODEL_NAME = 'arcos02/roberta-base-bne-finetuned-twitter_DANA2'

print("Loading sentence transformer model...")
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Authors already written out by an earlier pass of the loop below. Each author
# is handled once, with the first dataset they appear in acting as the "base".
processed_authors = set()

# Number of tweet_<i>.json files looked for under Datasets/
N_DATASETS = 9

# Process each dataset as the "first" (base) dataset
for first_dataset_idx in range(N_DATASETS):
    print(f"\n{'='*50}")
    print(f"Processing with dataset {first_dataset_idx} as base")
    print(f"{'='*50}")

    # A missing file is skipped rather than treated as an error
    file_path = f'Datasets/tweet_{first_dataset_idx}.json'
    if not os.path.exists(file_path):
        print(f"File {file_path} not found, skipping...")
        continue

    # 1. Read the base dataset and group its tweets by author
    print(f"Reading base dataset: tweet_{first_dataset_idx}.json")
    df_first = pd.read_json(file_path)
    print(f"Shape tweet_{first_dataset_idx}: {df_first.shape}")
    df_first['created_at'] = pd.to_datetime(df_first['created_at'])

    # author_id -> list of {'text', 'created_at'} records, restricted to authors
    # that no earlier pass has already saved
    author_dict = {}
    for author_id, text, created_at in zip(
        df_first['author_id'], df_first['text'], df_first['created_at']
    ):
        if author_id in processed_authors:
            continue
        author_dict.setdefault(author_id, []).append({
            'text': text,
            'created_at': created_at
        })

    # Dict insertion order == order of first appearance in the base dataset
    new_authors = list(author_dict)

    if not new_authors:
        print(f"No new authors found in dataset {first_dataset_idx}, skipping...")
        continue

    print(f"Found {len(new_authors)} new authors")

    # 2. Collect further tweets of these authors from the later datasets.
    #    Earlier datasets need no scan: an author seen there is already in
    #    processed_authors and was filtered out above.
    for other_idx in range(first_dataset_idx + 1, N_DATASETS):
        other_file = f'Datasets/tweet_{other_idx}.json'
        if not os.path.exists(other_file):
            continue

        print(f"Reading additional tweets from: tweet_{other_idx}.json")
        df_other = pd.read_json(other_file)
        df_other['created_at'] = pd.to_datetime(df_other['created_at'])

        # Add tweets only from authors already in our dict
        added_count = 0
        for author_id, text, created_at in zip(
            df_other['author_id'], df_other['text'], df_other['created_at']
        ):
            author_tweets = author_dict.get(author_id)
            if author_tweets is not None:
                author_tweets.append({
                    'text': text,
                    'created_at': created_at
                })
                added_count += 1

        print(f"  Added {added_count} tweets from existing authors")

    # 3. Per author: embed the 10 most recent tweets and extract temporal features
    print("Processing tweets and computing embeddings...")
    user_data = []        # one feature dict per author (rows of the mapping CSV)
    user_embeddings = []  # mean embedding per author, same order as user_data

    for author_id in tqdm(new_authors, desc="Computing embeddings"):
        author_tweets = author_dict[author_id]

        # Get top 10 recent tweets
        recent_tweets = get_top_10_recent_tweets(author_tweets)

        # Clean texts; a text that is empty after cleaning becomes a placeholder
        clean_texts = [clean_text(tweet['text']) for tweet in recent_tweets]
        clean_texts = [text if text.strip() else "empty" for text in clean_texts]

        # Compute embeddings and average them into one vector per author.
        # The vector is kept in its own list (not in the feature dict) so the
        # mapping CSV stays purely tabular while the .npy holds the matrix.
        embeddings = model.encode(clean_texts)
        user_embeddings.append(np.mean(embeddings, axis=0))

        # Temporal features use ALL tweets found for the author, not only the top 10
        features = extract_user_features(author_tweets)
        features.update({
            'author_id': author_id,
            'total_tweets_found': len(author_tweets),
        })
        user_data.append(features)

    # 4. Save results
    if user_data:
        print(f"Saving results for {len(user_data)} users...")

        # Row i of the matrix belongs to row i of the mapping CSV
        embeddings_matrix = np.vstack(user_embeddings)

        # Save embeddings
        embedding_file = f'Datasets/user_embeddings_{first_dataset_idx}.npy'
        np.save(embedding_file, embeddings_matrix)

        # Create and save user mapping
        mapping_df = pd.DataFrame(user_data)
        mapping_file = f'Datasets/user_mapping_{first_dataset_idx}.csv'
        mapping_df.to_csv(mapping_file, index=False)

        # Mark these authors as done only after both files were written
        processed_authors.update(new_authors)

        print(f"✓ Saved {embedding_file} - Shape: {embeddings_matrix.shape}")
        print(f"✓ Saved {mapping_file} - {len(user_data)} users")
        print(f"✓ Total authors processed so far: {len(processed_authors)}")

In [None]:
# Final summary once the processing loop has finished
total_authors = len(processed_authors)
print("\nAll processing complete!")
print(f"Total unique authors processed: {total_authors}")

In [None]:
# Re-assemble the per-dataset outputs into one embedding list and one user table.
emb_list = []     # one embedding matrix per dataset that produced output
user_frames = []  # matching user-mapping frames, same order as emb_list
for i in range(9):
    emb_path = f'Datasets/user_embeddings_{i}.npy'
    users_path = f"Datasets/user_mapping_{i}.csv"

    # The processing loop writes no files for a dataset it skipped (missing
    # input or no new authors), so absent outputs are expected, not an error.
    if not (os.path.exists(emb_path) and os.path.exists(users_path)):
        print(f"Outputs for dataset {i} not found, skipping...")
        continue

    emb = np.load(emb_path)
    users = pd.read_csv(users_path)
    # Rows are matched by position later on, so the two files must agree
    assert len(emb) == len(users), (
        f"dataset {i}: {len(emb)} embeddings vs {len(users)} mapping rows"
    )
    emb_list.append(emb)
    user_frames.append(users)

# Concatenate once instead of growing the frame inside the loop
df = pd.concat(user_frames, ignore_index=True) if user_frames else pd.DataFrame()

In [None]:
# Stack the per-dataset matrices row-wise. For a list of 2-D arrays np.vstack
# equals np.concatenate(..., axis=0); unlike concatenate it also leaves an
# already-stacked 2-D array unchanged, so re-running this cell does not
# flatten emb_list into a 1-D vector.
emb_list = np.vstack(emb_list)
df.shape, emb_list.shape

In [None]:
# Persist the combined user table and the stacked embeddings.
# The CSV is written with the default index column, which pandas reads back
# as 'Unnamed: 0'.
users_csv_path = 'Datasets/v2_top10_embeddings_users.csv'
embeddings_npy_path = 'Datasets/v2_top10_embeddings.npy'
df.to_csv(users_csv_path)
np.save(embeddings_npy_path, emb_list)

In [None]:
# Preview of `users`; at this point it holds only the last per-dataset
# mapping that was read, not the combined table.
users.head()

In [None]:
# Reload the combined user table from disk
df = pd.read_csv('Datasets/v2_top10_embeddings_users.csv')

In [None]:
# Label and split tables; both carry an 'id' column used for joining below
labels = pd.read_csv('Datasets/label.csv')
splits = pd.read_csv('Datasets/split.csv')

In [None]:
def add_u_prefix(author_id):
    """Return ``author_id`` in canonical ``'u<integer>'`` form.

    A value whose string form already starts with ``'u'`` has that prefix
    stripped before the remainder is parsed with ``int``; anything else is
    passed to ``int`` directly. Applying the function twice gives the same
    result as applying it once.
    """
    raw = str(author_id)
    numeric_id = int(raw[1:]) if raw.startswith('u') else int(author_id)
    return f"u{numeric_id}"


In [None]:
# Preview the user table before its ids are normalised
df.head()

In [None]:
# Preview the label table
labels.head()

In [None]:
# Preview the split table
splits.head()

In [None]:
# Bring the id column of every table to the same 'u<integer>' form so that
# the tables can be joined on it
for frame, id_column in ((df, 'author_id'), (labels, 'id'), (splits, 'id')):
    frame[id_column] = frame[id_column].apply(add_u_prefix)

In [None]:
# Row/column counts of the three tables, for comparison with the merged shapes below
df.shape, labels.shape, splits.shape

In [None]:
# Join labels and splits on their shared 'id' key (ids present in both only),
# then rename the key to 'author_id' to match the user table
info = (
    labels
    .merge(splits, how='inner', on='id')
    .rename(columns={'id': 'author_id'})
)
info.shape

In [None]:
# Preview the combined label/split table
info.head()

In [None]:
# Check that the user ids now have the same form as info['author_id']
df['author_id'].head()

In [None]:
# Attach label/split information; users without a row in `info` are dropped
df_merged = pd.merge(df, info, how='inner', on='author_id')

# 'Unnamed: 0' is the index column that appears when the user CSV was saved
# with its index. errors='ignore' keeps this cell working when the CSV on disk
# was already rewritten without that column.
df_merged = df_merged.drop(columns=['Unnamed: 0'], errors='ignore')

# The embedding matrix saved alongside the user CSV is matched to users by row
# position, so any change in row count here breaks that alignment.
if len(df_merged) != len(df):
    print(
        f"Warning: merge changed the row count from {len(df)} to {len(df_merged)}; "
        "rows no longer line up positionally with the saved embeddings"
    )

df_merged.shape

In [None]:
# Preview the merged user/label/split table
df_merged.head()

In [None]:
# Overwrites the user CSV that `df` was loaded from, now including the merged
# label/split columns and written without an index column
df_merged.to_csv('Datasets/v2_top10_embeddings_users.csv', index=False)

In [None]:
# Reload the merged table from disk to confirm what was written
df = pd.read_csv('Datasets/v2_top10_embeddings_users.csv')
df.head()

In [None]:
# Final number of users and columns
df.shape

In [None]:
# Median of the n_tweets feature across users
df['n_tweets'].median()