In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

# Load the pickled mapping of user -> list of comments.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("output/user_comments/all_users_1.pkl", "rb") as f:
    all_user_comments = pickle.load(f)

# Flatten every user's comments into one corpus for fitting.
all_comments = [
    comment
    for user in all_user_comments
    for comment in all_user_comments[user]
]

# Fit a lowercasing TF-IDF vectorizer on the whole corpus (fit returns the
# fitted estimator itself).
vectorizer = TfidfVectorizer(lowercase=True).fit(all_comments)

# Persist the fitted vectorizer, creating the output directory if needed.
os.makedirs("output/vectorizers", exist_ok=True)
with open("output/vectorizers/vectorizer_1.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def user_comment_similarity(user_comments, vectorizer=None):
    """Return the mean pairwise TF-IDF cosine similarity of a set of comments.

    Args:
        user_comments: Iterable of comment strings (raw text documents).
        vectorizer: Optional already-fitted vectorizer exposing ``transform``.
            When ``None``, a fresh ``TfidfVectorizer(lowercase=True)`` is
            fitted on ``user_comments`` alone.

    Returns:
        float: The average cosine similarity over all unordered pairs of
        distinct comments (the strict upper triangle of the similarity
        matrix). ``nan`` when fewer than two comments are given, because no
        pair exists to compare.

    Raises:
        ValueError: If ``user_comments`` is a single string rather than an
            iterable of strings.
    """
    if isinstance(user_comments, str):
        # A bare string would otherwise be split into characters below.
        raise ValueError(
            "Iterable over raw text documents expected, string object received."
        )

    # Materialize so generators can be counted before being vectorized.
    user_comments = list(user_comments)

    # With 0 or 1 comments there are no pairs; return NaN explicitly instead of
    # letting numpy take the mean of an empty slice (RuntimeWarning) or letting
    # the vectorizer fail on an empty corpus.
    if len(user_comments) < 2:
        return float("nan")

    if vectorizer is None:
        X = TfidfVectorizer(lowercase=True).fit_transform(user_comments)
    else:
        X = vectorizer.transform(user_comments)

    # Full pairwise cosine similarity matrix between the comments.
    sim_matrix = cosine_similarity(X)

    # k=1 skips the diagonal (self-similarity) and counts each pair only once.
    pair_rows, pair_cols = np.triu_indices(sim_matrix.shape[0], k=1)
    return float(sim_matrix[pair_rows, pair_cols].mean())

# Reload the pickled user -> comments mapping from disk.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("output/user_comments/all_users_1.pkl", "rb") as f:
    all_user_comments = pickle.load(f)

# Flattened corpus of every comment across all users.
all_comments = [
    comment
    for user in all_user_comments
    for comment in all_user_comments[user]
]

# Load the vectorizer that was pickled to output/vectorizers.
with open("output/vectorizers/vectorizer_1.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Score a single user's comments against each other.
user_id = "AutoModerator"
# Cap at the first 1000 comments to bound the size of the similarity matrix.
user_comments = all_user_comments[user_id][:1000]
# Only score users that have at least 5 comments.
if len(user_comments) >= 5:
    sim_score = user_comment_similarity(user_comments, vectorizer)
    print(f"User: {user_id}, Similarity: {sim_score}")

User: AutoModerator, Similarity: 0.6393157601835324


In [None]:
# Rank users by how many comments each has, largest count first.
user_comment_counts = sorted(
    ((user, len(comments)) for user, comments in all_user_comments.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the 20 users with the most comments.
for user, count in user_comment_counts[:20]:
    print(f"User: {user}, Number of comments: {count}")