In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

# Read the pickled per-user comment collection from disk.
with open("output/user_comments/all_users_1.pkl", "rb") as f:
    all_user_comments = pickle.load(f)

# Flatten every user's comments into a single corpus for fitting.
all_comments = [
    text
    for per_user in all_user_comments.values()
    for text in per_user
]

# Fit one TF-IDF vocabulary/IDF table over the whole corpus (fit returns the estimator itself).
vectorizer = TfidfVectorizer(lowercase=True).fit(all_comments)

# Persist the fitted vectorizer so later cells can reload it instead of refitting.
os.makedirs("output/vectorizers", exist_ok=True)
with open("output/vectorizers/vectorizer_1.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

def user_comment_similarity(user_comments, vectorizer=None):
    """Return the mean pairwise cosine similarity between a set of comments.

    Each comment is turned into a TF-IDF vector, the full pairwise cosine
    similarity matrix is computed, and the entries strictly above the
    diagonal (every unordered pair exactly once, self-similarity excluded)
    are averaged.

    Parameters
    ----------
    user_comments : iterable of str
        The comments to compare.
    vectorizer : fitted vectorizer, optional
        Object exposing ``transform``. When omitted, a fresh
        ``TfidfVectorizer(lowercase=True)`` is fitted on ``user_comments``
        themselves.

    Returns
    -------
    float
        Mean similarity over all distinct comment pairs, or NaN when fewer
        than two comments were vectorized (no pair exists to average).
    """
    if vectorizer is None:
        # Imported locally so the function does not rely on another cell
        # having brought TfidfVectorizer into the kernel namespace.
        from sklearn.feature_extraction.text import TfidfVectorizer

        vectorizer = TfidfVectorizer(lowercase=True)
        X = vectorizer.fit_transform(user_comments)
    else:
        X = vectorizer.transform(user_comments)

    # With fewer than two rows the upper triangle is empty; averaging it
    # would emit a RuntimeWarning and yield NaN, so return NaN explicitly.
    n = X.shape[0]
    if n < 2:
        return float("nan")

    # Compute pairwise cosine similarity matrix
    sim_matrix = cosine_similarity(X)
    # Take upper triangle (excluding diagonal), flatten, and average
    upper_tri_indices = np.triu_indices(n, k=1)
    avg_similarity = sim_matrix[upper_tri_indices].mean()
    return avg_similarity

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files that this pipeline wrote itself.
with open("output/user_comments/all_users_1.pkl", "rb") as f:
    all_user_comments = pickle.load(f)
all_comments = [comment for comments in all_user_comments.values() for comment in comments]

with open("output/vectorizers/vectorizer_1.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Compute similarity for one user
user_id = "RedBaronsBrother"
user_comments_all = all_user_comments[user_id]
print(f"User: {user_id}, Total comments: {len(user_comments_all)}")

# Skip users with too few comments for the average to be meaningful.
if len(user_comments_all) >= 5:
    # Cap the sample so the pairwise similarity matrix stays small.
    sample_size = min(1000, len(user_comments_all))
    # Seeded generator so the sampled subset (and the printed score) is
    # reproducible across kernel restarts.
    rng = np.random.default_rng(42)
    # Sample positions rather than the strings themselves: passing a list of
    # str to np.random.choice first converts the whole list to a fixed-width
    # unicode array (every element padded to the longest comment) and hands
    # back numpy string scalars instead of plain str.
    # Assumes user_comments_all supports integer indexing (e.g. a list).
    sampled_positions = rng.choice(len(user_comments_all), size=sample_size, replace=False)
    user_comments = [user_comments_all[i] for i in sampled_positions]
    sim_score = user_comment_similarity(user_comments, vectorizer)
    print(f"User: {user_id}, Sampled {sample_size} comments, Similarity: {sim_score}")

User: RedBaronsBrother, Total comments: 22257
User: RedBaronsBrother, Sampled 1000 comments, Similarity: 0.047783727267191156


In [16]:
# Rank every user by how many comments they wrote, most prolific first.
user_comment_counts = sorted(
    ((user, len(comments)) for user, comments in all_user_comments.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print only the 20 users at the head of the ranking.
for user, count in user_comment_counts[:20]:
    print(f"User: {user}, Number of comments: {count}")

User: [deleted], Number of comments: 668051
User: AutoModerator, Number of comments: 147269
User: RedBaronsBrother, Number of comments: 22257
User: VegaThePunisher, Number of comments: 21638
User: IBiteYou, Number of comments: 20391
User: keypuncher, Number of comments: 13895
User: backpackwayne, Number of comments: 11347
User: kopskey1, Number of comments: 9504
User: michaelconfoy, Number of comments: 6576
User: Tampammm, Number of comments: 6354
User: Gsteel11, Number of comments: 6286
User: kerryfinchelhillary, Number of comments: 6180
User: raistlin65, Number of comments: 4895
User: therecordcorrected, Number of comments: 4476
User: bokono, Number of comments: 4383
User: The_seph_i_am, Number of comments: 3676
User: dolphins3, Number of comments: 3668
User: wethedownvoted, Number of comments: 3192
User: Btravelen, Number of comments: 3180
User: walter1950, Number of comments: 2942


In [18]:
# Keep only users with at least 5 comments and order them by comment count,
# fewest first (ascending).
user_comment_counts = sorted(
    (
        (user, len(comments))
        for user, comments in all_user_comments.items()
        if len(comments) >= 5
    ),
    key=lambda pair: pair[1],
)

# Print the first 20 entries, i.e. the users with the fewest comments
# among those that passed the filter.
for user, count in user_comment_counts[:20]:
    print(f"User: {user}, Number of comments: {count}")

User: mongrelized, Number of comments: 5
User: rapist666, Number of comments: 5
User: sanrabb, Number of comments: 5
User: ExoticKosher, Number of comments: 5
User: FisherOfMen, Number of comments: 5
User: jamesism, Number of comments: 5
User: wyboo1, Number of comments: 5
User: adlauren, Number of comments: 5
User: dreamweaver1984, Number of comments: 5
User: Logicator, Number of comments: 5
User: Morbidgrass, Number of comments: 5
User: engelk, Number of comments: 5
User: Rogue9162, Number of comments: 5
User: Noetic_Hatter, Number of comments: 5
User: LANshark, Number of comments: 5
User: yourmamasays, Number of comments: 5
User: seth556, Number of comments: 5
User: GeneralRobert, Number of comments: 5
User: frankyphillips, Number of comments: 5
User: Deracination, Number of comments: 5
