# k-Means Clustering for numerical features

In [21]:
import pandas as pd
import networkx as nx

In [22]:
df_posts = pd.read_json('../data/dataset.json')

In [23]:
df_posts['timestamp'] = pd.to_datetime(df_posts['timestamp'])

## Extract numeric features

### Temporal features

Number of posts over time

In [24]:
posts_per_user = df_posts.groupby('user')['timestamp'].count()

Time between consecutive posts

In [25]:
# Calculate the time difference between posts
df_posts['time_diff'] = df_posts.groupby('user')['timestamp'].diff()

# Convert time_diff to timedelta
df_posts['time_diff'] = pd.to_timedelta(df_posts['time_diff'], errors='coerce')

# Replace NaT with the median time difference
df_posts['time_diff'] = df_posts['time_diff'].fillna(df_posts['time_diff'].median())

In [26]:
# Convert time_diff to seconds
df_posts['time_diff_seconds'] = df_posts['time_diff'].dt.total_seconds()

# Drop the time_diff column
df_posts = df_posts.drop(columns=['time_diff'])

### Text-based features

Tweet length

In [27]:
df_posts['text_length'] = df_posts['text'].apply(len)

Word count

In [28]:
df_posts['word_count'] = df_posts['text'].apply(lambda x: len(x.split()))

Sentiment Analysis score

In [29]:
sentiment_df = pd.read_csv('../output/sentiment.csv')

sentiment_subset = sentiment_df[['text_id', 'sentiment_score']]

df_posts = pd.merge(df_posts, sentiment_subset, on='text_id', how='left')

Keyword count

Hashtag count

In [30]:
df_posts['hashtag_count'] = df_posts['text'].str.count('#')

Topics from BERTopic

In [31]:
topics_df = pd.read_csv('../output/posts_with_topics.csv')

# Select only 'text_id' and 'topic_label' from topics_df
topics_subset = topics_df[['text_id', 'topic_label']]

# Merge df_posts with the topics_subset on 'text_id'
df_posts = pd.merge(df_posts, topics_subset, on='text_id', how='left')

### Network Features

Follower count

In [32]:
# Load data
edges = pd.read_csv('../data/graph.csv')

# Create a graph
G = nx.from_pandas_edgelist(edges, source='source', target='target', edge_attr='weight')

# Calculate degrees
degrees = dict(G.degree())
degree_df = pd.DataFrame(degrees.items(), columns=['user', 'degree'])

Clustering Coefficient (how connected a user's neighbors are in the network)

In [33]:
clustering_coeff = nx.clustering(G)
clustering_df = pd.DataFrame(clustering_coeff.items(), columns=['user', 'clustering_coeff'])

Influence Score

In [34]:
pagerank = nx.pagerank(G)
pagerank_df = pd.DataFrame(pagerank.items(), columns=['user', 'pagerank'])

Betweenness Centrality

In [36]:
betweenness = nx.betweenness_centrality(G)
betweenness_df = pd.DataFrame(betweenness.items(), columns=['user', 'betweenness_centrality'])

KeyboardInterrupt: 

### Aggregate Features per User

Average Post Sentiment Per User

In [18]:
df_posts['avg_sentiment'] = df_posts.groupby('user')['sentiment_score'].transform('mean')

Variability in Posting Times

In [19]:
time_std = df_posts.groupby('user')['timestamp'].std().reset_index(name='time_std')

### Combined Features

Posting Behavior + Text Content: How does sentiment correlate with posting frequency or time?

Network + Text Analysis: Do influential users post more positive content?

## Normalize features

- Normalize derived numerical features for clustering 
- Use features for clustering(k-Means or hierarchical)