In [4]:
import pandas as pd
from tqdm import tqdm
import os

# === File paths ===
# input_path: CSV of tweets loaded by this cell.
# output_path: destination for the length-filtered tweets.
# top_users_path: destination for the per-user peak daily volume table.
input_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_03.csv"
output_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_04.csv"
# NOTE(review): this folder is "tweets_posts" while the two paths above use
# "twitter_posts" — confirm this is intentional and not a typo.
top_users_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\tweets_posts\high_volume_users.csv"

# === Ensure output directories exist ===
# Create the parent folder of every file written later; exist_ok makes re-runs harmless.
for _out_file in (output_path, top_users_path):
    os.makedirs(os.path.dirname(_out_file), exist_ok=True)

# === Load dataset ===
print("📥 Loading dataset...")
df = pd.read_csv(input_path)
# Baseline row count, taken before any cleaning, used later for the summary.
original_count = df.shape[0]
print(f"📊 Original tweet count: {original_count:,}")

# === Drop missing or empty text ===
print("🧹 Removing empty tweets...")
# First discard rows whose text is missing (NaN/None) ...
df = df[df["text"].notna()]
# ... then discard rows whose text is empty or whitespace-only.
df = df.loc[df["text"].str.strip() != ""]

# === Calculate word count with tqdm progress ===
print("🔢 Calculating tweet lengths...")
tqdm.pandas()  # registers .progress_apply on pandas objects
# A "word" is any whitespace-delimited token; str() keeps non-string cells
# from raising inside split().
df["text_length"] = df["text"].progress_apply(lambda text: len(str(text).split()))

# === Filter tweets with 10–50 words ===
print("🔍 Filtering tweets (10–50 words)...")
# between() is inclusive on both ends by default, i.e. 10 <= length <= 50.
df = df[df["text_length"].between(10, 50)]
filtered_count = len(df)
# original_count was taken before any cleaning, so this figure also includes
# rows dropped for missing/empty text, not only the length filter.
removed_count = original_count - filtered_count
# Guard against an input file with zero rows, which would otherwise raise
# ZeroDivisionError here.
removed_pct = removed_count / original_count * 100 if original_count else 0.0

# === Convert datetime if needed ===
# Parse to timezone-aware UTC timestamps, then derive the calendar date
# (in UTC) used as the daily grouping key.
df["datetime"] = pd.to_datetime(df["datetime"], utc=True)
df["date"] = df["datetime"].dt.date

# === Count tweets per user per day ===
print("📊 Counting tweets per user per day...")
# One row per (username, date) pair with the number of tweets in that pair.
tweet_volume = (
    df.groupby(["username", "date"])
    .size()
    .reset_index(name="tweets_per_day")
)

# === Top tweeting users by max tweets/day ===
# For each user keep the busiest single day, ordered from most to least active.
top_users = (
    tweet_volume.groupby("username")["tweets_per_day"]
    .max()
    .sort_values(ascending=False)
    .reset_index(name="max_daily_tweets")
)

# === Save results ===
# Write the per-user table first, then the filtered tweets; neither file
# carries the DataFrame index.
top_users.to_csv(top_users_path, index=False)
df.to_csv(output_path, index=False)

# === Summary ===
# Single print call; sep="\n" puts each argument on its own line.
print(
    "\n📊 Tweet Count Summary",
    "---------------------------------",
    f"Original tweets:   {original_count:,}",
    f"After filtering:   {filtered_count:,}",
    f"Removed tweets:    {removed_count:,} ({removed_pct:.2f}%)",
    "---------------------------------",
    f"💾 Saved filtered tweets to: {output_path}",
    f"📈 Saved top users to:       {top_users_path}",
    sep="\n",
)


📥 Loading dataset...
📊 Original tweet count: 13,660,364
🧹 Removing empty tweets...
🔢 Calculating tweet lengths...


100%|██████████| 13660364/13660364 [00:48<00:00, 284522.51it/s]


🔍 Filtering tweets (10–50 words)...
📊 Counting tweets per user per day...

📊 Tweet Count Summary
---------------------------------
Original tweets:   13,660,364
After filtering:   11,041,029
Removed tweets:    2,619,335 (19.17%)
---------------------------------
💾 Saved filtered tweets to: C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_04.csv
📈 Saved top users to:       C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\tweets_posts\high_volume_users.csv
