In [None]:
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

# Both yearly dumps are parsed with the same options: rows are terminated by
# '\n' and the 'datetime' column is converted to timestamps.
_read_options = dict(lineterminator='\n', parse_dates=['datetime'])

tweets_2021 = pd.read_csv(
    r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin-tweets-2021.csv",
    **_read_options,
)
tweets_2022 = pd.read_csv(
    r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin-tweets-2022.csv",
    **_read_options,
)

# Stack the two years into one chronologically ordered frame. The 'date'
# column is dropped from the 2022 frame before concatenation.
tweets_2022 = tweets_2022.drop(columns=['date'])
tweets = pd.concat([tweets_2021, tweets_2022], axis=0)
tweets = tweets.sort_values(by='datetime').reset_index(drop=True)

# Release the per-year frames; only the merged frame is used from here on.
del tweets_2021, tweets_2022, _read_options
print(f"✅ Loaded {len(tweets):,} tweets.")

# Keep only tweets whose author has fewer than 4000 tweets in the merged data.
# Rows with a missing username get a NaN count, compare False, and are dropped.
tweets_per_user = tweets['username'].map(tweets['username'].value_counts())
tweets = tweets[tweets_per_user < 4000]
del tweets_per_user
print(f"✅ After removing spammy users: {len(tweets):,}")


# Regex fragments that mark a tweet as promotional noise. They are OR-ed
# together and searched case-insensitively anywhere in the tweet text, so a
# plain fragment matches as a substring (e.g. 'share' also hits "shares").
#
# Short fragments that also occur inside everyday words are anchored with \b
# so they only match as whole words (plus the listed inflections):
#   bet -> would otherwise hit "better", "between", "alphabet"
#   ido -> would otherwise hit "idol", "widow"
#   win -> would otherwise hit "following ... btc", "growing ... crypto"
noise_words = [
    # Giveaways, freebies, scams
    'freebitcoin', 'freeminingsoftware', 'freebitco', 'free bitcoin', 'free airdrop', 'airdrops?',
    'earnbitcoin', 'earn bitcoin', 'claim now', 'bonus', 'lottery', 'double your',
    'investment opportunity', 'financial freedom', 'limited time offer',
    r'\bwin(?:s|ning|ners?)?\b.*btc', r'\bwin(?:s|ning|ners?)?\b.*crypto',
    'easy money', 'risk[- ]?free', 'guaranteed returns', 'make money', 'makemoney', 'passive income',

    # Promo tools, tokens, betting, platforms
    'faucet', 'casino', r'\bbet(?:s|ting)?\b', 'sportsbook', 'nitrogensportsbook', 'footballcoin',
    'trading bot', 'tradingtool', 'trading tool', 'webbot', 'cloud mining', 'crypto mining',
    'mining platform', 'launchpad', 'presale', r'\bidos?\b', 'pre[- ]?sale', 'simplefx', 'mpgvip',

    # Hype terms
    'moonshot', 'moon.*coin', 'moon soon', '100x', 'signal', 'alert',
    'hodl to the moon', 'load up', 'entry point', 'insane gains',

    # Engagement bait
    'referral', 'retweet', 'share', 'join now', 'click here', 'check link', 'link in bio',
    'follow to win', 'dm me', 'telegram', 'discord', 'invite only', 'join telegram',

    # Crypto scam language
    'airdrop', 'nftdrop', 'giveawaybot', 'bscgem', 'pinksale', 'degen', 'scam alert',
    'pre-mine', 'pump group', 'trading group', 'private sale', 'exclusive invite',
    'project x', 'meme coin', 'shitcoin', 'rugpull',
]

# One alternation compiled once, so each tweet needs a single search call.
compiled_pattern = re.compile('|'.join(noise_words), flags=re.IGNORECASE)

def _has_noise(text):
    """Return True when the compiled noise pattern matches anywhere in str(text)."""
    return compiled_pattern.search(str(text)) is not None


# Flag every tweet that matches a noise pattern (with a tqdm progress bar),
# then keep only the unflagged rows.
print("🚀 Filtering tweets for noise words (this may take a few minutes)...")
mask = ~tweets['text'].progress_apply(_has_noise)
tweets = tweets.loc[mask]

print(f"✅ After filtering noise tweets: {len(tweets):,}")

import matplotlib.pyplot as plt

def _show_word_count_histogram(word_counts, bins, color, title):
    """Draw and show a 10x5 histogram of per-tweet word counts."""
    plt.figure(figsize=(10, 5))
    word_counts.hist(bins=bins, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel("Number of words")
    plt.ylabel("Tweet count")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


print("📏 Calculating word counts...")

# Number of whitespace-separated tokens in each tweet; values are coerced
# with str() before splitting.
tweets['word_count'] = tweets['text'].progress_apply(lambda raw: len(str(raw).split()))

# Distribution before the length filter.
_show_word_count_histogram(
    tweets['word_count'], 60, 'skyblue',
    "Histogram of tweet word counts (before filtering)",
)

# Keep tweets with 5 to 60 words (both bounds inclusive).
has_valid_length = tweets['word_count'].between(5, 60)
tweets = tweets[has_valid_length]
del has_valid_length

print(f"✅ After filtering by length: {len(tweets):,}")

# Distribution after the length filter.
_show_word_count_histogram(
    tweets['word_count'], 40, 'lightgreen',
    "Histogram of tweet word counts (after filtering)",
)

import regex as re

print("🧼 Cleaning tweets using GloVe-style method (this will take time)...")

# Flags applied to every substitution in clean_tweet_glove: '^'/'$' match at
# line breaks and '.' also matches newlines.
FLAGS = re.MULTILINE | re.DOTALL


def clean_tweet_glove(text):
    """Normalize one tweet for de-duplication and downstream text processing.

    Steps, in order: strip URLs and @mentions, pad slashes with spaces, remove
    numbers, drop '#' (keeping the hashtag word), shrink repeated '!', '?' or
    '.' to one, shorten characters repeated three or more times, drop the '$'
    of cashtags, pad short parenthesised words, remove '&amp;', collapse all
    whitespace runs to a single space, then lowercase and trim.

    Args:
        text: The raw tweet. Non-string values are coerced with str().

    Returns:
        The cleaned, lowercased string. Missing input (None or float NaN)
        yields "" rather than the literal text "none"/"nan".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", str(text), flags=FLAGS)  # URLs
    text = re.sub(r"@\w+", "", text, flags=FLAGS)  # Mentions
    text = re.sub(r"/", " / ", text, flags=FLAGS)  # Separate slash-joined words
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", text, flags=FLAGS)  # Numbers
    text = re.sub(r"#", "", text, flags=FLAGS)  # Hashtag marker only; the word stays
    text = re.sub(r"([!?.]){2,}", r"\1 ", text, flags=FLAGS)  # Repeated punctuation
    text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 ", text, flags=FLAGS)  # Elongated words
    text = re.sub(r"\$+([\w_]+[\w\'_\-]*[\w_]+)", r"\1", text, flags=FLAGS)  # Cashtags
    text = re.sub(r"\(([a-zA-Z<>]+)\)", r"( \1 )", text, flags=FLAGS)  # Pad parentheses
    text = re.sub(r"&amp;", "", text, flags=FLAGS)  # Escaped ampersand
    # The removals above leave gaps of arbitrary length. Collapse every run of
    # whitespace (spaces, tabs, newlines) in one pass; replacing "  " with " "
    # a single time would leave longer runs only halved.
    text = re.sub(r"\s+", " ", text, flags=FLAGS)
    return text.lower().strip()

# Store the normalized text in a new 'text_clean' column; the raw 'text'
# column is left unchanged.
tweets = tweets.assign(text_clean=tweets["text"].progress_apply(clean_tweet_glove))

print(" GloVe-style text cleaning complete.")

print("🧹 Removing duplicate tweets...")

# Tweets with identical cleaned text count as duplicates; the first
# occurrence is kept and the index is renumbered from 0.
tweets = tweets.drop_duplicates(subset=["text_clean"], keep="first")
tweets = tweets.reset_index(drop=True)

print(f"✅ After removing duplicates: {len(tweets):,}")

# Write the cleaned frame to CSV without the index column.
output_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned.csv"
tweets.to_csv(output_path, index=False)

print(f"📁 Cleaned dataset saved to:\n{output_path}")

