In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
import re

# Initialise the VADER sentiment analyser (shared by all scoring code below).
# NOTE(review): presumably requires the NLTK "vader_lexicon" resource to be
# downloaded beforehand - confirm in the environment setup.
analyzer = SentimentIntensityAnalyzer()

# Extended crypto-specific lexicon.
#
# Maps a lower-case term to a sentiment valence (positive = bullish,
# negative = bearish).  The dict is merged into the VADER lexicon further
# down, where entries override any valence VADER already has for the term.
#
# NOTE(review): VADER appears to look up whitespace-separated tokens one at a
# time, so the multi-word keys below ("to the moon", "all time high",
# "lambo time", "diamond hands", "golden cross", "bear market", "rug pull",
# "death cross", "smart contract", "yield farming", "hash rate") probably
# never match on their own - confirm against NLTK's implementation.
custom_crypto_lexicon = {
    # Market sentiment - strongly positive (roughly 2.0 to 3.5)
    "bullish": 2.5, "mooning": 3.0, "to the moon": 3.0, "moon": 2.5, "ath": 2.5, "all time high": 2.5,
    "bullrun": 2.8, "moonshot": 3.0, "lambo": 2.5, "lambo time": 3.0, "diamond hands": 2.0,
    "hodl": 2.0, "pump": 1.8, "rally": 2.0, "surge": 2.3, "soar": 2.5, "rocket": 2.5,
    "breakthrough": 2.0, "breakout": 2.0, "explosion": 2.0, "explode": 2.0, "skyrocket": 3.0,
    "outperform": 2.0, "outperforming": 2.0, "gainz": 2.5, "gains": 2.0, "profit": 2.0, "profitable": 2.0,
    "winner": 2.0, "winning": 2.0, "victory": 2.0, "success": 1.8, "successful": 1.8,

    # Market sentiment - moderately positive (roughly 1.0 to 1.9)
    "buy": 1.5, "buying": 1.5, "bought": 1.0, "accumulate": 1.2, "accumulation": 1.2,
    "long": 1.0, "green": 1.5, "support": 1.0, "supported": 1.0, "supports": 1.0,
    "strong": 1.5, "strength": 1.5, "strengthen": 1.5, "strengthening": 1.5,
    "opportunity": 1.5, "potential": 1.0, "promising": 1.5, "rise": 1.5, "rising": 1.5,
    "gain": 1.5, "gaining": 1.5, "growth": 1.5, "growing": 1.5, "grow": 1.5,
    "rebound": 1.5, "recovery": 1.2, "recover": 1.2, "recovering": 1.2, "comeback": 1.5,
    "institutional": 1.0, "adoption": 1.5, "adopt": 1.2, "adopting": 1.2, "mainstream": 1.5,
    "invest": 1.0, "investment": 1.0, "investing": 1.0, "investor": 0.8, "investors": 0.8,
    "whale": 1.0, "whales": 1.0, "hold": 0.8, "holding": 0.8, "holder": 0.8, "holders": 0.8,
    "optimism": 1.5, "optimistic": 1.5, "confidence": 1.2, "confident": 1.2,
    "future": 0.5, "progress": 1.0, "progressive": 1.0, "innovate": 1.0, "innovation": 1.0,
    "upgrade": 1.2, "upgraded": 1.2, "update": 0.8, "updated": 0.8, "updates": 0.8,

    # Technical indicators - positive
    "golden cross": 2.0, "oversold": 1.5, "bottom": 1.0, "bottomed": 1.0, "bottoming": 1.0,
    "undervalued": 1.5, "discount": 1.0, "discounted": 1.0, "bargain": 1.2,
    "consolidation": 0.8, "consolidating": 0.8, "consolidate": 0.8, "stable": 0.5, "stability": 0.5,

    # Regulatory - positive
    # ("adoption" is already defined in the moderately positive group above;
    # the repeated key that used to live here has been removed.)
    "legal": 1.0, "legalization": 1.2, "approved": 1.5, "approval": 1.5, "etf": 1.0,
    "accept": 1.0, "accepted": 1.0, "accepting": 1.0, "regulation": 0.5,  # can be either positive or negative

    # Market sentiment - strongly negative (roughly -2.0 to -3.5)
    "bearish": -2.5, "bear": -2.0, "bear market": -2.5, "crash": -3.0, "crashed": -3.0, "crashing": -3.0,
    "dump": -2.5, "dumping": -2.5, "dumped": -2.5, "rugpull": -3.5, "rug pull": -3.5, "scam": -3.0,
    "scammed": -3.0, "ponzi": -3.0, "fraud": -3.0, "fraudulent": -3.0, "rekt": -3.0, "wrecked": -3.0,
    "collapse": -2.8, "collapsed": -2.8, "collapsing": -2.8, "plummet": -3.0, "plummeting": -3.0,
    "tank": -2.5, "tanking": -2.5, "tanked": -2.5, "disaster": -2.5, "catastrophe": -3.0,
    "meltdown": -2.8, "bubble": -2.0, "burst": -2.5, "bursting": -2.5, "capitulation": -2.5,

    # Market sentiment - moderately negative (roughly -1.0 to -1.9)
    "sell": -1.5, "selling": -1.5, "sold": -1.2, "short": -1.5, "shorting": -1.5, "red": -1.2,
    "loss": -1.8, "losses": -1.8, "losing": -1.8, "lost": -1.5, "loser": -1.5, "lose": -1.5,
    "drop": -1.5, "dropping": -1.5, "dropped": -1.5, "fall": -1.5, "falling": -1.5, "fell": -1.5,
    "decline": -1.5, "declining": -1.5, "declined": -1.5, "decrease": -1.2, "decreasing": -1.2,
    "weak": -1.2, "weakness": -1.2, "weaken": -1.2, "weakening": -1.2, "pressure": -1.0,
    "correction": -1.5, "correcting": -1.5, "fear": -1.8, "fearful": -1.8, "panic": -2.0, "panicking": -2.0,
    "problem": -1.0, "problematic": -1.0, "trouble": -1.5, "worried": -1.5, "worry": -1.5, "concerning": -1.2,
    "risk": -1.0, "risky": -1.2, "danger": -1.5, "dangerous": -1.5, "unstable": -1.2, "instability": -1.2,
    "volatile": -1.0, "volatility": -1.0, "uncertainty": -1.5, "uncertain": -1.5,

    # Technical indicators - negative
    "resistance": -1.0, "overbought": -1.5, "death cross": -2.0, "top": -1.0, "topped": -1.0, "topping": -1.0,
    "overvalued": -1.5, "expensive": -1.0, "fib": -0.5,  # Fibonacci, often used in a bearish context

    # Regulatory - negative
    "ban": -2.5, "banned": -2.5, "banning": -2.5, "illegal": -2.0, "prohibited": -2.0,
    "crackdown": -2.0, "sec": -0.8, "lawsuit": -1.8, "sue": -1.8, "investigation": -1.5,

    # Crypto-specific jargon
    "fud": -1.8,  # Fear, Uncertainty, Doubt
    "dyor": 0.5,  # Do Your Own Research (slightly positive)
    "btfd": 1.5,  # Buy The F*** Dip (positive)
    "ngmi": -1.5,  # Not Gonna Make It (negative)
    "wagmi": 2.0,  # We're All Gonna Make It (positive)
    "shitcoin": -2.0,
    "altcoin": 0.2,  # neutral to slightly positive
    "defi": 1.0,  # Decentralized Finance (positive)
    "nft": 0.8,
    "dao": 0.8,
    "stablecoin": 0.5,
    "smart contract": 1.0,
    "airdrop": 1.5,
    "staking": 1.2,
    "yield": 1.0,
    "yield farming": 1.0,
    "liquidity": 0.8,
    "whitepaper": 0.5,
    "ico": 0.0,  # neutral (can be either positive or negative)
    "ieo": 0.5,
    "wallet": 0.3,
    "exchange": 0.2,
    "halving": 1.5,
    "hash rate": 0.5,
    "miner": 0.3,
    "mining": 0.3,
}

# Merge the custom crypto entries into the analyser's lexicon; terms that
# already exist there take the custom valence.
analyzer.lexicon.update(custom_crypto_lexicon)

# Extended dynamic combinations: word pairs that earn an extra boost when both
# words occur within 4 tokens of each other (checked in either order by
# check_dynamic_pairs).
positive_pairs = [
    # Bullish combinations
    ("buy", "bitcoin"), ("buy", "btc"), ("buy", "crypto"), ("buy", "dip"), ("buying", "opportunity"),
    ("bullish", "bitcoin"), ("bullish", "btc"), ("bullish", "crypto"), ("bullish", "market"), ("bullish", "trend"),
    ("moon", "soon"), ("to", "moon"), ("going", "moon"), ("price", "moon"), ("bitcoin", "moon"),
    ("hold", "bitcoin"), ("hold", "btc"), ("hodl", "btc"), ("hodl", "bitcoin"), ("diamond", "hands"),
    ("strong", "support"), ("higher", "high"), ("higher", "low"), ("good", "news"), ("great", "news"),
    ("long", "term"), ("golden", "cross"), ("bottom", "in"), ("price", "target"), ("price", "prediction"),
    ("next", "bull"), ("bull", "run"), ("bull", "market"), ("bull", "cycle"), ("bitcoin", "etf"),
    ("crypto", "adoption"), ("institutional", "investment"), ("institutional", "investor"), ("whale", "buying"),
    ("breakout", "resistance"), ("btc", "dominance"), ("massive", "gain"), ("huge", "gain"), ("record", "high"),
    
    # Technical and fundamental positive combinations
    ("oversold", "condition"), ("buy", "zone"), ("accumulation", "zone"), ("forming", "bottom"),
    ("double", "bottom"), ("bullish", "divergence"), ("bullish", "pattern"), ("bullish", "flag"),
    ("ascending", "triangle"), ("higher", "timeframe"), ("long", "position"), ("green", "candle"),
    ("strong", "fundamental"), ("good", "project"), ("solid", "fundamental"), ("real", "use"),
    ("great", "team"), ("strong", "community"), ("important", "update"), ("successful", "upgrade"),
    ("new", "partnership"), ("new", "listing"), ("new", "exchange"), ("great", "opportunity"),
]

# Word pairs that earn a negative boost under the same proximity rule.
negative_pairs = [
    # Bearish combinations
    ("sell", "bitcoin"), ("sell", "btc"), ("sell", "crypto"), ("selling", "pressure"), ("time", "sell"),
    ("bearish", "bitcoin"), ("bearish", "btc"), ("bearish", "crypto"), ("bearish", "market"), ("bearish", "trend"),
    ("price", "crash"), ("market", "crash"), ("bitcoin", "crash"), ("crypto", "crash"), ("market", "dump"),
    ("bitcoin", "dump"), ("crypto", "dump"), ("price", "dump"), ("going", "zero"), ("bubble", "burst"),
    ("lower", "low"), ("lower", "high"), ("bad", "news"), ("negative", "news"), ("death", "cross"),
    ("bear", "market"), ("bear", "trend"), ("bear", "trap"), ("bull", "trap"), ("fake", "rally"),
    ("distribution", "pattern"), ("head", "shoulders"), ("double", "top"), ("resistance", "rejected"),
    ("failed", "breakout"), ("market", "manipulation"), ("price", "manipulation"), ("pump", "dump"),
    
    # Regulatory and risk-related negative combinations
    ("sec", "lawsuit"), ("sec", "investigation"), ("regulatory", "concern"), ("government", "ban"),
    ("china", "ban"), ("exchange", "hack"), ("wallet", "hack"), ("security", "breach"),
    ("scam", "project"), ("scam", "coin"), ("ponzi", "scheme"), ("fraud", "project"), ("rug", "pull"),
    ("high", "risk"), ("too", "risky"), ("stay", "away"), ("losing", "money"), ("lost", "money"),
    ("avoid", "investment"), ("not", "sustainable"), ("bubble", "pop"), ("market", "correction"),
    ("severe", "correction"), ("panic", "sell"), ("fear", "uncertainty"), ("massive", "selloff"),
    ("whales", "dumping"), ("miners", "selling"), ("forced", "liquidation"), ("margin", "call")
]

# Score known word pairs that co-occur inside a sliding window of tokens
def check_dynamic_pairs(tokens, window_size=4, *, positive=None, negative=None):
    """Return the proximity boost for known word pairs found in *tokens*.

    Every token is paired with each of the next ``window_size`` tokens.  A
    pair that appears (in either order) in the positive table adds 2.0 to the
    score; otherwise, a pair that appears (in either order) in the negative
    table subtracts 2.0.  The positive table wins when a pair is in both.

    Args:
        tokens: Sequence of already normalised (lower-cased) word tokens.
        window_size: How many following tokens each token is paired with.
            Defaults to 4; values below 1 yield a score of 0.
        positive: Iterable of 2-tuples to treat as positive pairs.  Defaults
            to the module-level ``positive_pairs``.
        negative: Iterable of 2-tuples to treat as negative pairs.  Defaults
            to the module-level ``negative_pairs``.

    Returns:
        The summed boost: 0 when nothing matched, otherwise a multiple of 2.0.
    """
    if positive is None:
        positive = positive_pairs
    if negative is None:
        negative = negative_pairs

    # Build order-insensitive hash sets once per call so each candidate pair
    # costs a single O(1) lookup instead of up to four linear list scans.
    positive_lookup = set()
    for first, second in positive:
        positive_lookup.add((first, second))
        positive_lookup.add((second, first))
    negative_lookup = set()
    for first, second in negative:
        negative_lookup.add((first, second))
        negative_lookup.add((second, first))

    score = 0
    for i, word in enumerate(tokens):
        # The slice clamps itself at the end of the token list.
        for neighbour in tokens[i + 1:i + 1 + window_size]:
            candidate = (word, neighbour)
            if candidate in positive_lookup:
                score += 2.0  # positive boost
            elif candidate in negative_lookup:
                score -= 2.0  # negative boost

    return score

# Tokenisation helper (emoji handling has been removed)
def tokenize_text(text):
    """Lower-case *text* and split it into word tokens.

    The value is passed through ``str()`` first, so non-string input is
    tokenised from its string representation.  Tokens are the runs of regex
    word characters; punctuation and whitespace are discarded.

    Returns:
        A list of lower-cased tokens, empty when no word characters exist.
    """
    lowered = str(text).lower()
    return re.findall(r'\b\w+\b', lowered)

# Compute the sentiment score with the crypto-tuned VADER analyser
def calculate_crypto_sentiment(text, pair_weight=0.1):
    """Return a crypto-aware sentiment score in the range [-1, 1].

    The score is VADER's ``compound`` value for *text* plus the word-pair
    boost from ``check_dynamic_pairs`` scaled by *pair_weight*, clamped to
    [-1, 1].

    Args:
        text: The tweet text.  NaN/None and empty or whitespace-only strings
            score 0.0.  Any other non-string value is converted with
            ``str()`` so the analyser and the tokenizer see the same text.
        pair_weight: Multiplier applied to the word-pair boost.  The default
            of 0.1 keeps the boost from overwhelming the VADER score.

    Returns:
        The combined score, never below -1 or above 1.
    """
    if pd.isna(text):
        return 0.0  # missing value

    # Coerce once: tokenize_text already works on str(text), while the
    # analyser was previously handed the raw value.
    # NOTE(review): NLTK's analyser appears to reject non-string input such as
    # numbers - confirm against the installed version.
    text = str(text)
    if not text.strip():
        return 0.0  # nothing to score

    vader_score = analyzer.polarity_scores(text)['compound']
    dynamic_score = check_dynamic_pairs(tokenize_text(text))

    # Down-weight the pair boost, then clamp the sum back into [-1, 1].
    combined = vader_score + dynamic_score * pair_weight
    return max(-1.0, min(1.0, combined))

# File locations: input is the cleaned tweet CSV, output is the same data with
# the crypto sentiment column added.
input_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_17_vader.csv"
output_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_18_vadercrypto.csv"

# Load the data
print("Indlæser data...")
data = pd.read_csv(input_path)

# Apply the crypto sentiment function to every row of the 'text_cleaned'
# column (the column must exist in the input file); tqdm.pandas() enables
# progress_apply, which shows a progress bar while scoring.
print("Beregner kryptosentiment...")
tqdm.pandas(desc="Beregner kryptosentiment")
data['crypto_sentiment_score'] = data['text_cleaned'].progress_apply(calculate_crypto_sentiment)

# Save the data including the new sentiment score (without the index column)
print("Gemmer resultater...")
data.to_csv(output_path, index=False)
print(f"✅ Fil med crypto sentiment score gemt som: {output_path}")

In [None]:
import pandas as pd

# File locations: input is the scored tweet CSV, output is the subset that
# ends on 2022-02-28.
# NOTE(review): this reads "..._20_vadercrypto.csv" while the scoring cell
# above writes "..._18_vadercrypto.csv" - presumably intermediate steps exist
# elsewhere; confirm the intended input file.
input_path = r"c:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_20_vadercrypto.csv"
output_path = r"c:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_20_vadercrypto_14month.csv"

# Load the data
print("Indlæser data...")
data = pd.read_csv(input_path)

# Parse the 'datetime' column and drop the timezone.  utc=True converts
# timezone-aware values to UTC and interprets naive values as UTC, so the
# result is always tz-aware and tz_convert(None) cannot fail on naive or
# mixed-offset input; the stored values are naive UTC timestamps.
data['datetime'] = pd.to_datetime(data['datetime'], utc=True).dt.tz_convert(None)

# Keep tweets from the start up to and including 2022-02-28.  The comparison
# uses an exclusive bound at the following midnight so that timestamps with
# fractional seconds in the last second of the day are kept as well.
cutoff_date = pd.Timestamp("2022-02-28 23:59:59")
end_exclusive = cutoff_date.normalize() + pd.Timedelta(days=1)
filtered_data = data[data['datetime'] < end_exclusive]

# Save the filtered data set (without the index column)
filtered_data.to_csv(output_path, index=False)
print(f"✅ Filtreret data gemt som: {output_path}")
