In [7]:
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')


[nltk_data] Downloading package stopwords to C:\Users\Kasper
[nltk_data]     Hassing\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Kasper
[nltk_data]     Hassing\AppData\Roaming\nltk_data...


True

In [10]:
import pandas as pd
from collections import Counter
from tqdm import tqdm
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Indlæs en mindre sample først (fx 100.000 rækker for hurtig test)
file_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_16_vader.csv"
data = pd.read_csv(file_path, nrows=8000000)  # Begrænset til 100.000 for hurtigere test

# Initialiser VADER
analyzer = SentimentIntensityAnalyzer()

# Brug Pandas direkte til tokenisering (hurtigere end NLTK)
stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    return [t for t in tokens if t not in stop_words]

# Saml statistik effektivt
all_words = Counter()
zero_score_words = Counter()
unrecognized_words = Counter()
positive_words = Counter()
negative_words = Counter()

# Drop NaN-værdier før iterering for at spare tid
data = data.dropna(subset=['text_cleaned', 'vader_sentiment_score'])

# Brug Pandas til direkte gruppering af scores
positive_count = len(data[data['vader_sentiment_score'] > 0])
negative_count = len(data[data['vader_sentiment_score'] < 0])
neutral_count = len(data[data['vader_sentiment_score'] == 0])
mean_score = data['vader_sentiment_score'].mean()
median_score = data['vader_sentiment_score'].median()

# Processér tweets
for text, score in tqdm(zip(data['text_cleaned'], data['vader_sentiment_score']), total=len(data), desc="Analyserer tweets"):
    tokens = tokenize(text)
    all_words.update(tokens)

    if score == 0.0:
        zero_score_words.update(tokens)
    elif score > 0:
        positive_words.update(tokens)
    elif score < 0:
        negative_words.update(tokens)

# Udskriv resultater
print(f"\n=== Sentimentstatistikker ===")
print(f"Gennemsnitlig VADER-score: {mean_score:.4f}")
print(f"Median VADER-score: {median_score:.4f}")
print(f"Antal positive tweets: {positive_count}")
print(f"Antal negative tweets: {negative_count}")
print(f"Antal neutrale tweets: {neutral_count}")

print(f"\n=== Hyppigste ord i alle tweets ===")
for word, freq in all_words.most_common(20):
    print(f"{word}: {freq}")

print(f"\n=== Hyppigste ord i tweets med score 0.0 ===")
for word, freq in zero_score_words.most_common(20):
    print(f"{word}: {freq}")

print(f"\n=== Hyppigste positive ord ===")
for word, freq in positive_words.most_common(20):
    print(f"{word}: {freq}")

print(f"\n=== Hyppigste negative ord ===")
for word, freq in negative_words.most_common(20):
    print(f"{word}: {freq}")


Analyserer tweets: 100%|██████████| 7853881/7853881 [01:33<00:00, 83729.01it/s]



=== Sentimentstatistikker ===
Gennemsnitlig VADER-score: 0.1193
Median VADER-score: 0.0000
Antal positive tweets: 3667190
Antal negative tweets: 1945961
Antal neutrale tweets: 2240730

=== Hyppigste ord i alle tweets ===
bitcoin: 8321574
crypto: 1202014
btc: 1145606
cryptocurrency: 582894
k: 539816
ethereum: 493780
like: 476330
buy: 468639
dont: 389732
eth: 361913
money: 349913
price: 349407
people: 348127
time: 340409
one: 324418
get: 319272
market: 317350
blockchain: 306759
would: 275108
new: 268604

=== Hyppigste ord i tweets med score 0.0 ===
bitcoin: 2392723
btc: 407436
crypto: 400951
cryptocurrency: 209325
ethereum: 185804
k: 176597
buy: 148092
eth: 138320
price: 118428
blockchain: 106431
market: 92239
time: 91891
new: 90749
via: 83895
money: 79638
dont: 75232
get: 75103
going: 74334
nft: 72857
think: 71722

=== Hyppigste positive ord ===
bitcoin: 3870064
crypto: 546052
btc: 491894
like: 384778
cryptocurrency: 268929
k: 224278
ethereum: 222507
good: 210175
buy: 208344
dont: 1892

In [12]:
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Indlæs datasættet
file_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_16_vader.csv"
data = pd.read_csv(file_path).sample(n=100_000, random_state=42)


# Initialiser VADER
analyzer = SentimentIntensityAnalyzer()

# Funktion til tokenisering
def tokenize(text):
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    return [t for t in tokens if t not in stopwords.words('english')]

# Ord-sentiment opsamling
word_scores = defaultdict(list)
neutral_words = Counter()

# Progress bar
for _, row in tqdm(data.iterrows(), total=len(data), desc="Analyserer tweets"):
    tokens = tokenize(row['text_cleaned'])
    score = row['vader_sentiment_score']
    for token in tokens:
        word_scores[token].append(score)
        if score == 0.0:
            neutral_words[token] += 1

# Beregn gennemsnitlig VADER-score for hvert ord
word_avg_scores = {word: sum(scores) / len(scores) for word, scores in word_scores.items()}
word_count = {word: len(scores) for word, scores in word_scores.items()}

# Skab DataFrame for sentiment-weighted ordscore
df_sentiment_weighted = pd.DataFrame.from_dict(word_avg_scores, orient='index', columns=['Avg_VADER_Score'])
df_sentiment_weighted['Count'] = df_sentiment_weighted.index.map(word_count)

# Sortér efter mest positive og mest negative
df_positive = df_sentiment_weighted[df_sentiment_weighted['Avg_VADER_Score'] > 0].sort_values(by='Avg_VADER_Score', ascending=False).head(20)
df_negative = df_sentiment_weighted[df_sentiment_weighted['Avg_VADER_Score'] < 0].sort_values(by='Avg_VADER_Score').head(20)

# Hyppige neutrale ord
df_neutral = pd.DataFrame(neutral_words.most_common(20), columns=['Word', 'Frequency'])

# Udskriv resultater
print("\n=== Mest positive ord (gennemsnitlig score) ===")
print(df_positive)

print("\n=== Mest negative ord (gennemsnitlig score) ===")
print(df_negative)

print("\n=== Mest hyppige ord i neutrale tweets (score 0.0) ===")
print(df_neutral)


Analyserer tweets: 100%|██████████| 100000/100000 [07:27<00:00, 223.53it/s]



=== Mest positive ord (gennemsnitlig score) ===
                                   Avg_VADER_Score  Count
ygifgsnngteogbvhxsvwlzgorbh                 0.9853      1
wellbe                                      0.9808      1
safelambo                                   0.9783      1
kewl                                        0.9772      1
bcqjwwhmjedcxtllxmlntkdvfjdghkwep           0.9763      1
waytomoon                                   0.9763      1
ldfi                                        0.9763      1
jobi                                        0.9758      1
seebsc                                      0.9758      1
qcbemdaqsmndeabnjhplomokxjzfs               0.9758      1
sooni                                       0.9758      1
moneyim                                     0.9758      1
nonadjusted                                 0.9753      1
grogu                                       0.9753      1
xdceaeecadae                                0.9738      1
twittwr                

In [15]:
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Indlæs datasættet
file_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_16_vader.csv"
data = pd.read_csv(file_path).sample(n=100_000, random_state=42)

# Initialiser VADER
analyzer = SentimentIntensityAnalyzer()

# Funktion til tokenisering
def tokenize(text):
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    return [t for t in tokens if t not in stopwords.words('english')]

# Saml statistik
word_scores = defaultdict(list)
neutral_words = Counter()
combined_phrases = Counter()

# Progress bar
for _, row in tqdm(data.iterrows(), total=len(data), desc="Analyserer tweets"):
    tokens = tokenize(row['text_cleaned'])
    score = row['vader_sentiment_score']

    for i, token in enumerate(tokens):
        word_scores[token].append(score)

        # Kombination af ordpar med "bitcoin"
        if token == "bitcoin":
            if i > 0:  # Ord før "bitcoin"
                phrase = f"{tokens[i-1]} bitcoin"
                combined_phrases[phrase] += 1
                word_scores[phrase].append(score)
            if i < len(tokens) - 1:  # Ord efter "bitcoin"
                phrase = f"bitcoin {tokens[i+1]}"
                combined_phrases[phrase] += 1
                word_scores[phrase].append(score)

        # Neutrale ord
        if score == 0.0:
            neutral_words[token] += 1

# Beregn gennemsnitlig VADER-score for hvert ord
word_avg_scores = {word: sum(scores) / len(scores) for word, scores in word_scores.items()}
word_count = {word: len(scores) for word, scores in word_scores.items()}

# Ufiltreret ordliste
df_sentiment_weighted_all = pd.DataFrame.from_dict(word_avg_scores, orient='index', columns=['Avg_VADER_Score'])
df_sentiment_weighted_all['Count'] = df_sentiment_weighted_all.index.map(word_count)

# Filtreret ordliste (mindst 100 forekomster)
filtered_scores = {word: score for word, score in word_avg_scores.items() if word_count[word] >= 100}
df_sentiment_weighted_filtered = pd.DataFrame.from_dict(filtered_scores, orient='index', columns=['Avg_VADER_Score'])
df_sentiment_weighted_filtered['Count'] = df_sentiment_weighted_filtered.index.map(word_count)

# Sortér begge efter mest positive og mest negative
df_positive_all = df_sentiment_weighted_all[df_sentiment_weighted_all['Avg_VADER_Score'] > 0].sort_values(by='Avg_VADER_Score', ascending=False).head(20)
df_negative_all = df_sentiment_weighted_all[df_sentiment_weighted_all['Avg_VADER_Score'] < 0].sort_values(by='Avg_VADER_Score').head(20)

df_positive_filtered = df_sentiment_weighted_filtered[df_sentiment_weighted_filtered['Avg_VADER_Score'] > 0].sort_values(by='Avg_VADER_Score', ascending=False).head(20)
df_negative_filtered = df_sentiment_weighted_filtered[df_sentiment_weighted_filtered['Avg_VADER_Score'] < 0].sort_values(by='Avg_VADER_Score').head(20)

# Mest brugte kombinerede bitcoin-ord
df_combined = pd.DataFrame(combined_phrases.most_common(20), columns=['Phrase', 'Frequency'])

# Udskriv resultater
print("\n=== Mest positive ord (uanset frekvens) ===")
print(df_positive_all)

print("\n=== Mest positive ord (mindst 100 forekomster) ===")
print(df_positive_filtered)

print("\n=== Mest negative ord (uanset frekvens) ===")
print(df_negative_all)

print("\n=== Mest negative ord (mindst 100 forekomster) ===")
print(df_negative_filtered)

print("\n=== Mest brugte kombinerede bitcoin-ord ===")
print(df_combined)


Analyserer tweets: 100%|██████████| 100000/100000 [07:13<00:00, 230.65it/s]



=== Mest positive ord (uanset frekvens) ===
                                           Avg_VADER_Score  Count
ygifgsnngteogbvhxsvwlzgorbh                         0.9853      1
wellbe                                              0.9808      1
safelambo                                           0.9783      1
kewl                                                0.9772      1
bitcoin bcqjwwhmjedcxtllxmlntkdvfjdghkwep           0.9763      1
ldfi                                                0.9763      1
waytomoon                                           0.9763      1
bcqjwwhmjedcxtllxmlntkdvfjdghkwep                   0.9763      1
bitcoin ldfi                                        0.9763      1
qcbemdaqsmndeabnjhplomokxjzfs                       0.9758      1
bitcoin loveyou                                     0.9758      1
moneyim                                             0.9758      1
sooni                                               0.9758      1
jobi                           

In [17]:
import pandas as pd
from tqdm import tqdm
import re

# Oprindelig og ny filsti
input_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_16_vader.csv"
output_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_17_vader.csv"

# Indlæs data
data = pd.read_csv(input_path)

# Valideringsfunktion (opdateret til max 15 tegn pr. ord)
def is_valid_tweet(text):
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    if any(len(word) > 15 for word in tokens):
        return False
    if len(tokens) < 10:
        return False
    return True

# Anvend filtrering med progress bar
tqdm.pandas(desc="Renser tweets (max 15 tegn)")
cleaned_data = data[data['text_cleaned'].progress_apply(is_valid_tweet)]

# Statistik
original_count = len(data)
cleaned_count = len(cleaned_data)
removed_count = original_count - cleaned_count

print(f"Antal tweets før rensning: {original_count}")
print(f"Antal tweets efter rensning: {cleaned_count}")
print(f"Antal fjernede tweets: {removed_count}")

# Gem til ny fil
cleaned_data.to_csv(output_path, index=False)
print(f"✅ Renset fil gemt som: {output_path}")


Renser tweets (max 15 tegn): 100%|██████████| 7853881/7853881 [01:02<00:00, 126514.32it/s]


Antal tweets før rensning: 7853881
Antal tweets efter rensning: 7102781
Antal fjernede tweets: 751100
✅ Renset fil gemt som: C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_17_vader.csv


In [19]:
import pandas as pd
import re
from tqdm import tqdm

# Emoji regex
emoji_pattern = re.compile(
    r"[\U0001F600-\U0001F64F]|"  # Emoticons
    r"[\U0001F300-\U0001F5FF]|"  # Symboler og piktogrammer
    r"[\U0001F680-\U0001F6FF]|"  # Transport og kortsymboler
    r"[\U0001F700-\U0001F77F]|"  # Diverse symboler
    r"[\U0001F780-\U0001F7FF]|"  # Geometriske symboler
    r"[\U0001F800-\U0001F8FF]|"  # Supplerende symboler
    r"[\U0001F900-\U0001F9FF]|"  # Ansigts-symboler og kropsdele
    r"[\U0001FA00-\U0001FA6F]|"  # Objekter og ting
    r"[\U0001FA70-\U0001FAFF]|"  # Diverse symboler og piktogrammer
    r"[\U00002702-\U000027B0]|"  # Andre symboler og pile
    r"[\U000024C2-\U0001F251]",  # Diverse symboler og pile
    flags=re.UNICODE
)

# Funktion til at tjekke for emojis
def contains_emoji(text):
    return bool(emoji_pattern.search(str(text)))

# Filsti til data
file_path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_17_vader.csv"

# Indlæs data
data = pd.read_csv(file_path)

# Anvend emoji-tjek
tqdm.pandas(desc="Tjekker for emojis")
data['contains_emoji'] = data['text_cleaned'].progress_apply(contains_emoji)

# Udskriv resultat
emoji_count = data['contains_emoji'].sum()
total_count = len(data)
print(f"Antal tweets med emojis: {emoji_count}")
print(f"Samlet antal tweets: {total_count}")
print(f"Andel tweets med emojis: {emoji_count / total_count:.2%}")


Tjekker for emojis: 100%|██████████| 7102781/7102781 [00:26<00:00, 270943.61it/s]

Antal tweets med emojis: 0
Samlet antal tweets: 7102781
Andel tweets med emojis: 0.00%



