In [None]:
import pandas as pd
import re
from collections import Counter
from tqdm import tqdm

# Location of the cleaned tweet CSV. A raw string is used so the backslashes
# in the Windows path are not interpreted as escape sequences.
path = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_06.csv"

# Read the CSV into a DataFrame and report the row count before filtering.
df = pd.read_csv(path)
print(f"Antal tweets før filtrering: {len(df)}")

# === DEFINE FILTER FUNCTIONS ===
def has_few_unique_words(text, threshold=5):
    """Return True when *text* contains fewer than *threshold* distinct words.

    The input is converted with ``str()`` and lower-cased, so the comparison
    is case-insensitive and non-string values are judged by their string
    representation. A "word" is a run of ``\\w`` characters.
    """
    lowered = str(text).lower()
    distinct_tokens = {m.group() for m in re.finditer(r'\b\w+\b', lowered)}
    return threshold > len(distinct_tokens)

def has_low_lexical_diversity(text, threshold=0.4):
    """Return True when the unique-to-total word ratio is at most *threshold*.

    The input is converted with ``str()`` and lower-cased before words (runs
    of ``\\w`` characters) are extracted. Text without any words is treated
    as low-diversity.
    """
    tokens = re.findall(r'\b\w+\b', str(text).lower())
    if len(tokens) == 0:
        # No words at all: nothing to measure, so flag it.
        return True
    diversity = len(set(tokens)) / len(tokens)
    return diversity <= threshold

def has_repeated_words(text, max_repeats=3):
    """Return True when any single word occurs more than *max_repeats* times.

    The input is converted with ``str()`` and lower-cased, so occurrences are
    counted case-insensitively. A "word" is a run of ``\\w`` characters.
    """
    tally = Counter(re.findall(r'\b\w+\b', str(text).lower()))
    for occurrences in tally.values():
        if occurrences > max_repeats:
            return True
    return False

def is_only_links_and_mentions(text):
    """Return True when nothing but whitespace is left after removing
    http(s) URLs and @mentions from the string form of *text*.
    """
    remainder = re.sub(r"(https?://\S+|@\w+)", "", str(text))
    return not remainder.strip()

# === FILTERING WITH PROGRESS BAR AND PERCENTAGE ===
total = len(df)
# Custom bar_format so the percentage is displayed with two decimals.
bar_fmt = '{l_bar}{bar} | {n_fmt}/{total_fmt} [{percentage:.2f}%]'

# Evaluate the filters on the 'text' column only and collect one keep/drop
# flag per row. Iterating the column instead of df.iterrows() avoids creating
# a Series per row and re-assembling the frame from those Series, which is
# slow, memory-hungry and does not preserve the column dtypes.
# A tweet is kept only when none of the filters flags it; the filters are
# evaluated in order and short-circuit on the first hit.
keep_flags = [
    not (
        has_few_unique_words(text)
        or has_low_lexical_diversity(text)
        or has_repeated_words(text)
        or is_only_links_and_mentions(text)
    )
    for text in tqdm(df['text'],
                     total=total,
                     desc="Filtrerer tweets",
                     bar_format=bar_fmt,
                     ncols=80)
]

# Positional boolean mask; the explicit bool dtype keeps it a valid mask even
# when df has no rows. Selecting with the mask keeps df's columns, dtypes and
# original index labels.
keep_mask = pd.Series(keep_flags, dtype=bool).to_numpy()
filtered_df = df[keep_mask]

# === SAVE FILTERED DATA ===
# Only swap a trailing ".csv" suffix: str.replace would also rewrite any
# ".csv" inside a directory name, and would return the input path unchanged
# (overwriting the source file) when the path has no ".csv" in it.
csv_suffix = ".csv"
if path.lower().endswith(csv_suffix):
    output_base = path[:-len(csv_suffix)]
else:
    output_base = path
output_path = f"{output_base}_filtered{csv_suffix}"
filtered_df.to_csv(output_path, index=False)

# === RESULT ===
print(f"\nAntal tweets efter filtrering: {len(filtered_df)}")
print(f"Filtreret data gemt til: {output_path}")


Antal tweets før filtrering: 10864787


Filtrerer tweets: 100%|███████████████████████████ | 10864787/10864787 [100.00%]



Antal tweets efter filtrering: 10102567
Filtreret data gemt til: C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\twitter_posts\bitcoin_tweets_cleaned_06_filtered.csv
