- Just run the second block of code.

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models and the English stop-word list.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Shared text-processing helpers: a Porter stemmer and the English stop-word set.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_tweet(text):
    """Normalise a raw tweet into lower-case plain text.

    Steps, in order: lower-case the text, strip ``http(s)://`` and ``www.``
    links, strip ``@mentions``, drop the ``#`` sign (the hashtag word itself
    is kept), remove every character that is neither a word character nor
    whitespace, and collapse runs of whitespace into single spaces.

    Args:
        text: The tweet text. Other non-missing values are converted with
            ``str()``; ``None`` and float NaN are treated as an empty tweet.

    Returns:
        The cleaned text, or ``""`` when the input is missing.
    """
    # A missing value would otherwise be stringified into the fake word
    # "none"/"nan" and end up in the output as if it were tweet content.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Applied strictly in this order: links must go before punctuation is
    # stripped, otherwise the URL pieces would survive as ordinary words.
    substitutions = (
        (r"http[s]?://\S+", ""),  # http:// and https:// links
        (r"www\.\S+", ""),        # bare www. links
        (r"@\w+", ""),            # @mentions
        (r"#", ""),               # '#' sign only; the hashtag text stays
        (r"[^\w\s]", ""),         # punctuation and other symbols
        (r"\s+", " "),            # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Load the raw tweet export. The file name is spelled exactly as in the
# filtered pipeline's read_csv call later in this notebook; the two cells
# previously differed only in letter case, which breaks on case-sensitive
# filesystems.
# NOTE(review): confirm this spelling matches the file on disk.
df = pd.read_csv("Nvidia-Tweets.csv")

# Add the cleaned text next to the original "Text" column.
df["clean_tweet"] = df["Text"].apply(clean_tweet)
# Keep only the first 10 characters of the timestamp string
# (presumably the YYYY-MM-DD part -- verify against the raw data).
df["Datetime"] = df["Datetime"].astype(str).str.slice(0, 10)

df.to_csv("nvidia_tweets_cleaned.csv", index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\81065\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\81065\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


- Most tweets are just spam or ads, so build a simple filter to get rid of them.

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used below. 'punkt_tab' is the tokenizer
# resource word_tokenize loads on recent NLTK releases (3.9 and later), while
# older releases load 'punkt'; requesting both keeps a fresh environment
# working on either. A package missing from the installed release's index is
# only reported in the log output.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# English stop-word set and Porter stemmer shared by clean_tweet.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Lower-case substrings that mark a tweet as advertising/spam.
# is_advertising() matches them against the lower-cased tweet text.
advertising_keywords = [
    # calls to action
    "learn trading",
    "get alerts",
    "get ideas",
    "get updates",
    "get analysis",
    # Discord invitations
    "welcome to discord",
    "welcome to the discord group",
    # crypto promotion
    "bitcoin",
    "crypto",
    # templated "analyst target" posts
    "top analyst price target",
    "top analyst target price",
    "top analyst target for next week",
    # emoji markers
    "🎯",
    "📈",
    "📉",
    "🚀",
]

def is_advertising(text: str) -> bool:
    """Tell whether a tweet looks like an advertisement.

    The text is lower-cased and flagged when it contains the substring
    ``https`` or any entry of ``advertising_keywords``.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example a
            missing value) is never flagged.

    Returns:
        True if the tweet is considered advertising, otherwise False.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Only the literal substring "https" is checked here, so plain
        # http:// links do not trigger this rule.
        if 'https' in lowered:
            return True
        for phrase in advertising_keywords:
            if phrase in lowered:
                return True
    return False

def clean_tweet(text: str) -> str:
    """Normalise a tweet, drop stop words and stem the remaining tokens.

    The text is lower-cased; ``http(s)://`` and ``www.`` links, ``@mentions``,
    the ``#`` sign (not the hashtag word) and every character that is neither
    a word character nor whitespace are removed; whitespace is collapsed. The
    result is tokenised with ``word_tokenize``, tokens found in ``stop_words``
    are discarded, and the rest are stemmed with ``stemmer``.

    Args:
        text: Raw tweet text. Other non-missing values are converted with
            ``str()``; ``None`` and float NaN are treated as an empty tweet.

    Returns:
        The stemmed tokens joined by single spaces, or ``""`` when the input
        is missing.
    """
    # is_advertising() lets non-string rows through the filter, so missing
    # values do reach this function; without this guard they would be
    # stringified and emitted as the bogus token "nan"/"none".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http[s]?://\S+", "", text)  # drop links
    text = re.sub(r"www\.\S+", "", text)        # drop links
    text = re.sub(r"@\w+", "", text)            # drop @mention
    text = re.sub(r"#", "", text)               # drop # but keep hashtag text
    text = re.sub(r"[^\w\s]", "", text)         # drop punctuation and emoji
    text = re.sub(r"\s+", " ", text).strip()    # collapse redundant whitespace

    tokens = word_tokenize(text)
    cleaned = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return " ".join(cleaned)

# Load the raw tweet export.
df = pd.read_csv("Nvidia-Tweets.csv")

# Drop every row whose text is flagged as advertising and report the counts.
before_count = len(df)
ad_mask = df["Text"].apply(is_advertising)
df = df[~ad_mask].copy()  # copy so the column assignments below act on an independent frame
after_count = len(df)
print(f"total：{before_count}，after filtering：{after_count}，drop：{before_count - after_count}")

# Add the cleaned/stemmed text, then reduce the timestamp to a calendar date;
# values that cannot be parsed are coerced to NaT instead of raising.
df["clean_tweet"] = df["Text"].apply(clean_tweet)
df["Datetime"] = pd.to_datetime(df["Datetime"], errors="coerce").dt.date

# Persist the filtered, cleaned tweets.
df.to_csv("nvidia_tweets_filtered_cleaned.csv", index=False)
print("save：nvidia_tweets_filtered_cleaned.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\81065\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\81065\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


total：100847，after filtering：18739，drop：82108
save：nvidia_tweets_filtered_cleaned.csv
