In [None]:
import pandas as pd
import spacy

In [4]:
# Read the cleaned NBC article table and preview its first five rows.
df = pd.read_csv("nbc_articles_cleaned.csv")
df.head(n=5)

Unnamed: 0,title,date,link,author,content,tokens,tokens_with_ngrams,clean_text
0,Cubs infielder Matt Shaw defends missing game ...,2025-09-24T16:54:50.717Z,https://www.nbcnews.com/news/us-news/cubs-matt...,Minyvonne Burke,Chicago Cubs infielder Matt Shaw said he thoug...,"['chicago', 'cubs', 'infielder', 'matt', 'shaw...","['chicago', 'cubs', 'infielder', 'matt', 'shaw...",chicago cubs infielder matt shaw think importa...
1,YouTube to start bringing back creators banned...,2025-09-24T16:54:08.647Z,https://www.nbcnews.com/tech/tech-news/youtube...,The Associated Press,YouTube will offer creators a way to rejoin th...,"['youtube', 'offer', 'creator', 'way', 'rejoin...","['youtube', 'offer', 'creator', 'way', 'rejoin...",youtube offer creator way rejoin stream platfo...
2,A trio of space weather satellites blast off t...,2025-09-24T14:15:16.424Z,https://www.nbcnews.com/science/science-news/t...,The Associated Press,"CAPE CANAVERAL, Fla. — A cluster of space weat...","['cape', 'canaveral', 'fla.', 'cluster', 'spac...","['cape', 'canaveral', 'fla.', 'cluster', 'spac...",cape canaveral fla. cluster space weather sate...
3,Trump administration rehires hundreds of feder...,2025-09-24T13:35:34.846Z,https://www.nbcnews.com/politics/trump-adminis...,The Associated Press,MIAMI — Hundreds of federal employees who lost...,"['miami', 'federal', 'employee', 'lose', 'job'...","['miami', 'federal', 'employee', 'lose', 'job'...",miami federal employee lose job elon_musk cost...
4,NASA introduces its newest astronauts,2025-09-23T13:15:55.028Z,https://www.nbcnews.com/science/science-news/n...,The Associated Press,"CAPE CANAVERAL, Fla. — NASA introduced its new...","['cape', 'canaveral', 'fla.', 'nasa', 'introdu...","['cape', 'canaveral', 'fla.', 'nasa', 'introdu...",cape canaveral fla. nasa introduce new astrona...


In [None]:
# ====================================================
# 2. DEFINE SPLIT RANGES
# ====================================================
# Each split is a (first day, last day) pair of datetime.date objects.
# get_split compares against them with <=, so both ends are inclusive.
TRAIN_START, TRAIN_END = (pd.to_datetime(day).date() for day in ("2011-12-01", "2023-12-31"))
VAL_START, VAL_END = (pd.to_datetime(day).date() for day in ("2024-01-01", "2024-04-14"))
TEST_START, TEST_END = (pd.to_datetime(day).date() for day in ("2024-04-15", "2025-04-14"))


In [None]:
# ====================================================
# 6. ASSIGN SPLIT LABEL
# ====================================================
def get_split(d):
    """Return the name of the split whose date window contains ``d``.

    ``d`` is compared against the module-level boundary constants
    (TRAIN_*, VAL_*, TEST_*); both ends of every window are inclusive.
    Windows are checked in the order train, val, test, and the first match
    wins.

    Returns "train", "val" or "test", or "exclude" when ``d`` falls in none
    of the windows.
    """
    windows = (
        ("train", TRAIN_START, TRAIN_END),
        ("val", VAL_START, VAL_END),
        ("test", TEST_START, TEST_END),
    )
    for label, first_day, last_day in windows:
        if first_day <= d <= last_day:
            return label
    return "exclude"

In [None]:
# ====================================================
# 1. LOAD SPACY MODEL (English)
# ====================================================
# Load the "en_core_web_sm" pipeline with its "ner" and "parser" components
# disabled. filter_pos only reads token.pos_ and token.text from the result,
# so those two components are skipped to save time.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # POS only, faster


In [None]:
# ====================================================
# 5. POS-TAGGING: KEEP ONLY NOUN, PROPER NOUN, PRONOUN, ADJECTIVE, VERB
# ====================================================
def filter_pos(text):
    """Return ``text`` reduced to the words whose POS tag is in the keep-list.

    ``text`` is converted with ``str()`` and run through the module-level
    ``nlp`` pipeline. Tokens tagged NOUN, PROPN, ADJ, VERB or PRON are kept,
    in their original order and original surface form (``token.text``), and
    joined with single spaces. All other tokens are dropped.
    """
    keep_tags = {"NOUN", "PROPN", "ADJ", "VERB", "PRON"}
    kept_words = []
    for token in nlp(str(text)):
        if token.pos_ in keep_tags:
            kept_words.append(token.text)
    return " ".join(kept_words)

1. Split Tesla articles data (`nbc_articles_cleaned.csv`) into train/val/test sets by date

In [None]:
# ====================================================
# 3. LOAD DATA
# ====================================================
df = pd.read_csv("nbc_articles_cleaned.csv")

# Reduce every timestamp to a datetime.date so articles can be grouped per day
# and compared with the split boundaries, which are datetime.date objects too.
df["date"] = pd.to_datetime(df["date"]).dt.date

# ====================================================
# 4. GROUP BY DATE: CONCATENATE CLEAN_TEXT
# ====================================================
# One row per day holding all of that day's article texts, space-separated.
# Missing texts are dropped before joining: astype(str) alone would turn NaN
# into the literal word "nan" and inject it into the day's text.
grouped_df = (
    df.groupby("date", as_index=False)
      .agg({
          "clean_text": lambda x: " ".join(x.dropna().astype(str))  # merge texts
      })
)

print(f"✅ Grouped by date — total unique days: {len(grouped_df)}")

# ====================================================
# 6. ASSIGN SPLIT LABEL
# ====================================================
grouped_df["split"] = grouped_df["date"].apply(get_split)

# ====================================================
# 7. KEEP ONLY RELEVANT ROWS AND COLUMNS
# ====================================================
# Days outside every split window are removed *before* POS tagging, so the
# slow spaCy pass only runs on rows that are actually saved. grouped_df keeps
# the untagged daily text; .copy() makes df_final independent of it.
df_final = grouped_df.loc[
    grouped_df["split"] != "exclude", ["date", "clean_text", "split"]
].copy()

# ====================================================
# 5. POS-TAGGING OF THE KEPT DAYS
# ====================================================
print("🔍 Running POS tagging and filtering... (this may take a few minutes)")
df_final["clean_text"] = df_final["clean_text"].apply(filter_pos)

# ====================================================
# 8. SAVE TO NEW FILE
# ====================================================
df_final.to_csv("nbc_articles_split.csv", index=False)

print("✅ Saved as 'nbc_articles_split.csv'")
print(df_final["split"].value_counts())


✅ Grouped by date — total unique days: 3198
🔍 Running POS tagging and filtering... (this may take a few minutes)
✅ Saved as 'nbc_articles_split.csv'
split
train    2033
test      310
val        71
Name: count, dtype: int64


2. Split market articles data (`market_articles_cleaned.csv`) into train/val/test sets by date

In [11]:
# ====================================================
# 3. LOAD DATA
# ====================================================
df = pd.read_csv("market_articles_cleaned.csv")

# Reduce every timestamp to a datetime.date so articles can be grouped per day
# and compared with the split boundaries, which are datetime.date objects too.
df["date"] = pd.to_datetime(df["date"]).dt.date

# ====================================================
# 4. GROUP BY DATE: CONCATENATE CLEAN_TEXT
# ====================================================
# One row per day holding all of that day's article texts, space-separated.
# Missing texts are dropped before joining: astype(str) alone would turn NaN
# into the literal word "nan" and inject it into the day's text.
grouped_df = (
    df.groupby("date", as_index=False)
      .agg({
          "clean_text": lambda x: " ".join(x.dropna().astype(str))  # merge texts
      })
)

print(f"✅ Grouped by date — total unique days: {len(grouped_df)}")

# ====================================================
# 6. ASSIGN SPLIT LABEL
# ====================================================
grouped_df["split"] = grouped_df["date"].apply(get_split)

# ====================================================
# 7. KEEP ONLY RELEVANT ROWS AND COLUMNS
# ====================================================
# Days outside every split window are removed *before* POS tagging, so the
# slow spaCy pass only runs on rows that are actually saved. grouped_df keeps
# the untagged daily text; .copy() makes df_final independent of it.
df_final = grouped_df.loc[
    grouped_df["split"] != "exclude", ["date", "clean_text", "split"]
].copy()

# ====================================================
# 5. POS-TAGGING OF THE KEPT DAYS
# ====================================================
print("🔍 Running POS tagging and filtering... (this may take a few minutes)")
df_final["clean_text"] = df_final["clean_text"].apply(filter_pos)

# ====================================================
# 8. SAVE TO NEW FILE
# ====================================================
df_final.to_csv("market_articles_split.csv", index=False)

print("✅ Saved as 'market_articles_split.csv'")
print(df_final["split"].value_counts())


✅ Grouped by date — total unique days: 6740
🔍 Running POS tagging and filtering... (this may take a few minutes)
✅ Saved as 'market_articles_split.csv'
split
train    3639
test      363
val       100
Name: count, dtype: int64
