In [1]:
import pandas as pd
import re

# Read News
df = pd.read_csv("archive/Combined_News_DJIA.csv")

# Combine Top1~Top25 as one day one word
df['Combined_News'] = df.iloc[:, 2:27].astype(str).apply(lambda row: ' '.join(row.values), axis=1)

# 清洗文字：小寫化 + 去除標點
def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    return text

df['Cleaned_News'] = df['Combined_News'].apply(clean_text)


In [2]:
sample_up = df[df['Label'] == 1].iloc[0]
sample_down = df[df['Label'] == 0].iloc[0]

print("=== Sample: Market Up ===")
print("Date:", sample_up['Date'])
print("Label:", sample_up['Label'])
print("Cleaned News:\n", sample_up['Cleaned_News'][:600], "...")

print("\n=== Sample: Market Down ===")
print("Date:", sample_down['Date'])
print("Label:", sample_down['Label'])
print("Cleaned News:\n", sample_down['Cleaned_News'][:600], "...")

=== Sample: Market Up ===
Date: 2008-08-11
Label: 1
Cleaned News:
 bwhy wont america and nato help us if they wont help us now why did we help them in iraq bbush puts foot down on georgian conflict bjewish georgian minister thanks to israeli training were fending off russia  bgeorgian army flees in disarray as russians advance  gori abandoned to russia without a shot fired bolympic opening ceremony fireworks faked bwhat were the mossad with fraudulent new zealand passports doing in iraq brussia angered by israeli military sale to georgia ban american citizen living in sossetia blames us and georgian leaders for the genocide of innocent people bwelcome to worl ...

=== Sample: Market Down ===
Date: 2008-08-08
Label: 0
Cleaned News:
 bgeorgia downs two russian warplanes as countries move to brink of war bbreaking musharraf to be impeached brussia today columns of troops roll into south ossetia footage from fighting youtube brussian tanks are moving towards the capital of south ossetia wh

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Cleaned_News'])
y = df['Label']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.49748743718592964
Confusion Matrix:
 [[ 43 142]
 [ 58 155]]
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.23      0.30       185
           1       0.52      0.73      0.61       213

    accuracy                           0.50       398
   macro avg       0.47      0.48      0.45       398
weighted avg       0.48      0.50      0.47       398

