In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
from pathlib import Path

# Directory that holds the input CSV files (train.csv and val.csv are read
# from it below); the path is relative to the current working directory.
DATA = Path('data')

## Data Cleaning
- 将所有文本转为小写。
- 移除URL、@用户提及、`#`符号（话题词本身保留）、标点符号和数字，只保留英文字母和空白字符。

In [2]:
# --- 1. Load data ---
# Read the training and validation splits from the DATA directory, then
# preview the first rows of the training split as the cell output.
print("正在加载数据...")
train_df, val_df = (
    pd.read_csv(DATA / csv_name) for csv_name in ('train.csv', 'val.csv')
)
train_df.head()

正在加载数据...


Unnamed: 0,id,text,label,event
0,536844329000906752,Nice to see @MaryLaneWSJ's scoop confirmed: Sw...,1,0
1,536847375802052609,"Nearly 3 years of silence, transparency re #Gu...",1,0
2,536831367959420929,Swiss Museum Will Accept Gurlitt Art Trove - A...,1,0
3,536825153372831744,The Kunstmuseum Bern agrees to takes the contr...,1,0
4,536827886926565376,Bern Fine Arts Museum says yes to controversia...,1,0


In [3]:
# Fill possible NaN values.
# Missing tweet text becomes an empty string so later string operations
# never receive a float NaN.
train_df['text'] = train_df['text'].fillna('')
val_df['text'] = val_df['text'].fillna('')


def _event_as_category(events):
    """Return `events` as uniform string labels, mapping missing values to 'unknown'.

    Filling a numeric column with the string 'unknown' would leave a column
    that mixes numbers and strings, which OneHotEncoder refuses to fit.
    Converting every value to a string keeps the column homogeneous.

    Args:
        events: pandas Series of event identifiers (numeric or string),
            possibly containing NaN/None.

    Returns:
        A Series of the same length and index whose values are all strings.
    """
    def _label(value):
        if pd.isna(value):
            return 'unknown'
        # A numeric column that contained NaN is stored as float; render 3.0
        # as '3' so that train and val produce the same label for the same
        # event no matter which split had missing values.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    return events.map(_label)


# Re-running this cell is safe: values that are already strings map to themselves.
train_df['event'] = _event_as_category(train_df['event'])
val_df['event'] = _event_as_category(val_df['event'])


In [4]:
# --- 2. Text preprocessing function ---
# Named patterns used by preprocess_text, applied in this order.
_URL_PATTERN = re.compile(r'http\S+|www\S+')  # 'http\S+' also matches 'https...'
_MENTION_OR_HASH_PATTERN = re.compile(r'\@\w+|\#')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Clean a single piece of tweet text.

    Steps, in order:
      1. lower-case the text;
      2. delete URLs (any run of non-space characters starting with
         'http' or 'www');
      3. delete @mentions entirely and delete the '#' character (the word
         after '#' is kept);
      4. delete every character that is not a-z or whitespace (digits,
         punctuation, non-ASCII letters).

    Deleted characters are not replaced by spaces and whitespace is not
    collapsed, so the result may contain consecutive spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as missing.

    Returns:
        The cleaned string; ``''`` for missing input.

    Raises:
        AttributeError: If `text` is any other non-string object.
    """
    # Missing values would otherwise fail on `.lower()`; NaN is the only
    # float for which `x != x` holds.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_OR_HASH_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)
    return text


In [5]:
print("正在预处理文本...")
# Add a `clean_text` column to each split by running every tweet through
# preprocess_text, then preview the training split.
for frame in (train_df, val_df):
    frame['clean_text'] = frame['text'].apply(preprocess_text)
train_df.head()

正在预处理文本...


Unnamed: 0,id,text,label,event,clean_text
0,536844329000906752,Nice to see @MaryLaneWSJ's scoop confirmed: Sw...,1,0,nice to see s scoop confirmed swiss museum acc...
1,536847375802052609,"Nearly 3 years of silence, transparency re #Gu...",1,0,nearly years of silence transparency re gurli...
2,536831367959420929,Swiss Museum Will Accept Gurlitt Art Trove - A...,1,0,swiss museum will accept gurlitt art trove ab...
3,536825153372831744,The Kunstmuseum Bern agrees to takes the contr...,1,0,the kunstmuseum bern agrees to takes the contr...
4,536827886926565376,Bern Fine Arts Museum says yes to controversia...,1,0,bern fine arts museum says yes to controversia...


In [6]:
print("正在进行特征工程...")
# 3.1 Text features (TF-IDF)
# The vectorizer is fitted on the training split only and then reused to
# transform the validation split, so both matrices share one feature space.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
train_corpus, val_corpus = train_df['clean_text'], val_df['clean_text']
X_train_text = tfidf_vectorizer.fit_transform(train_corpus)
X_val_text = tfidf_vectorizer.transform(val_corpus)


正在进行特征工程...


In [7]:
# 3.2 Event categorical feature (one-hot encoding)
# As with the text features, the encoder is fitted on the training split and
# then applied to both splits. handle_unknown='ignore' lets the validation
# split contain event values that were not seen during fitting.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder.fit(train_df[['event']])
X_train_event = onehot_encoder.transform(train_df[['event']])
X_val_event = onehot_encoder.transform(val_df[['event']])

In [8]:
# 3.3 Combine features
# Stack the sparse blocks side by side: TF-IDF columns first, followed by the
# event one-hot columns, in the same order for both splits.
X_train_combined, X_val_combined = (
    hstack(blocks)
    for blocks in (
        [X_train_text, X_train_event],
        [X_val_text, X_val_event],
    )
)

In [9]:
# Prepare the target labels for each split.
y_train, y_val = train_df['label'], val_df['label']

In [10]:
# --- 4. Model training ---
# Logistic regression on the combined sparse features; random_state is fixed
# so repeated runs use the same seed.
print("正在训练逻辑回归模型...")
model = LogisticRegression(
    C=5,
    solver='liblinear',
    random_state=19260817,
)
model.fit(X_train_combined, y_train)

正在训练逻辑回归模型...


In [11]:
# --- 5. Model evaluation ---
# Predict labels for the validation split and compute overall accuracy.
print("正在评估模型...")
y_pred = model.predict(X_val_combined)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

正在评估模型...


In [12]:
print(f"\n模型在验证集上的准确率: {accuracy:.4f}")
print("\n分类报告:")
print(classification_report(y_val, y_pred))


模型在验证集上的准确率: 0.8370

分类报告:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       226
           1       0.86      0.75      0.80       179

    accuracy                           0.84       405
   macro avg       0.84      0.83      0.83       405
weighted avg       0.84      0.84      0.84       405

