In [1]:
import pandas as pd

df_train = pd.read_csv('./dataset/news/train.csv')
df_test = pd.read_csv('./dataset/news/test.csv')
print(df_train.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [2]:
from bs4 import BeautifulSoup
import re

features = []
for i, html_content in enumerate(df_train["Page content"][:100]):
    # 使用 BeautifulSoup 解析 HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # 提取作者
    article_info = soup.head.find('div', class_='article-info')
    author_name = article_info.find('span', class_='author_name')
    if author_name != None:
        author = author_name.text
    elif article_info.find('span') != None:
        author = article_info.find('span').text
    else:
        author = article_info.find('a').text
    
    author = re.sub(r'^[Bb]y\s+', "", author)
    authors = re.split(r', | and | & ', author)
    authors = [author.strip() for author in authors]
    
    # 提取時間
    time = soup.head.find('time')['datetime']
    # 使用正則表示式提取時間訊息
    match = re.search(r'(\w+), (\d+) (\w+) (\d+) (\d+):(\d+):(\d+)', time) # Wed, 19 Jun 2013 15:04:30 +0000
    if match:
        day_of_week, day, month, year, hour, minute, second = match.groups()
    else:
        print("No match found")

    day_of_week_map = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
    month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    
    day_of_week = day_of_week_map[day_of_week]
    month = month_map[month]
    

    # 提取標題
    title = soup.body.find('h1', class_='title').text

    # 提取频道
    channel = soup.body.find('article')['data-channel']

    # 提取主题
    topics = [a.text for a in soup.body.find('footer', class_='article-topics').find_all('a')]

    # 計算超連結数量
    num_links = len(soup.body.find_all('a', href=True))

    # 計算圖片數量
    num_images = len(soup.body.find_all('img', class_='microcontent'))

    # 提取内容長度
    content_len = len(soup.body.find('section', class_='article-content').text.strip())

    # 添加數據到DataFrame
    # features.append({'Title': title, 'Author': authors, 'Day of Week': day_of_week, 'Day': int(day), 'Month': month,
    #                 'Year': int(year), 'Hour': int(hour), 'Minute': int(minute), 'Second': int(second), 'Channel': channel,
    #                 'Topics': topics, 'Hyperlinks': num_links, 'Images': num_images, 'Content_len': content_len})
    
    features.append({'Day of Week': day_of_week, 'Day': int(day), 'Month': month,
                    'Year': int(year), 'Hour': int(hour), 'Minute': int(minute), 'Second': int(second), 'Channel': channel,
                    'Topics': topics, 'Hyperlinks': num_links, 'Images': num_images, 'Content_len': content_len})


In [3]:
df_X = pd.DataFrame(features)
df_y = df_train["Popularity"][:100]
print(df_X.shape)
print(df_y.shape)

(100, 12)
(100,)


In [4]:
df_X

Unnamed: 0,Day of Week,Day,Month,Year,Hour,Minute,Second,Channel,Topics,Hyperlinks,Images,Content_len
0,3,19,6,2013,15,4,30,world,"[Asteroid, Asteroids, challenge, Earth, Space,...",21,1,3588
1,4,28,3,2013,17,40,55,tech,"[Apps and Software, Google, open source, opn p...",16,1,1841
2,3,7,5,2014,19,15,20,entertainment,"[Entertainment, NFL, NFL Draft, Sports, Televi...",9,1,6638
3,5,11,10,2013,2,26,50,watercooler,"[Sports, Video, Videos, Watercooler]",11,0,1815
4,4,17,4,2014,3,31,43,entertainment,"[Entertainment, instagram, instagram video, NF...",14,1,8908
...,...,...,...,...,...,...,...,...,...,...,...,...
95,4,20,11,2014,3,54,30,world,"[Australia, brisbane, Climate, floods, Queensl...",35,1,2503
96,5,18,4,2014,16,22,5,world,"[archaeology, artifact, UK, World]",14,1,3179
97,4,17,1,2013,22,50,26,gaming,"[Entertainment, gallery, Gaming, iOS games, Te...",8,1,5105
98,5,3,1,2014,17,42,58,world,"[Space, snow, U.S., World, Climate]",39,1,4260


In [5]:
df_X["Topics"].unique()

TypeError: unhashable type: 'list'

In [None]:
from sklearn import preprocessing

labelen = preprocessing.LabelEncoder()
df_X["Author"] = labelen.fit_transform(df_X["Author"])

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['list']

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 劃分數據集為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)


# 方法1: 使用決策樹進行特徵選擇
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
feature_importance = clf.feature_importances_
print("Feature Importance (Decision Tree):")
print(feature_importance)

# 方法2: 使用SelectKBest和f_classif進行特徵選擇
selector = SelectKBest(score_func=f_classif, k=5)  # 選擇前5個特徵
X_train_selected = selector.fit_transform(X_train, y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_features = df_X.columns[selected_feature_indices]
print("Selected Features (SelectKBest):")
print(selected_features)

# 方法3: 使用隨機森林進行特徵選擇
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
feature_importance_rf = rf.feature_importances_
print("Feature Importance (Random Forest):")
print(feature_importance_rf)


# 評估特徵的影響
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Decision Tree):", accuracy)

y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy (Random Forest):", accuracy_rf)

Feature Importance (Decision Tree):
[0.07241718 0.11079482 0.08333025 0.012505   0.14576546 0.10246802
 0.09564964 0.15002395 0.02221226 0.20483341]
Selected Features (SelectKBest):
Index(['Day of Week', 'Day', 'Hour', 'Hyperlinks', 'Images'], dtype='object')
Feature Importance (Random Forest):
[0.09038014 0.11990177 0.08867318 0.02553292 0.11402645 0.12356174
 0.13310079 0.1265391  0.01923169 0.15905223]
Accuracy (Decision Tree): 0.475
Accuracy (Random Forest): 0.555
