## 例2-17 预测中的交互特征示例

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing as preproc

In [2]:
# Load the CSV, splitting fields on a comma followed by a space; the
# multi-character separator is paired with the Python parsing engine.
df = pd.read_csv(
    'data/OnlineNewsPopularity/OnlineNewsPopularity.csv',
    sep=', ',
    engine='python',
)

In [3]:
# Assumes df is a pandas DataFrame holding the UCI Online News Popularity dataset
df.columns

Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
     

In [4]:
# Use the content-related columns as the model's singleton features and
# ignore the derived features.
features = [
    'n_tokens_title',
    'n_tokens_content',
    'n_unique_tokens',
    'n_non_stop_words',
    'n_non_stop_unique_tokens',
    'num_hrefs',
    'num_self_hrefs',
    'num_imgs',
    'num_videos',
    'average_token_length',
    'num_keywords',
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
]

In [6]:
# Build the pairwise feature set from the singleton features, skipping the
# constant bias column (include_bias=False). With the default settings the
# result has 170 columns for 17 inputs: the 17 originals, their 17 squares,
# and the 136 pairwise products.
X = df[features]
y = df[['shares']]
pairwise_transformer = preproc.PolynomialFeatures(include_bias=False)
X2 = pairwise_transformer.fit_transform(X)
X2.shape

(39644, 170)

In [39]:
# Shape of the singleton feature matrix, for comparison with X2.shape
X.shape

(39644, 17)

In [42]:
# Display the transformed feature matrix returned by PolynomialFeatures
X2

array([[1.20000000e+01, 2.19000000e+02, 6.63594467e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.00000000e+00, 2.55000000e+02, 6.04743081e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.00000000e+00, 2.11000000e+02, 5.75129531e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+01, 4.42000000e+02, 5.16355139e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.00000000e+00, 6.82000000e+02, 5.39493293e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.00000000e+01, 1.57000000e+02, 7.01986750e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [7]:
# Create training and test sets for both feature sets. X, X2 and y are passed
# to a single call with a fixed random_state.
(X1_train, X1_test,
 X2_train, X2_test,
 y_train, y_test) = train_test_split(
    X, X2, y,
    test_size=0.3,
    random_state=123,
)

In [33]:
# Shape of the held-out target produced by the split above
y_test.shape

(11894, 1)

In [34]:
# Train a model on each feature set and compare the R-squared scores
def evaluate_feature(X_train, X_test, y_train, y_test):
    """Fit a linear regression on the training split and score it on the test split.

    Args:
        X_train: feature matrix used to fit the model.
        X_test: feature matrix used for scoring.
        y_train: target values matching ``X_train``.
        y_test: target values matching ``X_test``.

    Returns:
        A ``(model, r_score)`` tuple: the fitted ``LinearRegression`` estimator
        and the value of ``model.score(X_test, y_test)``.
    """
    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)
    return (regressor, test_score)

In [37]:
%time
(m1, r1) = evaluate_feature(X1_train, X1_test, y_train, y_test)
print("R-squared score with singleton features: %0.5f" % r1)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs
R-squared score with singleton features: 0.00924


In [38]:
%time
(m2, r2) = evaluate_feature(X2_train, X2_test, y_train, y_test)
print("R-squared score with pairwise features: %0.10f" % r2)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs
R-squared score with pairwise features: 0.0113215252
