In [2]:
# Silence every Python warning; the filter is installed before TensorFlow is
# imported, so it is already active while that import runs.
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import logging
# Raise TensorFlow's logger threshold so only ERROR-and-above records are emitted.
tf.get_logger().setLevel(logging.ERROR)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, accuracy_score

In [16]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Seed NumPy *and* TensorFlow. Seeding NumPy alone leaves the Keras weight
# initialisation (which goes through TensorFlow's random generator) unseeded,
# so repeated runs of the notebook would report different scores.
SEED = 2021
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [7]:
# Load the raw table and remove the spaces embedded in its column headers.
data = pd.read_csv('OnlineNewsPopularity.csv')
data.columns = data.columns.str.replace(' ', '')

# Bucket `shares` at 1400 and 10000 into three named popularity classes, then
# swap the names for the integer codes 0, 1, 2.
bins = [0, 1400, 10000, np.inf]
my_dict = {'Unpopular': 0, 'Popular': 1, 'Extremely_popular': 2}
data['category'] = pd.cut(
    data['shares'],
    bins,
    labels=['Unpopular', 'Popular', 'Extremely_popular'],
).map(my_dict)

data.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,category
0,http://mashable.com/2013/01/07/amazon-instant-...,731,12,219,0.663594,1.0,0.815385,4,2,1,...,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,0
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731,9,255,0.604743,1.0,0.791946,3,1,1,...,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711,0
2,http://mashable.com/2013/01/07/apple-40-billio...,731,9,211,0.57513,1.0,0.663866,3,1,1,...,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500,1
3,http://mashable.com/2013/01/07/astronaut-notre...,731,9,531,0.503788,1.0,0.665635,9,0,1,...,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200,0
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731,13,1072,0.415646,1.0,0.54089,19,19,20,...,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505,0


In [8]:
# Columns used as model inputs (this list slices `data` before the train/test split).
# Order is significant: it fixes the column order of the feature matrix.
# 'self_reference_avg_sharess' keeps its doubled "s" on purpose -- the same
# spelling is used in `numerical_features`.
features_selected = [
    'n_unique_tokens', 'kw_max_avg', 'kw_avg_avg', 'self_reference_avg_sharess',
    'n_non_stop_unique_tokens', 'kw_avg_min', 'self_reference_min_shares',
    'LDA_00', 'LDA_02',
    'data_channel_is_socmed', 'data_channel_is_tech',
    'kw_min_min', 'kw_avg_max', 'kw_min_avg',
    'average_token_length', 'is_weekend', 'data_channel_is_entertainment',
    'n_non_stop_words', 'LDA_03', 'weekday_is_friday',
    'kw_max_min', 'kw_max_max', 'num_imgs',
    'LDA_04', 'weekday_is_saturday', 'LDA_01',
    'global_subjectivity', 'weekday_is_sunday',
]

In [9]:
# Channel / weekday flag columns (presumably 0/1 indicators -- verify against
# the CSV). No other cell shown in this notebook reads this list.
category_features = [
    # data channel flags
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
    # day-of-week flags
    'weekday_is_monday',
    'weekday_is_tuesday',
    'weekday_is_wednesday',
    'weekday_is_thursday',
    'weekday_is_friday',
    'weekday_is_saturday',
    'weekday_is_sunday',
    'is_weekend',
]

In [10]:
# Candidate numerical columns. The subset that also appears in
# `features_selected` is what gets standardized later on.
numerical_features = [
    # token statistics
    'n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
    'n_non_stop_words', 'n_non_stop_unique_tokens',
    # links and media
    'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
    'average_token_length', 'num_keywords',
    # keyword statistics
    'kw_min_min', 'kw_max_min', 'kw_avg_min',
    'kw_min_max', 'kw_max_max', 'kw_avg_max',
    'kw_min_avg', 'kw_max_avg', 'kw_avg_avg',
    # self-reference statistics (the doubled "s" in the last name is intentional)
    'self_reference_min_shares', 'self_reference_max_shares',
    'self_reference_avg_sharess',
    # LDA columns
    'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04',
    # global text statistics
    'global_subjectivity', 'global_sentiment_polarity',
    'global_rate_positive_words', 'global_rate_negative_words',
    'rate_positive_words', 'rate_negative_words',
    # polarity statistics
    'avg_positive_polarity', 'min_positive_polarity', 'max_positive_polarity',
    'avg_negative_polarity', 'min_negative_polarity', 'max_negative_polarity',
    # title statistics
    'title_subjectivity', 'title_sentiment_polarity',
    'abs_title_subjectivity', 'abs_title_sentiment_polarity',
]

In [11]:
# Numerical columns that are also model inputs, kept in `numerical_features` order.
numerical_features_selected = [
    column for column in numerical_features if column in features_selected
]

In [12]:
# Hold out 20% of the rows for testing, stratified on the class label so that
# both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data[features_selected],
    data['category'],
    test_size=0.2,
    stratify=data['category'],
    random_state=2021,
)

In [13]:
# Learn the scaling statistics from the training rows only, then apply them to
# copies of both splits. Columns outside `numerical_features_selected` are left
# as they are.
scaler = StandardScaler().fit(X_train[numerical_features_selected])

X_train_norm = X_train.copy()
X_train_norm[numerical_features_selected] = scaler.transform(
    X_train[numerical_features_selected]
)

X_test_norm = X_test.copy()
X_test_norm[numerical_features_selected] = scaler.transform(
    X_test[numerical_features_selected]
)

In [18]:
# One-hot encode the training labels into a dense array: the labels are
# reshaped to a single column for the encoder, and its sparse output is
# densified right away.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1, 1)).toarray()

# Convert both feature frames to plain NumPy arrays.
X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

In [19]:
#Model build
def Nerual_Network_model(layer_af='tanh',num_layers=1,optimizer='adam'):
    """Build and compile a fully connected softmax classifier with 3 outputs.

    The hidden stack is one 256-unit layer, `num_layers` 512-unit layers, then
    a 256-unit and a 64-unit layer, all using `layer_af` as activation. The
    output layer has 3 units with softmax.

    Parameters
    ----------
    layer_af : activation used by every hidden layer.
    num_layers : how many 512-unit layers to insert in the middle of the stack.
    optimizer : optimizer handed to `compile` (e.g. sgd, adam, RMSprop).

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, accuracy metric).
    """
    hidden_widths = [256] + [512] * num_layers + [256, 64]

    model = Sequential()
    for width in hidden_widths:
        model.add(Dense(width, activation=layer_af))
    model.add(Dense(3, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model

In [20]:
# Grid-search setup: wrap the model builder as a scikit-learn estimator and
# list the batch sizes / epoch counts to try.
clf = KerasClassifier(build_fn=Nerual_Network_model, verbose=0)
batch_size = [10000, 12000, 15000]
epochs = [50, 100, 150]
search_spaces = {'batch_size': batch_size, 'epochs': epochs}

In [21]:
def clf_grid_cv(clf,search_spaces,X=None,y=None,cv=5,n_jobs=-1):
    """Run a grid search, print a per-candidate score report and return the search.

    Parameters
    ----------
    clf : estimator handed to GridSearchCV.
    search_spaces : dict mapping parameter names to the candidate values to try.
    X, y : training inputs and targets. When omitted, the notebook-level
        `X_train_norm` / `y_train_dummy` are used, which is the behaviour this
        function always had; passing them explicitly removes the dependency on
        notebook globals.
    cv : forwarded to GridSearchCV's `cv` argument (default 5).
    n_jobs : forwarded to GridSearchCV's `n_jobs` argument (default -1).

    Returns
    -------
    Whatever `GridSearchCV.fit` returns (the fitted search object).
    """
    # Fall back to the notebook globals only when the caller gave no data.
    if X is None:
        X = X_train_norm
    if y is None:
        y = y_train_dummy

    grid = GridSearchCV(estimator=clf, param_grid=search_spaces, cv=cv, n_jobs=n_jobs)
    grid_result = grid.fit(X, y)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    print('---------------------------------------------------------')

    # One report line per parameter combination: mean and std of the test score.
    cv_results = grid_result.cv_results_
    for mean, stdev, param in zip(cv_results['mean_test_score'],
                                  cv_results['std_test_score'],
                                  cv_results['params']):
        print("Accuracy : %f ( Std : %f) with: %r" % (mean, stdev, param))
    return grid_result

In [22]:
# Run the search over the batch-size / epoch grid and keep the fitted result.
grid_result = clf_grid_cv(clf=clf, search_spaces=search_spaces)

Best: 0.617058 using {'batch_size': 15000, 'epochs': 50}
---------------------------------------------------------
Accuracy : 0.615608 ( Std : 0.007998) with: {'batch_size': 10000, 'epochs': 50}
Accuracy : 0.614977 ( Std : 0.001029) with: {'batch_size': 10000, 'epochs': 100}
Accuracy : 0.600883 ( Std : 0.005484) with: {'batch_size': 10000, 'epochs': 150}
Accuracy : 0.613180 ( Std : 0.003674) with: {'batch_size': 12000, 'epochs': 50}
Accuracy : 0.615009 ( Std : 0.004491) with: {'batch_size': 12000, 'epochs': 100}
Accuracy : 0.606905 ( Std : 0.004750) with: {'batch_size': 12000, 'epochs': 150}
Accuracy : 0.617058 ( Std : 0.004170) with: {'batch_size': 15000, 'epochs': 50}
Accuracy : 0.615355 ( Std : 0.004964) with: {'batch_size': 15000, 'epochs': 100}
Accuracy : 0.609175 ( Std : 0.007210) with: {'batch_size': 15000, 'epochs': 150}


In [26]:
# Accuracy on the held-out test split, using predictions from the fitted search.
y_pred = grid_result.predict(X_test_norm)
print(
    'test score: %.2f%%'
    % (accuracy_score(y_test, y_pred) * 100)
)

test score: 61.90%
