In [1]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn import metrics

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score

In [4]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Seed every random number generator the experiments draw from.
# NumPy alone is not enough: Keras weight initialisation and batch shuffling
# use TensorFlow's generator, so without the second call two runs of the same
# cell report different metrics.
SEED = 2021
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
# Load the raw dataset and remove the spaces embedded in its column names.
data = pd.read_csv('OnlineNewsPopularity.csv')
data.columns = [column.replace(' ', '') for column in data.columns]

# Bucket the share counts into three right-closed intervals:
# (0, 1400], (1400, 10000] and (10000, inf].
bins = [0, 1400, 10000, np.inf]

# Class label -> integer code; the key order matches the bin order above.
my_dict = {
    'Unpopular': 0,
    'Popular': 1,
    'Extremely_popular': 2,
}

# Cut into named buckets, then translate each name into its integer code.
data['category'] = pd.cut(data['shares'], bins, labels=list(my_dict)).map(my_dict)
data.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,category
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,0
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711,0
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500,1
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200,0
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505,0


In [7]:
# Columns kept for the feature-selection experiments.
# The order is preserved because it fixes the column order fed to the network.
features_selected = [
    'n_unique_tokens',
    'kw_max_avg',
    'kw_avg_avg',
    'self_reference_avg_sharess',
    'n_non_stop_unique_tokens',
    'kw_avg_min',
    'self_reference_min_shares',
    'LDA_00',
    'LDA_02',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'kw_min_min',
    'kw_avg_max',
    'kw_min_avg',
    'average_token_length',
    'is_weekend',
    'data_channel_is_entertainment',
    'n_non_stop_words',
    'LDA_03',
    'weekday_is_friday',
    'kw_max_min',
    'kw_max_max',
    'num_imgs',
    'LDA_04',
    'weekday_is_saturday',
    'LDA_01',
    'global_subjectivity',
    'weekday_is_sunday',
]

In [8]:
# Columns treated as categorical; they are passed to the model without scaling.
# Presumably 0/1 indicator columns - confirm against the dataset description.
category_features = (
    # article channel columns
    [
        'data_channel_is_entertainment',
        'data_channel_is_bus',
        'data_channel_is_socmed',
        'data_channel_is_tech',
        'data_channel_is_world',
    ]
    # publication-day columns
    + [
        'weekday_is_monday',
        'weekday_is_tuesday',
        'weekday_is_wednesday',
        'weekday_is_thursday',
        'weekday_is_friday',
        'weekday_is_saturday',
        'weekday_is_sunday',
        'is_weekend',
    ]
)

In [9]:
# Columns that are standardised with the scaler before training.
numerical_features = [
    # token / link / media counts
    'n_tokens_title', 'n_tokens_content',
    'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
    'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
    'average_token_length', 'num_keywords',
    # keyword statistics
    'kw_min_min', 'kw_max_min', 'kw_avg_min',
    'kw_min_max', 'kw_max_max', 'kw_avg_max',
    'kw_min_avg', 'kw_max_avg', 'kw_avg_avg',
    # self-reference statistics
    'self_reference_min_shares', 'self_reference_max_shares',
    'self_reference_avg_sharess',
    # LDA columns
    'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04',
    # subjectivity / sentiment / polarity columns
    'global_subjectivity',
    'global_sentiment_polarity', 'global_rate_positive_words',
    'global_rate_negative_words', 'rate_positive_words',
    'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
    'max_positive_polarity', 'avg_negative_polarity',
    'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
    'title_sentiment_polarity', 'abs_title_subjectivity',
    'abs_title_sentiment_polarity',
]

# Numerical columns that are also in features_selected, kept in the order of
# numerical_features.
numerical_features_selected = [
    name for name in numerical_features if name in features_selected
]

In [10]:
#Model build
def Nerual_Network_model(layer_af='tanh',num_layers=1,optimizer='adam'):
    """Build and compile a fully connected three-class softmax classifier.

    The hidden stack is Dense(256), then ``num_layers`` Dense(512) layers,
    then Dense(256) and Dense(64). No input shape is declared here.

    Parameters
    ----------
    layer_af : activation used by every hidden Dense layer.
    num_layers : number of 512-unit Dense layers inserted after the first layer.
    optimizer : optimizer handed to ``model.compile``
        (e.g. 'sgd', 'adam' or 'RMSprop').

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    accuracy metric).
    """
    hidden_widths = [256] + [512] * num_layers + [256, 64]

    model = Sequential()
    for width in hidden_widths:
        model.add(Dense(width, activation=layer_af))
    # Output layer: one probability per class.
    model.add(Dense(3, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model

In [11]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

class balance_data():
    """Namespace of class-balancing strategies for a training set.

    Every strategy takes ``(X, y)`` and returns a ``(X_resampled, y_resampled)``
    tuple. The strategies are static methods, so they can be called on the
    class (``balance_data.smote_sampling(X, y)``) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def _resample(sampler, X, y):
        """Run ``sampler`` on ``(X, y)`` and return the resampled pair.

        imbalanced-learn renamed ``fit_sample`` to ``fit_resample`` in 0.4 and
        removed the old name in 0.8, so the new name is preferred and the old
        one is used only when the installed release predates the rename.
        """
        resample = getattr(sampler, 'fit_resample', None)
        if resample is None:
            resample = sampler.fit_sample
        X_res, y_res = resample(X, y)
        return X_res, y_res

    @staticmethod
    def under_sampling(X,y):
        """Resample ``(X, y)`` with ``RandomUnderSampler``."""
        return balance_data._resample(RandomUnderSampler(), X, y)

    @staticmethod
    def over_sampling(X,y):
        """Resample ``(X, y)`` with ``RandomOverSampler``."""
        return balance_data._resample(RandomOverSampler(), X, y)

    @staticmethod
    def no_sampling(X,y):
        """Return ``(X, y)`` unchanged (same interface as the other strategies)."""
        return X,y

    @staticmethod
    def smote_sampling(X,y):
        """Resample ``(X, y)`` with ``SMOTE``."""
        return balance_data._resample(SMOTE(), X, y)

In [12]:
## Baseline

In [13]:
# Baseline: every categorical and numerical feature, no resampling.
# Stratified 80/20 split on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    data[category_features+numerical_features],
    data['category'],
    stratify=data['category'],
    test_size=0.2,
    random_state=2021,
)

# Standardise the numerical columns only. The scaler is fitted on the training
# partition and reused unchanged on the test partition.
scaler = StandardScaler()
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_train_norm[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test_norm[numerical_features] = scaler.transform(X_test[numerical_features])

# One-hot encode the training labels into a dense array for the softmax output.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1,1)).toarray()

X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

X_train_norm_b, y_train_dummy_b = balance_data.no_sampling(X_train_norm, y_train_dummy)

# Train with the default fit settings and predict the most probable class.
model = Nerual_Network_model()
model.fit(x=X_train_norm_b, y=y_train_dummy_b, verbose=0)
y_pred = model.predict(X_test_norm).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print('----------------------------')
print("Accuracy: %f" % accuracy_score(y_test, y_pred))
print('----------------------------')
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.80      0.68      4017
           1       0.60      0.42      0.50      3475
           2       0.22      0.00      0.01       437

    accuracy                           0.59      7929
   macro avg       0.47      0.41      0.40      7929
weighted avg       0.57      0.59      0.56      7929

----------------------------
Accuracy: 0.593391
----------------------------
F1-score: 0.39567102585768715


In [15]:
## features-selection

In [16]:
# Feature-selection experiment: only the columns in features_selected,
# no resampling. Stratified 80/20 split on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    data[features_selected],
    data['category'],
    stratify=data['category'],
    test_size=0.2,
    random_state=2021,
)

# Standardise the selected numerical columns only. The scaler is fitted on the
# training partition and reused unchanged on the test partition.
scaler = StandardScaler()
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_train_norm[numerical_features_selected] = scaler.fit_transform(
    X_train[numerical_features_selected]
)
X_test_norm[numerical_features_selected] = scaler.transform(
    X_test[numerical_features_selected]
)

# One-hot encode the training labels into a dense array for the softmax output.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1,1)).toarray()

X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

X_train_norm_b, y_train_dummy_b = balance_data.no_sampling(X_train_norm, y_train_dummy)

# Train with the default fit settings and predict the most probable class.
model = Nerual_Network_model()
model.fit(x=X_train_norm_b, y=y_train_dummy_b, verbose=0)
y_pred = model.predict(X_test_norm).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print('----------------------------')
print("Accuracy: %f" % accuracy_score(y_test, y_pred))
print('----------------------------')
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.63      0.64      4017
           1       0.56      0.64      0.60      3475
           2       0.00      0.00      0.00       437

    accuracy                           0.60      7929
   macro avg       0.40      0.42      0.41      7929
weighted avg       0.57      0.60      0.58      7929

----------------------------
Accuracy: 0.600202
----------------------------
F1-score: 0.41112424546874476


In [17]:
## Feature selection + random undersampling

In [18]:
# Feature-selection experiment with random under-sampling of the training set.
# Stratified 80/20 split on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    data[features_selected],
    data['category'],
    stratify=data['category'],
    test_size=0.2,
    random_state=2021,
)

# Standardise the selected numerical columns only. The scaler is fitted on the
# training partition and reused unchanged on the test partition.
scaler = StandardScaler()
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_train_norm[numerical_features_selected] = scaler.fit_transform(
    X_train[numerical_features_selected]
)
X_test_norm[numerical_features_selected] = scaler.transform(
    X_test[numerical_features_selected]
)

# One-hot encode the training labels into a dense array for the softmax output.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1,1)).toarray()

X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

# Only the training partition is resampled; the test partition is left as split.
X_train_norm_b, y_train_dummy_b = balance_data.under_sampling(X_train_norm, y_train_dummy)

# Train with the default fit settings and predict the most probable class.
model = Nerual_Network_model()
model.fit(x=X_train_norm_b, y=y_train_dummy_b, verbose=0)
y_pred = model.predict(X_test_norm).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print('----------------------------')
print("Accuracy: %f" % accuracy_score(y_test, y_pred))
print('----------------------------')
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.61      4017
           1       0.58      0.39      0.47      3475
           2       0.11      0.49      0.18       437

    accuracy                           0.50      7929
   macro avg       0.44      0.49      0.42      7929
weighted avg       0.59      0.50      0.52      7929

----------------------------
Accuracy: 0.495018
----------------------------
F1-score: 0.420192794007661


In [19]:
## features-selection+smote

In [20]:
# Feature-selection experiment with SMOTE applied to the training set.
# Stratified 80/20 split on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    data[features_selected],
    data['category'],
    stratify=data['category'],
    test_size=0.2,
    random_state=2021,
)

# Standardise the selected numerical columns only. The scaler is fitted on the
# training partition and reused unchanged on the test partition.
scaler = StandardScaler()
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_train_norm[numerical_features_selected] = scaler.fit_transform(
    X_train[numerical_features_selected]
)
X_test_norm[numerical_features_selected] = scaler.transform(
    X_test[numerical_features_selected]
)

# One-hot encode the training labels into a dense array for the softmax output.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1,1)).toarray()

X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

# Only the training partition is resampled; the test partition is left as split.
X_train_norm_b, y_train_dummy_b = balance_data.smote_sampling(X_train_norm, y_train_dummy)

# Train with the default fit settings and predict the most probable class.
model = Nerual_Network_model()
model.fit(x=X_train_norm_b, y=y_train_dummy_b, verbose=0)
y_pred = model.predict(X_test_norm).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print('----------------------------')
print("Accuracy: %f" % accuracy_score(y_test, y_pred))
print('----------------------------')
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.53      0.59      4017
           1       0.55      0.39      0.46      3475
           2       0.11      0.57      0.18       437

    accuracy                           0.47      7929
   macro avg       0.45      0.50      0.41      7929
weighted avg       0.59      0.47      0.51      7929

----------------------------
Accuracy: 0.471056
----------------------------
F1-score: 0.4110174805974489


In [21]:
## Feature selection + tuned training settings (batch size, epochs)

In [23]:
# Feature-selection experiment with explicit batch size and epoch count,
# no resampling. Stratified 80/20 split on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    data[features_selected],
    data['category'],
    stratify=data['category'],
    test_size=0.2,
    random_state=2021,
)

# Standardise the selected numerical columns only. The scaler is fitted on the
# training partition and reused unchanged on the test partition.
scaler = StandardScaler()
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()
X_train_norm[numerical_features_selected] = scaler.fit_transform(
    X_train[numerical_features_selected]
)
X_test_norm[numerical_features_selected] = scaler.transform(
    X_test[numerical_features_selected]
)

# One-hot encode the training labels into a dense array for the softmax output.
encoder = OneHotEncoder()
y_train_dummy = encoder.fit_transform(np.asarray(y_train).reshape(-1,1)).toarray()

X_train_norm = np.asarray(X_train_norm)
X_test_norm = np.asarray(X_test_norm)

X_train_norm_b, y_train_dummy_b = balance_data.no_sampling(X_train_norm, y_train_dummy)

# Unlike the other experiments, training runs for 50 epochs with batches of
# 15000 rows.
model = Nerual_Network_model()
model.fit(
    x=X_train_norm_b,
    y=y_train_dummy_b,
    batch_size=15000,
    epochs=50,
    verbose=0,
)
y_pred = model.predict(X_test_norm).argmax(axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print('----------------------------')
print("Accuracy: %f" % accuracy_score(y_test, y_pred))
print('----------------------------')
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.73      0.68      4017
           1       0.60      0.57      0.59      3475
           2       0.22      0.00      0.01       437

    accuracy                           0.62      7929
   macro avg       0.49      0.44      0.43      7929
weighted avg       0.60      0.62      0.60      7929

----------------------------
Accuracy: 0.621390
----------------------------
F1-score: 0.4254035645100944
