# Setup and run NewsGroups Experiment
Adds known interactions to the 20-Newsgroups dataset to test whether NNs with Dropout recover these effects.

In [22]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

In [23]:
def TFIDF(X_train, X_test, MAX_NB_WORDS=1000): # was 75000
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with",str(np.array(X_train).shape[1]),"features")
    return (X_train,X_test)

In [24]:
from tensorflow.keras.optimizers import Adam
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 512 # number of nodes
    nLayers = 4 # number of  hidden layer
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0, nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    opt = Adam(learning_rate=0.001*(1+dropout), beta_1=0.9,
                                   beta_2=0.999, epsilon=1e-07, amsgrad=False, name='Adam')
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model

In [25]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
n_samples = 1000
X_train = np.array(newsgroups_train.data[:n_samples])
X_test = np.array(newsgroups_test.data[:n_samples])
y_train = newsgroups_train.target[:n_samples]
y_test = newsgroups_test.target[:n_samples]

In [26]:
from sklearn.model_selection import train_test_split
X_train_tfidf, X_test_tfidf = TFIDF(X_train, X_test)
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2)

tf-idf with 1000 features


In [27]:
# Add 3 columns
order = 1
for i in range(order):
    X_train_tfidf = np.hstack((X_train_tfidf, np.random.uniform(0, 1, size=(X_train_tfidf.shape[0], 1))))
    X_val_tfidf = np.hstack((X_val_tfidf, np.random.uniform(0, 1, size=(X_val_tfidf.shape[0], 1))))
    X_test_tfidf = np.hstack((X_test_tfidf, np.random.uniform(0, 1, size=(X_test_tfidf.shape[0], 1))))

idx_train = X_train_tfidf[:, -1] > -1
idx_val   = X_val_tfidf[:, -1] > -1
idx_test  = X_test_tfidf[:, -1] > -1
thresh = 0.7**order
for i in range(1, order+1):
    idx_train = np.logical_and(idx_train, X_train_tfidf[:, -i] > thresh)
    idx_val   = np.logical_and(idx_val, X_val_tfidf[:, -i] > thresh)
    idx_test  = np.logical_and(idx_test, X_test_tfidf[:, -i] > thresh)
y_train[idx_train] = 21
y_val[idx_val] = 21
y_test[idx_test] = 21

'\nif order == 3:\n    y_train[np.logical_and(\n                np.logical_and(X_train_tfidf[:, -1]>0.5, X_train_tfidf[:, -2]>0.5),\n        X_train_tfidf[:, -3] > 0.5)\n        ] = 21\n    y_val[np.logical_and(\n                np.logical_and(X_val_tfidf[:, -1]>0.5, X_val_tfidf[:, -2]>0.5),\n        X_val_tfidf[:, -3] > 0.5)] = 21\n    y_test[np.logical_and(\n                np.logical_and(X_test_tfidf[:, -1]>0.5, X_test_tfidf[:, -2]>0.5),\n        X_test_tfidf[:, -3] > 0.5)] = 21\nelif order == 1:\n    y_train[X_train_tfidf[:, -1]>0.5] = 21\n    y_val[X_val_tfidf[:, -1]>0.5] = 21\n    y_test[X_test_tfidf[:, -1]>0.5] = 21\n'

In [29]:
from xgboost import XGBRegressor as xgb
from sklearn.multioutput import MultiOutputRegressor
import tensorflow as tf
max_epoch = 50
nClasses = np.max(y_train) + 1
with open("results/results_newsgroups_modified_{}.tsv".format(order), 'w') as results_file:
    for fit_iter in range(5):
        for dropout_rate in [0.0, 0.125, 0.25, 0.375, 0.5, 0.625]:
            model = Build_Model_DNN_Text(X_train_tfidf.shape[1], nClasses, dropout_rate)
            epoch = 0
            while epoch <= max_epoch:
                print(dropout_rate, fit_iter, epoch)
                pred_train = model.predict(X_train_tfidf)
                pred_test = model.predict(X_test_tfidf)
                
                def fit_and_predict(depth, X_train, y_train, X_test, y_test):
                    my_xgbs = [xgb(max_depth=depth, n_estimators=1000) for _ in range(nClasses)]
                    for i in range(len(my_xgbs)):
                        print(i, end='\r')
                        my_xgbs[i].fit(X_train, y_train, early_stopping_rounds=10,
                                                     eval_set=[(X_test, y_test)],
                                verbose=False
                        )
                    xgb_preds = np.array([x.predict(X_train_tfidf) for x in my_xgbs]).T
                    xgb_test_preds = np.array([x.predict(X_test_tfidf) for x in my_xgbs]).T
                    return my_xgbs, xgb_preds, xgb_test_preds
                
                xgb1s, xgb1_preds, xgb1_test_preds = fit_and_predict(
                    1, X_train_tfidf, pred_train, X_test_tfidf, pred_test)
                
                xgb2s, xgb2_preds, xgb2_test_preds = fit_and_predict(
                    2, X_train_tfidf, pred_train-xgb1_preds, X_test_tfidf, pred_test-xgb1_test_preds)
                
                #xgb3s, xgb3_preds, xgb3_test_preds = fit_and_predict(
                #    3, X_train_tfidf, pred_train-xgb1_preds-xgb2_preds, X_test_tfidf, pred_test-xgb1_test_preds-xgb2_test_preds)

                xgb1_var = np.var(xgb1_test_preds)
                xgb2_var = np.var(xgb2_test_preds)# - xgb1_preds)
                xgb3_var = np.var(pred_test - xgb1_test_preds - xgb2_test_preds)# - xgb2_preds)
                
                train_acc = np.mean(y_train == np.argmax(model.predict(X_train_tfidf), axis=1))
                query_acc = np.mean(y_test == np.argmax(model.predict(X_test_tfidf), axis=1))
                
                train_acc_one = np.mean(y_train[y_train == 21] == np.argmax(model.predict(X_train_tfidf), axis=1)[y_train == 21])
                query_acc_one = np.mean(y_test[y_test == 21] == np.argmax(model.predict(X_test_tfidf), axis=1)[y_test == 21])
                
                train_sq  = np.mean(tf.nn.sparse_softmax_cross_entropy_with_logits(y_train, model.predict(X_train_tfidf)))
                test_sq  = np.mean(tf.nn.sparse_softmax_cross_entropy_with_logits(y_test, model.predict(X_test_tfidf)))
                
                print(xgb1_var, xgb2_var, xgb3_var, train_acc, query_acc, train_sq, test_sq, train_acc_one, query_acc_one)
                print('{:.3f}\t{:d}\t{:d}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        dropout_rate, fit_iter, epoch,
                        xgb1_var, xgb2_var, xgb3_var, train_acc, query_acc,
                    train_sq, test_sq, train_acc_one, query_acc_one),
                      file=results_file, flush=True)

                if epoch < 10: 
                    model.fit(X_train_tfidf, y_train,
                              validation_data=(X_val_tfidf, y_val),
                              epochs=5,
                              batch_size=128,
                              verbose=2)
                    epoch += 5
                else:
                    if epoch <= max_epoch:
                        model.fit(X_train_tfidf, y_train,
                                  validation_data=(X_val_tfidf, y_val),
                                  epochs=10,
                                  batch_size=128,
                                  verbose=0)
                    epoch += 10

0.0 0 0
3.8765053e-09 7.937654e-09 2.560359e-07 0.035 0.038 3.0911224 3.0911298 0.0 0.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.8922 - accuracy: 0.2650 - val_loss: 2.6101 - val_accuracy: 0.3000
Epoch 2/5
 - 0s - loss: 2.5660 - accuracy: 0.3088 - val_loss: 2.5380 - val_accuracy: 0.3000
Epoch 3/5
 - 0s - loss: 2.3848 - accuracy: 0.3088 - val_loss: 2.3871 - val_accuracy: 0.3000
Epoch 4/5
 - 0s - loss: 2.2285 - accuracy: 0.3125 - val_loss: 2.3438 - val_accuracy: 0.3150
Epoch 5/5
 - 0s - loss: 2.1443 - accuracy: 0.3438 - val_loss: 2.3544 - val_accuracy: 0.3200
0.0 0 5
0.00050775625 0.0013514793 0.018037694 0.3475 0.333 2.803844 2.8312452 1.0 1.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.0881 - accuracy: 0.3550 - val_loss: 2.2811 - val_accuracy: 0.3100
Epoch 2/5
 - 0s - loss: 2.0144 - accuracy: 0.3650 - val_loss: 2.4030 - val_accuracy: 0.3400
Epoch 3/5
 - 0s - loss: 1.8186 - accuracy: 0.4437 - val_loss: 2.1863 - val_accuracy: 0.33

0.5 0 30
0.0011043497 0.0014633778 0.020030163 0.715 0.349 2.5526671 2.8389113 1.0 0.7458745874587459
0.5 0 40
0.0009179549 0.001837324 0.027038984 0.93 0.427 2.3461442 2.760782 1.0 0.8283828382838284
0.5 0 50
0.0009699955 0.0018694787 0.027226035 0.98375 0.384 2.2547102 2.7990015 0.9959514170040485 0.6171617161716172
0.625 0 0
2.7601659e-09 7.044036e-09 2.706968e-07 0.02625 0.027 3.0912054 3.091198 0.0 0.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.9113 - accuracy: 0.2250 - val_loss: 2.6400 - val_accuracy: 0.3000
Epoch 2/5
 - 0s - loss: 2.6934 - accuracy: 0.3088 - val_loss: 2.8587 - val_accuracy: 0.3000
Epoch 3/5
 - 0s - loss: 2.5872 - accuracy: 0.3088 - val_loss: 2.5409 - val_accuracy: 0.3000
Epoch 4/5
 - 0s - loss: 2.4691 - accuracy: 0.3088 - val_loss: 2.5568 - val_accuracy: 0.3000
Epoch 5/5
 - 0s - loss: 2.3864 - accuracy: 0.3088 - val_loss: 2.4551 - val_accuracy: 0.3000
0.625 0 5
0.00034661582 0.00094555994 0.0048676524 0.30875 0.303 2.931823 2.9862266

Epoch 5/5
 - 0s - loss: 2.1107 - accuracy: 0.3475 - val_loss: 2.3988 - val_accuracy: 0.2850
0.375 1 10
0.0005019755 0.001057675 0.009982451 0.35375 0.257 2.8056877 2.9311733 1.0 0.735973597359736
0.375 1 20
0.00065182266 0.0012749119 0.015650975 0.5675 0.296 2.6608796 2.8832803 1.0 0.7095709570957096
0.375 1 30
0.000969134 0.0018703679 0.030536072 0.90875 0.423 2.3270245 2.7531555 1.0 0.8448844884488449
0.375 1 40
0.0009698671 0.001768013 0.036409866 0.99625 0.45 2.1943924 2.727168 1.0 0.8745874587458746
0.375 1 50
0.0009969907 0.0015400711 0.03660281 0.99875 0.469 2.1699636 2.7123754 1.0 0.8052805280528053
0.5 1 0
3.0464617e-09 4.6564064e-09 1.9197768e-07 0.02875 0.033 3.0911317 3.09112 0.0 0.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.8632 - accuracy: 0.2512 - val_loss: 2.5676 - val_accuracy: 0.3000
Epoch 2/5
 - 0s - loss: 2.5911 - accuracy: 0.3088 - val_loss: 2.4805 - val_accuracy: 0.3000
Epoch 3/5
 - 0s - loss: 2.4917 - accuracy: 0.3088 - val_loss: 2.5

Epoch 2/5
 - 0s - loss: 2.1001 - accuracy: 0.3525 - val_loss: 2.3265 - val_accuracy: 0.3050
Epoch 3/5
 - 0s - loss: 2.0885 - accuracy: 0.3475 - val_loss: 2.3073 - val_accuracy: 0.3050
Epoch 4/5
 - 0s - loss: 2.0560 - accuracy: 0.3537 - val_loss: 2.3292 - val_accuracy: 0.3100
Epoch 5/5
 - 0s - loss: 2.0193 - accuracy: 0.3525 - val_loss: 2.3100 - val_accuracy: 0.3000
0.25 2 10
0.0005219418 0.0011019164 0.013132507 0.3525 0.292 2.780389 2.8810885 1.0 0.8481848184818482
0.25 2 20
0.0010490366 0.002122301 0.027991977 0.8325 0.385 2.4478304 2.797854 1.0 0.8085808580858086
0.25 2 30
0.0009859751 0.0017362576 0.03559347 0.9925 0.404 2.1945198 2.7652373 1.0 0.7392739273927392
0.25 2 40
0.0009955916 0.0018760794 0.03764217 1.0 0.408 2.171317 2.760437 1.0 0.7194719471947195
0.25 2 50
0.0009941782 0.0018415569 0.040250555 1.0 0.425 2.1677246 2.7430046 1.0 0.7722772277227723
0.375 2 0
4.0216883e-09 8.69834e-09 2.8256426e-07 0.03875 0.029 3.0910337 3.0910692 0.0 0.0
Train on 800 samples, validate on

Epoch 5/5
 - 0s - loss: 2.1963 - accuracy: 0.3088 - val_loss: 2.3224 - val_accuracy: 0.3000
0.125 3 5
0.0005198933 0.001395938 0.016665457 0.30875 0.303 2.8076372 2.8426197 1.0 1.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.1375 - accuracy: 0.3162 - val_loss: 2.3046 - val_accuracy: 0.3300
Epoch 2/5
 - 0s - loss: 2.1190 - accuracy: 0.3650 - val_loss: 2.2890 - val_accuracy: 0.3300
Epoch 3/5
 - 0s - loss: 2.0931 - accuracy: 0.3462 - val_loss: 2.4001 - val_accuracy: 0.3300
Epoch 4/5
 - 0s - loss: 2.0614 - accuracy: 0.3487 - val_loss: 2.3410 - val_accuracy: 0.3250
Epoch 5/5
 - 0s - loss: 2.0185 - accuracy: 0.3713 - val_loss: 2.3545 - val_accuracy: 0.3300
0.125 3 10
0.00055159995 0.0012215774 0.013931638 0.38375 0.308 2.7910023 2.8746939 1.0 0.8910891089108911
0.125 3 20
0.00090200466 0.0018297726 0.03205943 0.93625 0.355 2.2749767 2.818592 0.9878542510121457 0.570957095709571
0.125 3 30
0.0009963617 0.0014857755 0.038679797 1.0 0.411 2.1685097 2.7525733 1.0 0.67

Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.9151 - accuracy: 0.2562 - val_loss: 2.6115 - val_accuracy: 0.3000
Epoch 2/5
 - 0s - loss: 2.5717 - accuracy: 0.3088 - val_loss: 2.5196 - val_accuracy: 0.3000
Epoch 3/5
 - 0s - loss: 2.3748 - accuracy: 0.3088 - val_loss: 2.4020 - val_accuracy: 0.3000
Epoch 4/5
 - 0s - loss: 2.2278 - accuracy: 0.3088 - val_loss: 2.3305 - val_accuracy: 0.3000
Epoch 5/5
 - 0s - loss: 2.1449 - accuracy: 0.3462 - val_loss: 2.3969 - val_accuracy: 0.3400
0.0 4 5
0.0005233224 0.0012511086 0.018563442 0.3575 0.334 2.804353 2.8286066 1.0 1.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.0919 - accuracy: 0.3587 - val_loss: 2.3139 - val_accuracy: 0.3300
Epoch 2/5
 - 0s - loss: 2.0488 - accuracy: 0.3663 - val_loss: 2.4155 - val_accuracy: 0.3450
Epoch 3/5
 - 0s - loss: 1.9368 - accuracy: 0.4038 - val_loss: 2.3862 - val_accuracy: 0.2950
Epoch 4/5
 - 0s - loss: 1.6631 - accuracy: 0.4613 - val_loss: 2.3142 - val_accuracy: 0

0.5 4 30
0.0010504604 0.0014939464 0.016929995 0.715 0.294 2.5448778 2.897026 0.9959514170040485 0.5313531353135313
0.5 4 40
0.0010487536 0.0016423465 0.02630633 0.8925 0.401 2.3756988 2.780827 1.0 0.8613861386138614
0.5 4 50
0.0009292054 0.0016354686 0.027861584 0.96125 0.395 2.2605555 2.7825792 1.0 0.7326732673267327
0.625 4 0
1.789074e-09 4.131625e-09 1.9578187e-07 0.045 0.039 3.0912213 3.0912373 0.0 0.0
Train on 800 samples, validate on 200 samples
Epoch 1/5
 - 0s - loss: 2.9109 - accuracy: 0.2313 - val_loss: 2.6363 - val_accuracy: 0.3000
Epoch 2/5
 - 0s - loss: 2.6400 - accuracy: 0.3088 - val_loss: 2.7281 - val_accuracy: 0.3000
Epoch 3/5
 - 0s - loss: 2.5318 - accuracy: 0.3088 - val_loss: 2.5577 - val_accuracy: 0.3000
Epoch 4/5
 - 0s - loss: 2.4064 - accuracy: 0.3088 - val_loss: 2.4956 - val_accuracy: 0.3000
Epoch 5/5
 - 0s - loss: 2.3137 - accuracy: 0.3088 - val_loss: 2.4371 - val_accuracy: 0.3000
0.625 4 5
0.00045726993 0.0009805044 0.0058679297 0.30875 0.303 2.9029298 2.9728057