<a href="https://colab.research.google.com/github/LimLawrence/LearningCodes/blob/master/NNSpamPackt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Packt 4  - data in 3
# use a bag of words but use a shallow neural network instead of the random forest. 
# Also remember that we got 95 or 96 percent accuracy for using random forest

In [10]:
#  Keras tokenizer as alternative technique 
import pandas as pd
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.models import Sequential # sequential model for NN. typical for feed forward network
from keras.layers import Dense, Dropout, Activation  #typical dense layers, dropout helps overfitting, decide on activation for each layer
from keras.utils import np_utils      # one hot encoding 
from sklearn.model_selection import StratifiedKFold #cross validation

In [11]:
# load datasets - stack on top of each other so 1 big dataset
d = pd.concat([pd.read_csv("Youtube01-Psy.csv"),
               pd.read_csv("Youtube02-KatyPerry.csv"),
               pd.read_csv("Youtube03-LMFAO.csv"),
               pd.read_csv("Youtube04-Eminem.csv"),
               pd.read_csv("Youtube05-Shakira.csv")])
#shuffle
d = d.sample(frac=1)
d.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
400,z12xxdjrvmynezpqt04chzxjrvqfxntibh0,lil jay,2014-09-10T03:52:22.309000,Check out my music niggas﻿,1
258,_2viQ_Qnc6_B7ncS0M0zl0VC4SZ22T1ZO_GNhI1IWTM,Greg Fils Aimé,2013-09-10T23:18:01.455000,I hope everyone is in good spirits I&#39;m a h...,1
329,z12pvhp42ouayd1xx04cgl4a4wu3sxmqnlw0k,SBG | Zombies,2015-05-21T20:37:21.421000,We need to get this to 1 Billion Views!!﻿,0
303,z13kyvhzivqmhjq1w220jnaxquu3grwkt,PatrickMcCrowell,2015-02-12T02:30:08.801000,Check out this playlist on YouTube:﻿,1
135,LneaDw26bFvpsz7rRi--uuuhcXD8DdMabES0ZpcLQlQ,Jacob Johnson,,You guys should check out this EXTRAORDINARY w...,1


In [12]:
#  take a number of splits (5), and produce the indexes of the original dataset for those splits
kfold = StratifiedKFold(n_splits=5)
splits = kfold.split(d, d['CLASS'])


In [13]:
# 80-20 training - test. the 20% testing will differ for each split
# so use loop to look at different splits
# check that they don't overlap for each of 5 splits

for train, test in splits:
  print("Splits")
  print(test)

Splits
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244

In [14]:
# define a function that receives these indexes for the different splits
# get bag of words, build neural net, train it, and evaluate it. 
# then return the score for that split. 
# begin by taking the positions for the train and test sets and extract the comments

def train_and_test(train_idx, test_idx):
    
    train_content = d['CONTENT'].iloc[train_idx]
    test_content = d['CONTENT'].iloc[test_idx]
    
    # for forest, ~1k- gridsearch for different parameters.  2k words better than 1k?
    tokenizer = Tokenizer(num_words=2000)
    
    # learn the training words (not the testing words!)
    # if new words in test set haven't seen before, they're ignored
    # transform both train and test into bag of words
    tokenizer.fit_on_texts(train_content)

    # options for mode: binary, freq, tfidf
    # creates matrix that can be fed directly into NN.  train_content are the comments 
    # decide if we want tfidf scores, binary scores, or frequency counts - tdif here between 0 and random integer
    d_train_inputs = tokenizer.texts_to_matrix(train_content, mode='tfidf')
    d_test_inputs = tokenizer.texts_to_matrix(test_content, mode='tfidf')

    # look at all tdif numbers , then divide tfidf by max 
    # scales between 0 and 1 - not good to give NN big number
    d_train_inputs = d_train_inputs/np.amax(np.absolute(d_train_inputs))
    d_test_inputs = d_test_inputs/np.amax(np.absolute(d_test_inputs))

    # subtract mean, to get values between -1 and 1
    # shift it between -1 and 1 by subtracting the average from each score
    d_train_inputs = d_train_inputs - np.mean(d_train_inputs)
    d_test_inputs = d_test_inputs - np.mean(d_test_inputs)

    # one-hot encoding of outputs - do the categorical 
    d_train_outputs = np_utils.to_categorical(d['CLASS'].iloc[train_idx])
    d_test_outputs = np_utils.to_categorical(d['CLASS'].iloc[test_idx])

    # build network all over again for each train/test split so it starts randomly.
    model = Sequential()  #typical feed forward NN

    #512 neurons in 1st layer, 2k inputs, same size as bag of words
    model.add(Dense(512, input_shape=(2000,)))  

    # use a ReLU activation fast and accurate, could use tanh
    model.add(Activation('relu')) 

    # when updated weights, don't update 1/2 of them randomly.  find weighted sum of inputs 
    model.add(Dropout(0.5))
    model.add(Dense(2))

    # take sum and run softmax turns outputs into probabilities
    model.add(Activation('softmax'))

    #compile model and calculate loss, do cate crossentropy for one hot encoding
    # adamax optimizer https://keras.io/ has the different optimizers
    model.compile(loss='categorical_crossentropy', optimizer='adamax',
                  metrics=['accuracy'])
    
    # run fit on training set, bag of words... go through training set 10x
    # go through 16 rows, compute avg loss and update weight
    model.fit(d_train_inputs, d_train_outputs, epochs=10, batch_size=16)

    # evaluate the test after fit (trained). not until now it looks at test
    # scores are going to be the loss & whatever other metrics like accuracy
    scores = model.evaluate(d_test_inputs, d_test_outputs)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    return scores


In [15]:
# build split again, which is the k-fold split with five different folds
# collect score for each split,  run our train_and_test function and save the scores. 
# it is running on each split. the accuracy on the training input increases per epoch. 
# if this gets really high over-fitting? but after the 10 epochs, use the testing set which it's never seen before. 
# This helps obtain the accuracy number for the testing set. 
# do it all again for the next split and we'll get a different accuracy. 
# do this a few more times until we have five different numbers, one for eachsplit.
# average is found as follows: 
kfold = StratifiedKFold(n_splits=5)
splits = kfold.split(d, d['CLASS'])
cvscores = []
for train_idx, test_idx, in splits:
    scores = train_and_test(train_idx, test_idx)
    cvscores.append(scores[1] * 100)

print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 94.39%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 95.40%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 94.37%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 93.35%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 95.14%
94.53% (+/- 0.72%)
