#### In this notebook I will be trying to fit my data to a neural network. I will be following some example models for applying deep learning to NLP datasets as described at https://nlpforhackers.io/keras-intro/

First I will load my cleaned dataset and perform a train-test split, then vectorize my data.

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
sent_token = nltk.sent_tokenize
import csv  
from nltk import sent_tokenize, word_tokenize, pos_tag
import re
from sklearn.feature_extraction.text import CountVectorizer
wpt = nltk.WordPunctTokenizer()


file_path_comments = r'~/Documents/Springboard/Springboard/Data/cleaned_comment_data.csv'

#file_path_comments = r'/mnt/c/Users/msteele9/Documents/Springboard/Springboard/Data/cleaned_comment_data.csv'
clean_comments = pd.read_csv(file_path_comments, index_col = False)

In [2]:
clean_comments['commentBody'].head(5)

0    This project makes me happy to be a 30+ year T...
1    Stunning photos and reportage. Infuriating tha...
2    Brilliant work from conception to execution. I...
3    NYT reporters should provide a contributor's l...
4       Could only have been done in print. Stunning. 
Name: commentBody, dtype: object

In [3]:
from sklearn.model_selection import train_test_split
import random
from datetime import datetime

X = clean_comments['commentBody']
y = clean_comments['recommendations']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random.seed(datetime.now()))

In [4]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(len(X_train) == len(y_train))

(32000,)
(8000,)
(32000,)
(8000,)
True


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
 

# Setting the vectorizer just like we would set a model 
cvec = CountVectorizer(binary=True, stop_words=stopwords.words('english'), 
                             lowercase=True, max_features=5000)
# Fitting the vectorizer on our training data 
cvec.fit(X_train)

X_train_df = pd.DataFrame(cvec.transform(X_train).todense(),
                       columns=cvec.get_feature_names())

X_test_df = pd.DataFrame(cvec.transform(X_test).todense(),
                      columns=cvec.get_feature_names())

word_counts = X_train_df.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

br           12645
trump         8269
people        6197
one           5228
would         5180
like          4800
us            3827
get           3501
even          3152
many          3146
time          3075
think         2924
way           2707
president     2702
world         2637
good          2563
could         2549
well          2476
know          2445
see           2440
dtype: int64

In [6]:
import keras
import tensorflow as tf


config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 5} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)


from keras.models import Sequential
from keras.layers import Dense
 
model = Sequential()
 
model.add(Dense(units=500, activation='relu', input_dim=len(cvec.get_feature_names())))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 500)               2500500   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 501       
Total params: 2,501,001
Trainable params: 2,501,001
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(X_train_df, y_train, 
          epochs=3, batch_size=100, verbose=1, 
          validation_data=(X_train_df, y_train))

Train on 32000 samples, validate on 32000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x210b9ce56d8>

In [8]:
scores = model.evaluate(cvec.transform(X_test), y_test, verbose=1)
print("Accuracy:", scores[1])  # Accuracy: 0.875

Accuracy: 0.001375


In [9]:
word2idx = {word: idx for idx, word in enumerate(cvec.get_feature_names())}
tokenize = cvec.build_tokenizer()
preprocess = cvec.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes
 
print(to_sequence(tokenize, preprocess, word2idx, "This is an important test!"))  # [2269, 4453]
X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_train]
print(X_train_sequences[0])
 

[2318, 4513]
[3080, 4683, 3779, 4231, 1742, 1842, 2681, 2211, 12, 599, 599, 2062]


In [10]:
# Compute the max lenght of a text
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)
 
from keras.preprocessing.sequence import pad_sequences
N_FEATURES = len(cvec.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)
print(X_train_sequences[0])

MAX_SEQ_LENGHT= 236
[5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5000 5

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
 
model = Sequential()
model.add(Embedding(len(cvec.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 236, 64)           320064    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 232, 64)           20544     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 46, 64)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2944)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                188480    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 529,153
Trainable params: 529,153
Non-trainable params: 0
_________________________________________________________________
None

In [12]:
model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=3, batch_size=512, verbose=1,
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

Train on 31900 samples, validate on 100 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x210b9e1aa90>

In [13]:
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)

In [14]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) # 0.8766

Accuracy: 0.0


In [15]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
 
model = Sequential()
model.add(Embedding(len(cvec.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGHT))
model.add(LSTM(64))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 236, 64)           320064    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 353,153
Trainable params: 353,153
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

Train on 31900 samples, validate on 100 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x211471a97b8>

In [17]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) # 0.875

Accuracy: 0.0


In [18]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_md')
 
EMBEDDINGS_LEN = len(nlp.vocab['apple'].vector)
print("EMBEDDINGS_LEN=", EMBEDDINGS_LEN)  # 300
 
embeddings_index = np.zeros((len(cvec.get_feature_names()) + 1, EMBEDDINGS_LEN))
for word, idx in word2idx.items():
    try:
        embedding = nlp.vocab[word].vector
        embeddings_index[idx] = embedding
    except:
        pass

ModuleNotFoundError: No module named 'spacy'

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
 
model = Sequential()
model.add(Embedding(len(cvec.get_feature_names()) + 1,
                    EMBEDDINGS_LEN,  # Embedding size
                    weights=[embeddings_index],
                    input_length=MAX_SEQ_LENGHT,
                    trainable=False))
model.add(LSTM(300))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [None]:
model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=1, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-100:], y_train[-100:]))
 
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1])  # 0.8508

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
 
# Shuffle the data and then split it, keeping 20% aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
vectorizer = CountVectorizer(lowercase=True)
vectorizer.fit(X_train)
 
classifier = MLPClassifier(hidden_layer_sizes=(100,))
classifier.fit(vectorizer.transform(X_train), y_train)
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test))  # Score: 0.8816

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array
from sklearn.preprocessing import LabelBinarizer
 
 
class SimpleNeuralNetwork(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_layer_size=100, learning_rate=.1, epochs=1000, debug_print_epoch=10):
        assert hidden_layer_size > 0
        self.hidden_layer_size_ = hidden_layer_size
        self.learning_rate_ = learning_rate
        self.epochs_ = epochs
        self.debug_print_epoch_ = debug_print_epoch
 
    def fit(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=True)  # Makes sure the X and y play nice
 
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        # In this particular case, we'll make sure the number of classes is 2
        assert n_classes == 2
 
        n_samples, n_features = X.shape
 
        self.binarizer_ = LabelBinarizer().fit(y)
        Y_binary = self.binarizer_.transform(y)
 
        # Compute the weight matrices sizes and init with small random values
 
        # Hidden Layer
        self.A1_ = np.random.randn(n_features, self.hidden_layer_size_)
        # Output Layer
        self.A2_ = np.random.randn(self.hidden_layer_size_, 1)
 
        # ~~ SKIP TRAINING FOR NOW ~~
 
    def predict_proba(self, X):
        """ Output probabilities for each sample"""
        # make sure X is of an accepted type
        X = check_array(X, accept_sparse='csr')  
 
        # Apply linear function at the hidden layer
        Y_hidden = X.dot(self.A1_)
 
        # Apply sigmoid at the output layer
        Y_output = sigmoid(Y_hidden.dot(self.A2_))
 
        return np.hstack((1 - Y_output, Y_output))
 
    def predict(self, X):
        """ Output only the most likely class for each sample """
        scores = self.predict_proba(X)
        indices = scores.argmax(axis=1)
        return self.binarizer_.inverse_transform(indices)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
 
# Shuffle the data and then split it, keeping 20% aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vectorizer = CountVectorizer(lowercase=True, binary=True)
vectorizer.fit(X_train)
 
classifier = SimpleNeuralNetwork(hidden_layer_size=100, epochs=500, learning_rate=0.1)
classifier.fit(vectorizer.transform(X_train), list(y_train.values))
 
print("Score:", classifier.score(vectorizer.transform(X_test), y_test))  # 0.5056