In [1]:
# Location of the training CSV, given relative to the notebook's working directory.
PATH_TO_DATA = 'input/train-balanced-sarcasm.csv'

# some necessary imports
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Utility
import re
from collections import Counter
import logging
import time
import pickle
import itertools
nltk.download('stopwords')

# DATASET
DATASET_ENCODING = "ISO-8859-1"

# TEXT CLEANING
# Any run of characters other than ASCII letters and digits is replaced by a
# single space when this pattern is applied.
TEXT_CLEANING_RE = "[^A-Za-z0-9]+"

# Stored as a set: preprocess() tests every token for membership, and a set
# lookup is constant-time whereas scanning the original list is linear.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

train_df = pd.read_csv(PATH_TO_DATA, encoding=DATASET_ENCODING)

def preprocess(text, stem=False):
    """Normalise a raw comment into a space-separated string of kept tokens.

    The text is lower-cased, every run of characters matched by
    TEXT_CLEANING_RE is replaced by a space, stop words are dropped and,
    optionally, the remaining tokens are stemmed.

    Args:
        text: The raw comment. Non-string values are converted with str().
            Missing values (None or float NaN) produce an empty string.
        stem: When true, each kept token is passed through stemmer.stem().

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).
    """
    # Missing CSV cells reach this function as float NaN. Without this guard
    # str() would turn them into the literal word "nan" (or "none" for None),
    # which would then be treated as real comment text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

# Keep only the text and the target. The explicit .copy() makes df an
# independent frame, so the column assignment below no longer writes into a
# slice of train_df (the cause of pandas' SettingWithCopyWarning).
df = train_df[['comment', 'label']].copy()

df['comment'] = df['comment'].apply(preprocess)

# Number of leading rows used for the training lists and the number of rows
# that follow them for the *_test lists.
N_TRAIN = 90000
N_TEST = 10000
if len(df) < N_TRAIN + N_TEST:
    raise ValueError(
        'Expected at least %d rows, found %d' % (N_TRAIN + N_TEST, len(df))
    )

# Positional slices replace the element-by-element append loops.
texts = df['comment'].iloc[:N_TRAIN].tolist()
labels = df['label'].iloc[:N_TRAIN].tolist()
texts_test = df['comment'].iloc[N_TRAIN:N_TRAIN + N_TEST].tolist()
labels_test = df['label'].iloc[N_TRAIN:N_TRAIN + N_TEST].tolist()
    
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 15  # Each comment is padded/truncated to 15 token indices (see pad_sequences below)
max_words = 10000  # Vocabulary cap passed to the tokenizer as num_words; also the width of the one-hot matrix

# Fit the vocabulary on the training comments in `texts`.
tokenizer = Tokenizer(num_words=max_words)  # https://keras.io/preprocessing/text/
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # This turns strings into lists of integer indices.

# word_index holds every distinct token seen during fitting, not only the
# most frequent max_words, so the count printed here can exceed max_words.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Drop samples whose sequence came out empty from texts_to_sequences, keeping
# labels aligned with sequences. One filtering pass replaces repeated `del`
# on a list (quadratic) and also works if `labels` is already an array.
keep = [idx for idx, seq in enumerate(sequences) if len(seq) > 0]
sequences = [sequences[idx] for idx in keep]
labels = [labels[idx] for idx in keep]

# Bring every remaining sequence to exactly `maxlen` entries.
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with one shared permutation so they stay aligned.
# The permutation comes from a seeded generator, so a fresh run of the
# notebook reproduces the same ordering.
SHUFFLE_SEED = 0
indices = np.random.RandomState(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Multi-hot encoding: onehot[i, w] is 1 when word index w occurs anywhere in
# row i of `data`. Entries equal to 0 are skipped, so column 0 stays all-zero.
# The matrix is allocated as float32 because the values are only 0/1 and the
# features are cast to float32 before training; this halves its memory use.
onehot = np.zeros((data.shape[0], max_words), dtype=np.float32)
# Coordinates of every non-zero entry of `data`; the value stored there is the
# column to switch on for that row. Replaces the nested Python loops.
rows, cols = np.nonzero(data)
onehot[rows, data[rows, cols]] = 1
            
from sklearn.model_selection import train_test_split
# accuracy_score is taken from the public sklearn.metrics namespace: the
# sklearn.metrics.classification module is private and cannot be imported in
# current scikit-learn releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from models import SupervisedDBNClassification

# Features are the multi-hot vectors, targets are the labels
X, Y = onehot, labels

# Cast to float32 (no rescaling is applied). copy=False avoids duplicating
# the matrix when it already has that dtype.
X = X.astype(np.float32, copy=False)

# Hold out 2/9 of the samples for testing; random_state pins the split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=2/9, random_state=0)
print (X_train.shape)
print (X_test.shape)
print (Y_train.shape)
print (Y_test.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liyuantan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 46378 unique tokens.
Shape of data tensor: (86775, 15)
Shape of label tensor: (86775,)
(67491, 10000)
(19284, 10000)
(67491,)
(19284,)


In [2]:
# Training
# The hyper-parameters are gathered in one mapping and unpacked into the
# constructor, so the configuration can be read and changed in one place.
# NOTE(review): in the recorded run the reconstruction error of what appears
# to be the first RBM rises after epoch 3 -- confirm learning_rate_rbm is OK.
dbn_params = dict(
    hidden_layers_structure=[16, 16],
    learning_rate_rbm=0.05,
    learning_rate=0.1,
    n_epochs_rbm=10,
    n_iter_backprop=10,
    batch_size=16,
    activation_function='relu',
    dropout_p=0.1,
)
classifier = SupervisedDBNClassification(**dbn_params)
classifier.fit(X_train, Y_train)

# Test
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 4.601912
>> Epoch 2 finished 	RBM Reconstruction error 4.463560
>> Epoch 3 finished 	RBM Reconstruction error 4.368040
>> Epoch 4 finished 	RBM Reconstruction error 4.429434
>> Epoch 5 finished 	RBM Reconstruction error 4.651951
>> Epoch 6 finished 	RBM Reconstruction error 5.275913
>> Epoch 7 finished 	RBM Reconstruction error 5.679036
>> Epoch 8 finished 	RBM Reconstruction error 5.862816
>> Epoch 9 finished 	RBM Reconstruction error 6.205753
>> Epoch 10 finished 	RBM Reconstruction error 6.590371
>> Epoch 1 finished 	RBM Reconstruction error 3.939355
>> Epoch 2 finished 	RBM Reconstruction error 2.650579
>> Epoch 3 finished 	RBM Reconstruction error 1.820835
>> Epoch 4 finished 	RBM Reconstruction error 4.330629
>> Epoch 5 finished 	RBM Reconstruction error 3.512999
>> Epoch 6 finished 	RBM Reconstruction error 2.357424
>> Epoch 7 finished 	RBM Reconstruction error 2.371319
>> Epoch 8 finished 	RBM Reconstructi