In [None]:
# Load the autoreload extension and reload imported modules before each cell runs,
# so edits to local .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [1]:
# Basic imports
import os
import re
import pandas as pd
import numpy as np

# Sk learn preprocessors
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Activation, Conv1D, GlobalMaxPooling1D, BatchNormalization
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers
from keras import backend as K
from keras.utils import to_categorical

# Sklearn utility functions
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load base cnn
from functions.models import get_base_cnn


Using TensorFlow backend.


In [None]:
# 2 CLASS CASE

In [None]:
# Load the manually labelled tweets (only the text and label columns are needed)
my_path = 'data/sentiment_data_1500_manual.csv'
my_data = pd.read_csv(my_path, usecols = ['text', 'label'], encoding = 'latin-1')

# Keep every row whose label is not 0 (the 'neutral' label) and renumber the index
my_data = my_data[my_data.label != 0].reset_index(drop = True)

# Recode the label -1 as 0
is_minus_one = my_data.label == -1
my_data.loc[is_minus_one, 'label'] = 0

In [None]:
## Global preprocessing

# Replace upper letters with their lower letter counterparts
my_data['text'] = my_data['text'].apply(lambda x: x.lower())

# Remove everything that is not an ASCII letter, a digit or whitespace.
# The upper-case range has to be spelled `A-Z`: the range `A-z` also covers the ASCII
# characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those would survive the cleanup.
# A raw string keeps `\s` from being treated as a (deprecated) string escape.
my_data['text'] = my_data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

In [None]:
## Very easy pre processing according to sklearns CountVectorizer

# Split the data into one list containing the tweets and one containing the labels
my_tweets = my_data['text'].values.tolist()
my_labels = my_data['label'].values.tolist()

# Learn the vocabulary and build the sparse document-term count matrix in a single pass
# NOTE(review): the vocabulary is learned from all tweets before the train/val split below
my_count_vec = CountVectorizer(analyzer = 'word')
my_tweets = my_count_vec.fit_transform(my_tweets)

# Densify the sparse matrix once and reuse it for every summary statistic
# (calling .toarray() per statistic would rebuild the full dense matrix each time)
my_tweets_dense = my_tweets.toarray()

# Print this 
print('\n')
print('My tweets matrix:')
print('\n')
print(my_tweets_dense)
print('\n')
print('min:', np.amin(my_tweets_dense), 'max:', np.amax(my_tweets_dense))
print('\n')
print('mean:', np.mean(my_tweets_dense))
print('\n')
print('unique elements:', len(np.unique(my_tweets_dense)))
print('\n')
print('dimensions:', my_tweets_dense.shape)
print('\n')

# Generate a train and val split (seeded, so the same rows land in each part on every run)
my_train_prop = 0.66
X_train, X_val, y_train, y_val = train_test_split(my_tweets, my_labels, 
                                                  train_size = my_train_prop, test_size  = 1 - my_train_prop, 
                                                  random_state = 1)

In [None]:
# Seed for the randomised estimators (tree, forest, boosting). The train/val split is already
# seeded; without seeding the models too, the reported accuracies change from run to run.
my_model_seed = 1

# Create a Naive Bayes model
my_bayes_mod = MultinomialNB(alpha = 1, fit_prior = True)
my_bayes_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_bayes_mod_acc = accuracy_score(y_val, my_bayes_mod.predict(X_val))
print('Naive Bayes accuracy:', my_bayes_mod_acc)

# Create a logistic regression model
my_reg_mod = LogisticRegression(penalty = 'l2', C = 1, solver = 'liblinear')
my_reg_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_reg_mod_acc = accuracy_score(y_val, my_reg_mod.predict(X_val))
print('Logistic regression accuracy:', my_reg_mod_acc)

# Create classification tree
my_tree_mod = tree.DecisionTreeClassifier(criterion = 'gini', random_state = my_model_seed)
my_tree_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_tree_mod_acc = accuracy_score(y_val, my_tree_mod.predict(X_val))
print('Classification tree accuracy:', my_tree_mod_acc)

# Create random forest
my_forest_mod = RandomForestClassifier(criterion = 'gini', n_estimators = 500,
                                       random_state = my_model_seed)
my_forest_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_forest_mod_acc = accuracy_score(y_val, my_forest_mod.predict(X_val))
print('Random forest accuracy:', my_forest_mod_acc)

# Creating a gradient boosting model
my_boosting_mod = GradientBoostingClassifier(loss = 'deviance', learning_rate = 0.01, n_estimators = 500,
                                             random_state = my_model_seed)
my_boosting_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_boosting_mod_acc = accuracy_score(y_val, my_boosting_mod.predict(X_val))
print('Gradient boosting accuracy:', my_boosting_mod_acc)

# Create support vector machine
my_svm_mod = SVC(C = 100, kernel = 'rbf', gamma = 'auto')
my_svm_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_svm_mod_acc = accuracy_score(y_val, my_svm_mod.predict(X_val))
print('Support vector machine accuracy:', my_svm_mod_acc)

In [None]:
## Slightly more sophisticated pre processing according to sklearns CountVectorizer and tf-idf 

# Split the data into one list containing the tweets and one containing the labels
my_tweets = my_data['text'].values.tolist()
my_labels = my_data['label'].values.tolist()

# Learn the vocabulary and build the sparse document-term count matrix in a single pass
my_count_vec = CountVectorizer(analyzer = 'word')
my_tweets = my_count_vec.fit_transform(my_tweets)

# Re-weight the raw counts with tf-idf
# NOTE(review): vocabulary and idf weights are learned from all tweets before the train/val split below
my_tfidf = TfidfTransformer()
my_tweets = my_tfidf.fit_transform(my_tweets)

# Densify the sparse matrix once and reuse it for every summary statistic
# (calling .toarray() per statistic would rebuild the full dense matrix each time)
my_tweets_dense = my_tweets.toarray()

# Print this 
print('\n')
print('My tweets matrix:')
print('\n')
print(my_tweets_dense)
print('\n')
print('min:', np.amin(my_tweets_dense), 'max:', np.amax(my_tweets_dense))
print('\n')
print('mean:', np.mean(my_tweets_dense))
print('\n')
print('unique elements:', len(np.unique(my_tweets_dense)))
print('\n')
print('dimensions:', my_tweets_dense.shape)
print('\n')

# Generate a train and val split (seeded, so the same rows land in each part on every run)
my_train_prop = 0.66
X_train, X_val, y_train, y_val = train_test_split(my_tweets, my_labels, 
                                                  train_size = my_train_prop, test_size  = 1 - my_train_prop, 
                                                  random_state = 1)

In [None]:
# Seed for the randomised estimators (tree, forest, boosting). The train/val split is already
# seeded; without seeding the models too, the reported accuracies change from run to run.
my_model_seed = 1

# Create a Naive Bayes model
my_bayes_mod = MultinomialNB(alpha = 1, fit_prior = True)
my_bayes_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_bayes_mod_acc = accuracy_score(y_val, my_bayes_mod.predict(X_val))
print('Naive Bayes accuracy:', my_bayes_mod_acc)

# Create a logistic regression model
my_reg_mod = LogisticRegression(penalty = 'l2', C = 1, solver = 'liblinear')
my_reg_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_reg_mod_acc = accuracy_score(y_val, my_reg_mod.predict(X_val))
print('Logistic regression accuracy:', my_reg_mod_acc)

# Create classification tree
my_tree_mod = tree.DecisionTreeClassifier(criterion = 'gini', random_state = my_model_seed)
my_tree_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_tree_mod_acc = accuracy_score(y_val, my_tree_mod.predict(X_val))
print('Classification tree accuracy:', my_tree_mod_acc)

# Create random forest
my_forest_mod = RandomForestClassifier(criterion = 'gini', n_estimators = 500,
                                       random_state = my_model_seed)
my_forest_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_forest_mod_acc = accuracy_score(y_val, my_forest_mod.predict(X_val))
print('Random forest accuracy:', my_forest_mod_acc)

# Creating a gradient boosting model
my_boosting_mod = GradientBoostingClassifier(loss = 'deviance', learning_rate = 0.01, n_estimators = 500,
                                             random_state = my_model_seed)
my_boosting_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_boosting_mod_acc = accuracy_score(y_val, my_boosting_mod.predict(X_val))
print('Gradient boosting accuracy:', my_boosting_mod_acc)

# Create support vector machine
my_svm_mod = SVC(C = 100, kernel = 'rbf', gamma = 'auto')
my_svm_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_svm_mod_acc = accuracy_score(y_val, my_svm_mod.predict(X_val))
print('Support vector machine accuracy:', my_svm_mod_acc)

In [None]:
## Very sophisticated Keras Tokenizer

# Tokenizer hyperparameters: `num_words` passed to the Tokenizer and the padded sequence length
tokenizer_max_features = 5000
tokenizer_maxlen = 40

# Labels are taken straight from the data frame
my_labels = my_data['label']

# Fit the tokenizer on the tweet texts
tokenizer = Tokenizer(num_words = tokenizer_max_features, split = ' ')
tokenizer.fit_on_texts(my_data['text'].values)

# Convert each tweet to a sequence of word indices and pad to a common length
my_tweets = pad_sequences(tokenizer.texts_to_sequences(my_data['text'].values),
                          maxlen = tokenizer_maxlen)

# Seeded train / validation split
my_train_prop = 0.66
X_train, X_val, y_train, y_val = train_test_split(
    my_tweets, my_labels,
    random_state = 1,
    test_size = 1 - my_train_prop,
    train_size = my_train_prop)

In [None]:
# Seed for the randomised estimators (tree, forest, boosting). The train/val split is already
# seeded; without seeding the models too, the reported scores change from run to run.
my_model_seed = 1

# NOTE(review): X_train / X_val here are the padded word-index sequences produced by the Keras
# tokenizer, not word counts -- confirm that fitting the models below on these features is intended.

# Create a Naive Bayes model
my_bayes_mod = MultinomialNB(alpha = 1, fit_prior = True)
my_bayes_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_bayes_mod_pred = my_bayes_mod.predict(X_val)
my_bayes_mod_acc = accuracy_score(y_val, my_bayes_mod_pred)
my_bayes_mod_conf = confusion_matrix(y_val, my_bayes_mod_pred)
print('Naive Bayes accuracy:', my_bayes_mod_acc)
print('Naive Bayes confmatrix:')
print(my_bayes_mod_conf)

# Create a logistic regression model
my_reg_mod = LogisticRegression(penalty = 'l2', C = 1, solver = 'liblinear')
my_reg_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_reg_mod_pred = my_reg_mod.predict(X_val)
my_reg_mod_acc = accuracy_score(y_val, my_reg_mod_pred)
my_reg_mod_conf = confusion_matrix(y_val, my_reg_mod_pred)
print('Logistic regression accuracy:', my_reg_mod_acc)
print('LogReg confmatrix:')
print(my_reg_mod_conf)

# Create classification tree
my_tree_mod = tree.DecisionTreeClassifier(criterion = 'gini', random_state = my_model_seed)
my_tree_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_tree_mod_pred = my_tree_mod.predict(X_val)
my_tree_mod_acc = accuracy_score(y_val, my_tree_mod_pred)
my_tree_mod_conf = confusion_matrix(y_val, my_tree_mod_pred)
print('Classification tree accuracy:', my_tree_mod_acc)
print('Classification Tree confmatrix:')
print(my_tree_mod_conf)


# Create random forest
my_forest_mod = RandomForestClassifier(criterion = 'gini', n_estimators = 500, verbose = 2,
                                       random_state = my_model_seed)
my_forest_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_forest_mod_pred = my_forest_mod.predict(X_val)
my_forest_mod_acc = accuracy_score(y_val, my_forest_mod_pred)
my_forest_mod_conf = confusion_matrix(y_val, my_forest_mod_pred)
print('Random forest accuracy:', my_forest_mod_acc)
print('Random forest confmatrix:')
print(my_forest_mod_conf)

# Creating a gradient boosting model
my_boosting_mod = GradientBoostingClassifier(loss = 'deviance', learning_rate = 0.01, n_estimators = 500, verbose = 2,
                                             random_state = my_model_seed)
my_boosting_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_boosting_mod_pred = my_boosting_mod.predict(X_val)
my_boosting_mod_acc = accuracy_score(y_val, my_boosting_mod_pred)
my_boosting_mod_conf = confusion_matrix(y_val, my_boosting_mod_pred)
print('Gradient boosting accuracy:', my_boosting_mod_acc)
print('Gradient boosting confmatrix:')
print(my_boosting_mod_conf)

# Create support vector machine
my_svm_mod = SVC(C = 100, kernel = 'rbf', gamma = 'auto', verbose = 2)
my_svm_mod.fit(X_train, y_train)
# Predict the validation data once and reuse the predictions for both metrics
my_svm_mod_pred = my_svm_mod.predict(X_val)
my_svm_mod_acc = accuracy_score(y_val, my_svm_mod_pred)
my_svm_mod_conf = confusion_matrix(y_val, my_svm_mod_pred)
print('Support vector machine accuracy:', my_svm_mod_acc)
print('Support vector machine confmatrix:')
print(my_svm_mod_conf)





## CNN
print('\n')
print('Training CNN..')
print('\n')

# Model hyperparameters (vocabulary size and sequence length are tied to the tokenizer settings)
my_max_features = tokenizer_max_features
my_embedding_dims = 50
my_maxlen = tokenizer_maxlen

my_filters = 16
my_kernel_size = 3
num_hidden_dims = 16

# Get model Architecture
my_cnn = get_base_cnn(max_features = my_max_features, 
                      embedding_dims = my_embedding_dims, 
                      maxlen = my_maxlen, 
                      num_conv_filters = my_filters, 
                      kernel_size = my_kernel_size, 
                      num_hidden_dims = num_hidden_dims)

# Compile the model
my_cnn.compile(loss = 'categorical_crossentropy',
               optimizer = 'adam',
               metrics = ['categorical_accuracy'])

# training hyperparameters
num_batch_size = 8
num_epochs = 10

# Fit the model
# NOTE(review): the model is compiled with 'categorical_crossentropy' while y_train / y_val are the
# integer labels from the split; the 3-class CNN section one-hot encodes its labels with
# to_categorical first -- confirm whether the same is required here for get_base_cnn's output layer.
my_model = my_cnn.fit(X_train, y_train,
                      batch_size = num_batch_size,
                      epochs = num_epochs,
                      validation_data = (X_val, y_val),
                      verbose = 1)

# The compiled metric is 'categorical_accuracy', so its validation history is stored under
# 'val_categorical_accuracy'; looking up 'val_acc' raises a KeyError with this compile call.
print('CNN accuracy:', my_model.history['val_categorical_accuracy'][-1])

In [None]:
# 3 CLASS CASE (BASE MODELS)

In [None]:
# Read the data and shift the labels -1 / 0 / 1 to 0 / 1 / 2 so that all three classes
# (including label 0, 'neutral') are kept and carry non-negative integer labels
my_path = 'data/sentiment_data_900_manual.csv'
my_data = pd.read_csv(my_path, usecols = ['text', 'label'], encoding = 'latin-1')

# Only the three known labels are shifted up by one; any other value is left untouched
needs_shift = (my_data.label == -1) | (my_data.label == 0) | (my_data.label == 1)
my_data.loc[needs_shift, 'label'] += 1

In [None]:
## Global preprocessing

# Replace upper letters with their lower letter counterparts
my_data['text'] = my_data['text'].apply(lambda x: x.lower())

# Remove everything that is not an ASCII letter, a digit or whitespace.
# The upper-case range has to be spelled `A-Z`: the range `A-z` also covers the ASCII
# characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those would survive the cleanup.
# A raw string keeps `\s` from being treated as a (deprecated) string escape.
my_data['text'] = my_data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

In [None]:
## Very sophisticated Keras Tokenizer

# Tokenizer hyperparameters: `num_words` passed to the Tokenizer and the padded sequence length
tokenizer_max_features = 5000
tokenizer_maxlen = 40

# Labels are taken straight from the data frame
my_labels = my_data['label']

# Fit the tokenizer on the tweet texts
tokenizer = Tokenizer(num_words = tokenizer_max_features, split = ' ')
tokenizer.fit_on_texts(my_data['text'].values)

# Convert each tweet to a sequence of word indices and pad to a common length
my_tweets = pad_sequences(tokenizer.texts_to_sequences(my_data['text'].values),
                          maxlen = tokenizer_maxlen)

# Seeded train / validation split
my_train_prop = 0.66
X_train, X_val, y_train, y_val = train_test_split(
    my_tweets, my_labels,
    random_state = 1338,
    test_size = 1 - my_train_prop,
    train_size = my_train_prop)

In [None]:
# Seed for the randomised estimators (tree, forest, subsampled boosting). The train/val split is
# already seeded; without seeding the models too, the reported accuracies change from run to run.
my_model_seed = 1338

# NOTE(review): X_train / X_val here are the padded word-index sequences produced by the Keras
# tokenizer, not word counts -- confirm that fitting the models below on these features is intended.

# Create a Naive Bayes model
my_bayes_mod = MultinomialNB(alpha = 1, fit_prior = True)
my_bayes_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_bayes_mod_acc = accuracy_score(y_val, my_bayes_mod.predict(X_val))
print('Naive Bayes accuracy:', my_bayes_mod_acc)

# Create a logistic regression model
my_reg_mod = LogisticRegression(penalty = 'l2', C = 1, solver = 'newton-cg', multi_class = 'multinomial')
my_reg_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_reg_mod_acc = accuracy_score(y_val, my_reg_mod.predict(X_val))
print('Logistic regression accuracy:', my_reg_mod_acc)

# Create classification tree
my_tree_mod = tree.DecisionTreeClassifier(criterion = 'gini', random_state = my_model_seed)
my_tree_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_tree_mod_acc = accuracy_score(y_val, my_tree_mod.predict(X_val))
print('Classification tree accuracy:', my_tree_mod_acc)

# Create random forest
my_forest_mod = RandomForestClassifier(criterion = 'gini', n_estimators = 500,
                                       random_state = my_model_seed)
my_forest_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_forest_mod_acc = accuracy_score(y_val, my_forest_mod.predict(X_val))
print('Random forest accuracy:', my_forest_mod_acc)

# Creating a gradient boosting model
my_boosting_mod = GradientBoostingClassifier(loss = 'deviance', n_estimators = 500,
                                             learning_rate = 0.01, subsample = 0.75, max_features = 30,
                                             random_state = my_model_seed)
my_boosting_mod.fit(X_train, y_train)
# Predict validation data and compute accuracy
my_boosting_mod_acc = accuracy_score(y_val, my_boosting_mod.predict(X_val))
print('Gradient boosting accuracy:', my_boosting_mod_acc)

In [None]:
# 3 CLASS CASE (CNN)

In [None]:
# Read the data for 3 classes and shift the labels -1 / 0 / 1 to 0 / 1 / 2
my_path = 'data/sentiment_data_1500_manual.csv'
my_data = pd.read_csv(my_path, usecols = ['text', 'label'], encoding = 'latin-1')

# Only the three known labels are shifted up by one; any other value is left untouched
needs_shift = (my_data.label == -1) | (my_data.label == 0) | (my_data.label == 1)
my_data.loc[needs_shift, 'label'] += 1

# One-hot encode the integer labels for the CNN
my_labels = to_categorical(my_data['label'].values)

In [None]:
## Very sophisticated Keras Tokenizer

# Tokenizer hyperparameters: `num_words` passed to the Tokenizer and the padded sequence length
tokenizer_max_features = 5000
tokenizer_maxlen = 40

# Fit the tokenizer on the tweet texts
tokenizer = Tokenizer(num_words = tokenizer_max_features, split = ' ')
tokenizer.fit_on_texts(my_data['text'].values)

# Convert each tweet to a sequence of word indices and pad to a common length
my_tweets = pad_sequences(tokenizer.texts_to_sequences(my_data['text'].values),
                          maxlen = tokenizer_maxlen)



In [None]:
# Seeded train / validation split: `my_train_prop` of the rows go to training, the rest to validation
my_train_prop = 0.66
X_train, X_val, y_train, y_val = train_test_split(
    my_tweets, my_labels,
    random_state = 1,
    test_size = 1 - my_train_prop,
    train_size = my_train_prop)

In [None]:
## CNN
# Banner (same output as three separate prints of '\n', the message and '\n')
print('\n', 'Training CNN..', '\n', sep = '\n')

# Training hyperparameters (used by the fit cell)
num_epochs = 20
num_batch_size = 8

# Model hyperparameters tied to the tokenizer settings
my_maxlen = tokenizer_maxlen
my_max_features = tokenizer_max_features

# Remaining model hyperparameters
my_embedding_dims = 5
my_filters = 64
my_kernel_size = 3
num_hidden_dims = 32

# Build the architecture from the project helper
my_cnn = get_base_cnn(
    max_features = my_max_features,
    embedding_dims = my_embedding_dims,
    maxlen = my_maxlen,
    num_conv_filters = my_filters,
    kernel_size = my_kernel_size,
    num_hidden_dims = num_hidden_dims,
)

# Compile with categorical cross-entropy, Adam and categorical accuracy as the tracked metric
my_cnn.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['categorical_accuracy'],
)






In [None]:
# Print the summary of the compiled CNN before training
my_cnn.summary()

In [None]:
# Train the CNN, evaluating on the validation split after every epoch.
# Note that `my_model` holds the object returned by `fit` (the training history), not the network.
my_model = my_cnn.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    epochs = num_epochs,
    batch_size = num_batch_size,
    verbose = 1,
)

# Report the validation accuracy of the last epoch
val_accuracy_per_epoch = my_model.history['val_categorical_accuracy']
print('CNN accuracy:', val_accuracy_per_epoch[-1])

In [None]:
# Compute confusion matrix
# The project helper has the same name as sklearn's `confusion_matrix`, which is imported at the top
# of the notebook and used by earlier cells. It is bound to an alias here so that re-running an
# earlier cell after this one still calls sklearn's function instead of the project helper.
# NOTE(review): the other project import in this notebook uses `functions.models`; confirm that
# `models` is the intended module path.
from models import confusion_matrix as cnn_confusion_matrix

# Prediction of the CNN (provides a "probability" for each label per validation sample)
prediction = my_cnn.predict(X_val)

# Choose the "safest" (highest scoring) label, and recover the true label from the one-hot y_val
my_pred = np.argmax(prediction, axis=1)
my_ground_truth = np.argmax(y_val, axis=1)

# Conf Matrix (third argument 3 -- presumably the number of classes; verify against the helper)
conf = cnn_confusion_matrix(my_pred, my_ground_truth, 3)

In [None]:
# Accuracy score: sum of the confusion matrix diagonal divided by the sum of all its entries
np.diag(conf).sum() / np.sum(conf)