In [9]:
from Functions import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [10]:
# Text labels that become the binary target. Any other label
# ('No emotion toward brand or product', "I can't tell", ...) is absent from
# this mapping, so .map() turns it into NaN and the dropna() below removes it.
emotion_codes = {'Negative emotion': 0, 'Positive emotion': 1}

# Built as one chain (no inplace mutation) so re-running the cell always
# starts again from the CSV on disk.
df = (
    pd.read_csv('data/TweetsOriginal.csv', encoding = 'ISO-8859-1')
    .rename(columns = {'tweet_text': 'tweet',
                       'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'})
    .drop(columns = 'emotion_in_tweet_is_directed_at')
    # .map() yields a numeric column directly, instead of relying on how
    # replace() treats None values in its mapping.
    .assign(emotion = lambda frame: frame['emotion'].map(emotion_codes))
    # Keep only rows that have both a usable label and tweet text.
    .dropna(subset = ['emotion', 'tweet'])
)

In [11]:
# Class balance after the label clean-up, followed by a preview of the frame.
print('Original Value Counts', df['emotion'].value_counts(), sep = '\n')
df.head()

Original Value Counts
1.0    2978
0.0     570
Name: emotion, dtype: int64


Unnamed: 0,tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0.0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,1.0
2,@swonderlin Can not wait for #iPad 2 also. The...,1.0
3,@sxsw I hope this year's festival isn't as cra...,0.0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,1.0


In [12]:
# Split the labelled tweets by class (1 = positive, 0 = negative).
pos_df = df[df.emotion == 1]
neg_df = df[df.emotion == 0]

# Down-sample the positive class, without replacement, to 600 rows so it is
# roughly the size of the negative class; the seed keeps the draw repeatable.
resamp_pos = resample(pos_df, n_samples = 600, replace = False, random_state = 10)

# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0. ignore_index gives the combined frame a fresh 0..n-1 index.
new_df = pd.concat([neg_df, resamp_pos], ignore_index = True)

# Stratified 85/15 split so both partitions keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(new_df.tweet, new_df.emotion, stratify = new_df.emotion, random_state = 10,
                                                   train_size = .85)

In [13]:
# Report the class balance of the resampled frame and of each split,
# with a rule line between the sections.
divider = '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

print('Resampled Value Counts', new_df['emotion'].value_counts(), divider, sep = '\n')

print('Xtrain Value Value Counts', y_train.value_counts(), divider, sep = '\n')

print('XTest value Counts', y_test.value_counts(), sep = '\n')

Resampled Value Counts
1.0    600
0.0    570
Name: emotion, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Xtrain Value Value Counts
1.0    510
0.0    484
Name: emotion, dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
XTest value Counts
1.0    90
0.0    86
Name: emotion, dtype: int64


In [14]:
# Sizes shared by the tokenizer and the model: vocabulary cap, embedding
# width, and the fixed length every sequence is padded/truncated to.
vocab_size, embedding_dim, max_length = 1000, 16, 280

# Both truncation and padding are applied at the end of each sequence.
trunc_type = padding_type = 'post'

# Placeholder token the tokenizer uses for out-of-vocabulary words.
oov_tok = '<OOV>'

In [26]:
# Fit the vocabulary on the training tweets only, so no test text leaks into it.
tokenizer = Tokenizer(oov_token = oov_tok, num_words = vocab_size)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

# The same padding/truncation settings are applied to both splits.
pad_options = dict(maxlen = max_length, padding = padding_type, truncating = trunc_type)

train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train), **pad_options)
test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test), **pad_options)

# Plain arrays of the 0/1 targets for Keras.
train_labels, test_labels = y_train.values, y_test.values

### Neural Network

In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten 
from keras.layers import Conv1D, GlobalMaxPooling1D, BatchNormalization, Embedding, LSTM, Bidirectional
from keras import regularizers
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [28]:
def build_model():
    """Return a compiled Embedding -> LSTM -> Dense(32) -> sigmoid classifier.

    Layer sizes come from the notebook-level ``vocab_size``, ``embedding_dim``
    and ``max_length`` settings. The model is compiled with binary
    cross-entropy, the Adam optimizer and an accuracy metric.
    """
    # NOTE(review): the inputs are post-padded to max_length and this Embedding
    # does not mask the padding value, so the LSTM also steps over the pad
    # tokens. Consider masking or pre-padding; confirm backend support first.
    layers = [
        Embedding(vocab_size, embedding_dim, input_length = max_length),
        LSTM(embedding_dim, return_sequences = False),
        Dense(32, activation = 'relu'),
        Dense(1, activation = 'sigmoid'),
    ]
    network = Sequential(layers)

    network.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    return network

In [29]:
model = build_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 280, 16)           16000     
_________________________________________________________________
lstm_4 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
Total params: 18,689
Trainable params: 18,689
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Training budget for this run.
epochs = 10
batch_size = 16

# Stop once val_loss has not improved for 5 epochs, and keep only the weights
# from the epoch with the best val_loss.
early_stopping = EarlyStopping(patience = 5, monitor = 'val_loss', verbose = 1)
model_checkpoint = ModelCheckpoint(
    '../Weights/Test-ModelCheckpointWeights.h5',
    monitor = 'val_loss', save_best_only = True, verbose = 1,
)
# NOTE(review): lr_plat is created but is not part of `calls`, so it has no
# effect on this fit. Confirm whether that is intended.
lr_plat = ReduceLROnPlateau(patience = 3, mode = 'min')

calls = [early_stopping, model_checkpoint]

model_history = model.fit(
    train_seq, train_labels,
    validation_data = (test_seq, test_labels),
    epochs = epochs, batch_size = batch_size, callbacks = calls,
)

Train on 994 samples, validate on 176 samples
Epoch 1/10


INFO:plaidml:Analyzing Ops: 1070 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 3019 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 5375 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 8489 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 13249 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 18526 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 22750 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 25527 of 27941 operations complete




INFO:plaidml:Analyzing Ops: 1486 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 3523 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 6084 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 9563 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 14710 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 20036 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 23527 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 26321 of 27941 operations complete
INFO:plaidml:Analyzing Ops: 3945 of 11549 operations complete
INFO:plaidml:Analyzing Ops: 9902 of 11549 operations complete



Epoch 00001: val_loss improved from inf to 0.69290, saving model to ../Weights/Test-ModelCheckpointWeights.h5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.69290
Epoch 3/10

Epoch 00003: val_loss improved from 0.69290 to 0.69289, saving model to ../Weights/Test-ModelCheckpointWeights.h5
Epoch 4/10

In [None]:
def get_nn_model(x, model_type, act, embed_input_dim = 5000, embed_output_dim = 100):
    """Build and compile a binary classifier of the requested architecture.

    Parameters
    ----------
    x : array-like
        Model inputs. Only the first sample is inspected, to size the input
        layer: its length for the embedding-based models, its ``.shape`` for
        the 'normal' model.
    model_type : str
        One of 'embedding', 'normal', 'cnn' or 'LSTM' (case-sensitive).
    act : str
        Activation used by the hidden layers.
    embed_input_dim : int, optional
        Vocabulary size given to the Embedding layer (default 5000).
    embed_output_dim : int, optional
        Embedding vector width (default 100).

    Returns
    -------
    A compiled ``Sequential`` model ending in a single sigmoid unit, using
    binary cross-entropy, the Adam optimizer and an accuracy metric.

    Raises
    ------
    ValueError
        If ``model_type`` is not a supported architecture or ``x`` is empty.
    """
    supported = ('embedding', 'normal', 'cnn', 'LSTM')
    # Fail fast: an unknown type used to fall through every branch and return
    # a model made of nothing but the output layer.
    if model_type not in supported:
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {supported}')
    if len(x) == 0:
        raise ValueError('x must contain at least one sample to size the input layer')

    # Take the first sample by position. x[0] on a pandas object is a label
    # lookup and fails when the index has no label 0.
    first_sample = np.asarray(x)[0]

    def frozen_embedding():
        # NOTE(review): the layer is frozen (trainable = False) but no
        # pretrained weights are loaded in this function. Confirm that they
        # are set elsewhere or that freezing random vectors is intended.
        return Embedding(embed_input_dim, embed_output_dim, input_length = len(first_sample),
                         trainable = False, name = 'Input')

    model = Sequential()

    if model_type == 'embedding':
        model.add(frozen_embedding())
        model.add(Flatten())
        model.add(Dense(32, activation = act))

    elif model_type == 'normal':
        model.add(Dense(32, activation = act, input_shape = first_sample.shape, name = 'Input'))
        model.add(Dense(64, activation = act))
        model.add(Dense(128, activation = act))

    elif model_type == 'cnn':
        # The Embedding is restored here: Conv1D needs a 3-D input and, as the
        # first layer with no input shape, left the model unbuilt.
        model.add(frozen_embedding())
        model.add(Conv1D(128, 5, activation = act))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(64, activation = act))

    elif model_type == 'LSTM':
        model.add(frozen_embedding())
        model.add(LSTM(128, activation = act))

    model.add(Dense(1, activation = 'sigmoid', name = 'OutputLayer'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model



### Normal Model

In [None]:
# Fully-connected ('normal') baseline with sigmoid hidden activations.
model_type = 'normal'
act = 'sigmoid'
# Build and fit on the padded token sequences. x_train / x_test hold the raw
# tweet strings, which a Keras model cannot consume.
model = get_nn_model(train_seq, model_type, act)
# summary() prints the table itself and returns None, so no print() wrapper.
model.summary()

# Stop after 20 epochs without val_loss improvement and keep only the weights
# from the best epoch, in a file named after the model type and activation.
early_stopping = EarlyStopping(monitor='val_loss', verbose = 1, patience=20)
model_checkpoint = ModelCheckpoint(f'../Weights/{model_type.upper()}_{act}-ModelCheckpointWeights.h5', verbose = 1, save_best_only=True,
                                  monitor = 'val_loss')
# NOTE(review): lr_plat is created but is not included in `callbacks`, so it
# has no effect on this fit. Confirm whether that is intended.
lr_plat = ReduceLROnPlateau(patience = 3, mode = 'min')

callbacks = [early_stopping, model_checkpoint]
epochs = 100
batch_size= 32

model_history = model.fit(train_seq, train_labels, epochs = epochs, batch_size = batch_size,
                         validation_data = (test_seq, test_labels), callbacks = callbacks)

In [None]:
# plot_loss is not defined in this notebook; presumably it comes from
# `from Functions import *` and draws the curves stored in model_history,
# labelled with the model type and activation (TODO confirm against Functions).
plot_loss(model_history, model_type, act)

In [None]:
# Rebuild the architecture and restore the best checkpoint written during
# training. The padded sequences are passed because x_train / x_test are raw
# tweet strings and cannot size the input layer or be fed to the model.
model = get_nn_model(train_seq, model_type, act)
model.load_weights(f'../Weights/{model_type.upper()}_{act}-ModelCheckpointWeights.h5')

# get_roc_auc is not defined in this notebook (presumably from Functions).
# It receives the same encoded inputs the model was trained on.
get_roc_auc(model,model_type, act, train_seq, y_train, test_seq, y_test)

In [None]:
# Predict on the padded token sequences (x_test / x_train are raw strings) and
# flatten the (n, 1) sigmoid outputs into 1-D probability vectors.
y_test_prob = model.predict(test_seq).ravel()
y_train_prob = model.predict(train_seq).ravel()

# plot_cm is not defined in this notebook (presumably from Functions). It is
# given the probabilities plus three decision thresholds to compare.
plot_cm(y_train = y_train, y_test = y_test, y_train_prob = y_train_prob,
                      y_test_prob = y_test_prob,classes = ['Negative', 'Positive'], thresholds = [.2, .5,.6],
        model_type = model_type, act = act)

### CNN Model

In [None]:
#normal
model_type = 'cnn'
act = 'relu'
model = get_nn_model(x_train, model_type, act)
print(model.summary())

#cnn
early_stopping = EarlyStopping(monitor='val_loss', verbose = 1, patience=5)
model_checkpoint = ModelCheckpoint(f'../Weights/{model_type.upper()}_{act}-ModelCheckpointWeights.h5', verbose = 1, save_best_only=True,
                                  monitor = 'val_loss')
lr_plat = ReduceLROnPlateau(patience = 3, mode = 'min')

callbacks = [early_stopping, model_checkpoint]
epochs = 100 
batch_size= 32

model_history = model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size, 
                         validation_data = (x_test, y_test), callbacks = callbacks)

In [None]:
# plot_loss is not defined in this notebook; presumably it comes from
# `from Functions import *` and draws the curves stored in model_history,
# labelled with the model type and activation (TODO confirm against Functions).
plot_loss(model_history, model_type, act)

In [None]:
# Rebuild the architecture and restore the best checkpoint written during
# training. The padded sequences are passed because x_train / x_test are raw
# tweet strings and cannot size the input layer or be fed to the model.
model = get_nn_model(train_seq, model_type, act)
model.load_weights(f'../Weights/{model_type.upper()}_{act}-ModelCheckpointWeights.h5')

# get_roc_auc is not defined in this notebook (presumably from Functions).
# It receives the same encoded inputs the model was trained on.
get_roc_auc(model,model_type, act, train_seq, y_train, test_seq, y_test)

In [None]:
# Predict on the padded token sequences (x_test / x_train are raw strings) and
# flatten the (n, 1) sigmoid outputs into 1-D probability vectors.
y_test_prob = model.predict(test_seq).ravel()
y_train_prob = model.predict(train_seq).ravel()

# plot_cm is not defined in this notebook (presumably from Functions). It is
# given the probabilities plus three decision thresholds to compare.
plot_cm(y_train = y_train, y_test = y_test, y_train_prob = y_train_prob,
                      y_test_prob = y_test_prob,classes = ['Negative', 'Positive'], thresholds = [.2, .5,.6],
        model_type = model_type, act = act)

In [None]:
# Deliberate failure: raises AssertionError so execution halts at this cell.
# NOTE(review): presumably a guard to keep "Run All" from continuing past this
# point; confirm nothing below is still needed before removing it.
assert False