In [0]:
# Import libs

import pandas as pd
import numpy as np
import keras
import nltk
import string
import re
from nltk.corpus import stopwords

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Conv1D,GlobalMaxPooling1D,Dense,MaxPooling1D,Flatten,Input
from keras.models import Model
from keras import regularizers

In [0]:
# Load train,dev and test datasets

def loadData(filepath):
  data = pd.read_csv(filepath)
  # Currently working on region level predictor
  data.drop(['lat','long','state'],axis=1,inplace=True)
  # Filter and remove anomaly data
  data = data[data.region != 'MX']  # Remove 'Mexico' records - Due to Reverse geocoding
  data = data[data.region != 'CA']  # Remove 'Canada' records - Due to Reverse geocoding
  data.dropna(inplace=True)
  data.reset_index(drop=True,inplace=True)
  return data
 
# train file and dev file has columns tweet, lat, long, state, region
# test file has columns uid (userid), tweet, lat, long, state, region
filename1= #TRAIN_PATH
filename2= #TEST_PATH
filename4= #DEV_PATH
glove_file = #EMBEDDING_FILE_PATH

train_df = pd.read_csv(filename1)
dev_df = pd.read_csv(filename4)
test_df = pd.read_csv(filename2)

frames = [train_df,dev_df]
merge_train_df = pd.concat(frames)
#print(merge_train_df.head())

test = loadData(#TEST_PATH)
tweet_test = new = test.drop('uid', axis=1)
user_test = test.drop('tweet', axis=1).drop_duplicates()

In [0]:
# Label Preparation

train_Y = merge_train_df['region']
dev_Y = dev_df['region']
test_Y = test_df['region']
#print(train_Y.head())

from sklearn.preprocessing import LabelBinarizer, LabelEncoder

encoder = LabelEncoder()
encoder.fit(train_Y)
train_Y = encoder.transform(train_Y)
dev_Y = encoder.transform(dev_Y)
test_Y = encoder.transform(test_Y)

num_classes = np.max(train_Y) + 1
#print(num_classes)

train_Y = utils.to_categorical(train_Y, num_classes)
dev_Y = utils.to_categorical(dev_Y, num_classes)
test_Y = utils.to_categorical(test_Y, num_classes)


In [0]:
# Data Preperation

"""
preprocess-twitter.py

Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu

Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

import sys
import regex as re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text.lower()

In [0]:
# Tokenization

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
x_train = merge_train_df["tweet"]
x_test = test_df["tweet"]
x_dev = dev_df["tweet"]

x_train = x_train.apply(tokenize)
x_dev = x_dev.apply(tokenize)
x_test = x_test.apply(tokenize)
#print(train_X.head())

tokenizer.fit_on_texts(x_train)
#print(len(tokenizer.word_index))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_dev = tokenizer.texts_to_sequences(x_dev)
#print(x_train[0])

vocab_size = len(tokenizer.word_index) + 1
#print(vocab_size)

In [0]:
# Create embedding matrix

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [0]:
# Vectorization

maxlen = 25
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_dev = pad_sequences(x_dev, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)
#print(x_train[0, :])

embedding_dim = 200
embedding_matrix = create_embedding_matrix(glove_file,tokenizer.word_index, embedding_dim)
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
vocab_coverage = nonzero_elements / vocab_size

In [0]:
# CNN Model

batch_size = 40000
epochs = 50

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,weights=[embedding_matrix],input_length=maxlen,trainable=True))
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.5))
model.add(MaxPooling1D())
model.add(Conv1D(64, 4, activation='relu',padding='same'))
model.add(Dropout(0.5))
model.add(MaxPooling1D())
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.5))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.summary()

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [0]:
# Hyperparamter Optimization

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen,layers):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim,weights=[embedding_matrix],input_length=maxlen,trainable=False))
    model.add(Conv1D(num_filters, kernel_size, activation='relu',padding='same'))
    model.add(MaxPooling1D(padding="same"))
    for i in range(layers):
      model.add(Conv1D(num_filters, kernel_size, activation='relu',padding='same'))
      model.add(MaxPooling1D(padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model
  
  param_grid = dict(num_filters=[64],
                  kernel_size=[5],
                  vocab_size=[vocab_size], 
                  embedding_dim=[embedding_dim],
                  maxlen=[10],
                  layers=[3,4,5],
                  )
  
  model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, batch_size=batch_size,
                            verbose=1)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,cv=3, verbose=1, n_iter=17)
grid_result = grid.fit(x_train, train_Y)

test_accuracy = grid.score(x_test, test_Y)

output_file = # OUTPUT_FILE_PATH_FOR_HYPER_PARAMS
with open(output_file, 'a') as f:
        s = ('Best Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
        output_string = s.format(
            grid_result.best_score_,
            grid_result.best_params_,
            test_accuracy)
        print(output_string)
        f.write(output_string)
print(grid_result.best_params_)

In [0]:
# Train and Predict CNN model

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

filepath= # PATH_TO_SAVE_MODEL
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
callbacks_list = [checkpoint,es]

history = model.fit(x_train, train_Y,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, test_Y),
                    callbacks=callbacks_list)

score = model.evaluate(x_test, test_Y,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])


In [0]:
# Plot training and validation loss and accuracy

import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    
    plot_history(history)

In [0]:
# Evaluation of best CNN model


from sklearn.metrics import accuracy_score

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,weights=[embedding_matrix],input_length=maxlen,trainable=True))
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.3))
model.add(MaxPooling1D())
model.add(Conv1D(64, 4, activation='relu',padding='same'))
model.add(Dropout(0.3))
model.add(MaxPooling1D())
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Conv1D(64, 5, activation='relu',padding='same'))
model.add(Dropout(0.2))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(4))
model.add(Activation('softmax'))

model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.load_weights("PATH_TO_BEST_MODEL")

cnn_loss, cnn_accuracy = model.evaluate(x_test, test_Y, verbose=False)
print("CNN 1D Best Tweet Level Test Accuracy:  {:.2f}".format(cnn_accuracy*100))

y_pred = model.predict(x_test)
#print(y_pred.shape)

#print(y_pred[0])
y_pred = np.argmax(y_pred, axis=1)
#print(y_pred[0])
#print(y_pred.shape)

test['ypred'] = y_pred
user_pred = test.drop(['tweet','region'],axis=1)
# print(user_pred.head(5))
result = user_pred.groupby(['uid'])['ypred'].agg(lambda x:x.value_counts().index[0])
#print(result.head(10))
true = user_test.sort_values(by=['uid']).replace({ 'midwest': 0, 'northeast': 1, 'south': 2, 'west' : 3 })
#print(true.head(10))

# print(result[0])
final_accuracy = accuracy_score(true['region'], result)
print("CNN 1D Final Accuracy:  {:.2f}".format(final_accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 200)           20210800  
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 25, 64)            64064     
_________________________________________________________________
dropout_6 (Dropout)          (None, 25, 64)            0         
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 12, 64)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 12, 64)            16448     
_________________________________________________________________
dropout_7 (Dropout)          (None, 12, 64)            0         
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 6, 64)             0         
__________