In [1]:
# !pip install cufflinks

import os
os.environ["HDF5_DISABLE_VERSION_CHECK"] = '1'

import nltk
from tqdm.notebook import tqdm as tqdm
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import cufflinks as cf
import pandas as pd
import random
import csv
import re

# nltk.download('stopwords')


def generate_training_df(df, category, min_length = 5, verbose = True):
    """Build a shuffled (text, label) training frame for one stance target.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus frame. The columns 'ID', 'Stance', 'Tweet' and 'Target' are read.
    category : str
        Target to keep. Matched as a case-insensitive, literal substring of
        the 'Target' column.
    min_length : int, default 5
        Rows are kept only when ``len(str(tweet)) > min_length``.
    verbose : bool, default True
        When true, also print a 10-row preview of the corpus and the head of
        the per-category frame. The sample counts are printed regardless.

    Returns
    -------
    pandas.DataFrame
        Columns ['text', 'label'] in random row order. ``label`` is -1 for
        'AGAINST', 1 for 'FAVOR' and 0 for every other stance value.
    """
    stance_codes = {'AGAINST': -1, 'FAVOR': 1}

    # df refactor: keep only the needed columns under short names
    df = df.reindex(columns=['ID', 'Stance', 'Tweet', 'Target'])
    df.columns = ['A', 'label', 'tweet', 'target']
    df = df.drop(columns=['A'])
    # clamp the upper bound so a corpus with fewer than 11 rows does not give
    # randint an empty range
    preview = random.randint(0, max(0, df.shape[0] - 11))
    if verbose:
        print("\ndf:\n\n", df[preview:preview+10])

    # df per category; .copy() so the label rewrite below edits an independent
    # frame instead of a slice of df
    in_category = df['target'].str.contains(category, case=False, regex=False, na=False)
    df_cat = df[in_category].copy()
    if verbose:
        print(f"\ndf_{category}:\n\n", df_cat.head())
    print(f"number of samples in df({category}): ", df_cat.shape[0])

    # labels -> {-1, 0, 1}. Mapping the whole column at once keeps every value
    # on its own row whatever the frame's index looks like; writing by
    # enumerate() position only lines up when the index starts at 0.
    df_cat['label'] = df_cat['label'].map(lambda stance: stance_codes.get(stance, 0))

    # drop tweets whose length is not greater than min_length
    long_enough = df_cat['tweet'].apply(lambda tweet: len(str(tweet)) > min_length).astype(bool)
    df_catFilter = df_cat[long_enough]
    print(f"number of samples after filter -> len(tweet) > {min_length}: ", df_catFilter.shape[0])

    # shuffle and refactor
    df_catFilter = df_catFilter.sample(frac=1)
    df_catFilter = df_catFilter.drop(columns=['target'])
    df_catFilter.columns = ['label', 'text']
    df_catFilter = df_catFilter.reindex(columns = ['text', 'label'])
    print("finished with category '{}'".format(category))

    return df_catFilter


def generate_training_df_silent(df, category, min_length = 0):
    """Build a shuffled (label, text) training frame for one target, printing nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus frame. The columns 'ID', 'Stance', 'Tweet' and 'Target' are read.
    category : str
        Target to keep. Matched as a case-insensitive, literal substring of
        the 'Target' column.
    min_length : int, default 0
        Rows are kept only when ``len(str(tweet)) > min_length``.

    Returns
    -------
    pandas.DataFrame
        Columns ['label', 'text'] (in that order) in random row order.
        ``label`` is -1 for 'AGAINST', 1 for 'FAVOR' and 0 for every other
        stance value.
    """
    stance_codes = {'AGAINST': -1, 'FAVOR': 1}

    # df refactor: keep only the needed columns under short names
    df = df.reindex(columns=['ID', 'Stance', 'Tweet', 'Target'])
    df.columns = ['A', 'label', 'tweet', 'target']
    df = df.drop(columns=['A'])

    # .copy() so the label rewrite below edits an independent frame
    in_category = df['target'].str.contains(category, case=False, regex=False, na=False)
    df_cat = df[in_category].copy()

    # Map the whole column at once: writing by enumerate() position only hits
    # the right rows when the filtered frame's index happens to start at 0.
    df_cat['label'] = df_cat['label'].map(lambda stance: stance_codes.get(stance, 0))

    long_enough = df_cat['tweet'].apply(lambda tweet: len(str(tweet)) > min_length).astype(bool)
    df_catFilter = df_cat[long_enough]
    df_catFilter = df_catFilter.sample(frac=1)
    df_catFilter = df_catFilter.drop(columns=['target'])
    df_catFilter.columns = ['label', 'text']

    return df_catFilter


def clean_text(text, remove_x=True, remove_numbers=False, remove_stopwords=True):
    """Normalise one tweet for tokenisation.

    Steps, in order: lower-case; replace every REPLACE_BY_SPACE_RE match with
    a space; delete every BAD_SYMBOLS_RE match; optionally delete the letter
    'x'; optionally delete digit runs; optionally drop STOPWORDS; re-join the
    remaining words with single spaces.

    The defaults reproduce the historical output of this function, whose
    ``.replace('\\d+', '')`` step was a literal (non-regex) replace and so
    never removed anything.

    Parameters
    ----------
    text : str
        Raw tweet text.
    remove_x : bool, default True
        Delete every 'x' character. Gotcha: this also alters ordinary words
        ('next' -> 'net'); pass False to keep them intact.
    remove_numbers : bool, default False
        Delete every run of digits.
    remove_stopwords : bool, default True
        Drop words found in the module-level STOPWORDS set.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    if remove_x:
        text = text.replace('x', '')
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    words = text.split()
    if remove_stopwords:
        words = [word for word in words if word not in STOPWORDS]
    return ' '.join(words)


def train_eval_multiclass_LSTM(df, category, verbose=False, silent=False):
    """Train and evaluate a softmax LSTM stance classifier for one target.

    The per-category frame is built with generate_training_df (or
    generate_training_df_silent when ``silent`` is true) using min_length=2,
    tokenised, padded to MAX_SEQUENCE_LENGTH and split 90/10 into train and
    test sets. The model is fitted with a further 10% validation split, early
    stopping on 'val_loss' and a weights-only checkpoint written to
    './<category without spaces>.checkpoint'. Test loss and accuracy are
    always printed.

    Reads the module-level names MAX_NB_WORDS, MAX_SEQUENCE_LENGTH,
    EMBEDDING_DIM, epochs and batch_size at call time.
    NOTE(review): EMBEDDING_DIM and batch_size are passed straight to
    Embedding / model.fit, so they need to be single values when this runs;
    the tuning cell binds both names to lists -- confirm the execution order.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus frame handed to the data-preparation helper.
    category : str
        Target to train on; also used to name the checkpoint file.
    verbose : bool, default False
        Print progress messages and tensor shapes.
    silent : bool, default False
        Use the non-printing data-preparation helper and skip the model
        summary.

    Returns
    -------
    The History object returned by ``model.fit``.
    """
    if verbose:
        print(f"generating df for {category}...")
    if silent:
        df_cat = generate_training_df_silent(df=df, category=category, min_length=2)
    else:
        df_cat = generate_training_df(df=df, category=category, min_length=2, verbose=verbose)
    df_cat = df_cat.reindex(columns=['text', 'label'])

    if verbose:
        print("tokenizing...")
    # raw string: same characters as before, without the invalid '\]' escape
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
    tokenizer.fit_on_texts(df_cat['text'].values)
    word_index = tokenizer.word_index
    if verbose:
        print('Found %s unique tokens.' % len(word_index))

    X = tokenizer.texts_to_sequences(df_cat['text'].values)
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    if verbose:
        print('Shape of data tensor:', X.shape)
    # one-hot targets: one column per distinct label value
    Y = pd.get_dummies(df_cat['label']).values
    if verbose:
        print('Shape of label tensor:', Y.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
    if verbose:
        print("X_train.shape,Y_train.shape", X_train.shape,Y_train.shape)
        print("X_test.shape,Y_test.shape", X_test.shape,Y_test.shape)

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(Y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    if not silent:
        # summary() prints the table itself; wrapping it in print() added a stray "None"
        model.summary()

    cp_callback = ModelCheckpoint(filepath='./' + category.replace(' ', '') + '.checkpoint', save_weights_only=True, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1,
                        callbacks=[early_stopping, cp_callback])

    accr = model.evaluate(X_test,Y_test)
    print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
    return history


def visualize_history(history):
    """Show the learning curves stored in a fit history as two figures.

    ``history.history`` is read for the keys 'loss', 'val_loss', 'accuracy'
    and 'val_accuracy'. The first figure is titled 'Loss', the second
    'Accuracy'; in both, the plain metric is labelled 'train' and the
    'val_'-prefixed metric is labelled 'test'.
    """
    # (figure title, key of the training curve, key of the held-out curve)
    panels = (
        ('Loss', 'loss', 'val_loss'),
        ('Accuracy', 'accuracy', 'val_accuracy'),
    )
    for title, train_key, held_out_key in panels:
        plt.title(title)
        plt.plot(history.history[train_key], label='train')
        plt.plot(history.history[held_out_key], label='test')
        plt.legend()
        plt.show()

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.


In [2]:
# cufflinks connects plotly to pandas so Series/DataFrames gain .iplot().
cf.go_offline()
# NOTE(review): offline=False with world_readable=True looks at odds with the
# go_offline() call above -- confirm the intended sharing settings.
cf.set_config_file(offline=False, world_readable=True)
# Turns off pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

# read data and summarize
corpus = 'train_data_A.txt'
df = pd.read_csv(corpus, header=0, sep='\t', encoding='mac_roman')
print("\nnumber of samples for this corpus: ", df.shape[0])
print(f"\n{df.Target.value_counts()}")
# the bars count tweets, so the y axis is labelled accordingly
df['Target'].value_counts().sort_values(ascending=False).iplot(kind='bar', yTitle='number of tweets',
                                                            title='Number of tweets per target(topic)')


number of samples for this corpus:  2814

Feminist Movement                   664
Hillary Clinton                     639
Legalization of Abortion            603
Atheism                             513
Climate Change is a Real Concern    395
Name: Target, dtype: int64


In [3]:
# read data and pre-process it
# Raw string: same pattern as before, without the invalid '\[', '\]', '\|'
# escape sequences that newer Python versions warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
corpus = 'train_data_A.txt'
df = pd.read_csv(corpus, header=0, sep='\t', encoding='mac_roman')
# random 10-row window for a before/after preview; clamped so a corpus with
# fewer than 11 rows does not give randint an empty range
start = random.randint(0, max(0, df.shape[0]-11))
print(df['Tweet'][start:start+10])
# overwrites the raw tweets in place with their cleaned form
df['Tweet'] = df['Tweet'].apply(clean_text)
print('\n', df['Tweet'][start:start+10])

2336    @smalldoctor15: What is postinor popularly use...
2337    So mint!  Listening to @GlennBeck LIVE & he s ...
2338    @WayneSense "In check" is putting it nicely. #...
2339    @ProLifeLiberty Yes, your reasoning is horribl...
2340    Terrible golf last couple days! Goes down on t...
2341    Idiot: how would you feel if your mom aborted ...
2342    @LifeNewsHQ  22wks is too late to be able to h...
2343    Dude i won a #freeshirt from @abort73 ! I neve...
2344    Praying for that innocent lady. She's a victim...
2345    Findin budget compromise a 'process' #Brownbac...
Name: Tweet, dtype: object

 2336         smalldoctor15 postinor popularly used #semst
2337    mint listening glennbeck live talking #margare...
2338    waynesense check putting nicely #margaretsange...
2339    prolifeliberty yes reasoning horrible talking ...
2340    terrible golf last couple days goes notes one ...
2341    idiot would feel mom aborted nothing cause wou...
2342    lifenewshq 22wks late able terminat

# TUNING

In [28]:
# parameters
# static
MAX_NB_WORDS = 50_000
MAX_SEQUENCE_LENGTH = 250
# Raw string: same pattern as before, without invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
epochs = 20 # I have earlystopping so I don't need to tune this.

# category options
categories = ['Atheism', 
              'Climate Change is a Real Concern', 
              'Feminist movement', 
              'Hillary Clinton', 
              'Legalization of abortion']

# model params (candidate values for the search)
# NOTE(review): EMBEDDING_DIM and batch_size are bound to lists here; code that
# reads these names as single values must not run after this cell -- confirm.
EMBEDDING_DIM = [50, 100, 300]
batch_size = [1, 8, 32, 64]
dropout = [0.1, 0.3, 0.5, 0.8]
optimizer = ['Adam', 'SGD', 'Adamax', 'Nadam']
# add learning rate...

# pre-processing params: one independent candidate list per switch
remove_hash = [True, False]
remove_stopwords = [True, False]
remove_numbers = [True, False]
min_lenght = [0, 2, 5, 10, 20, 30]
# text = text.replace('x', '').replace('\d+', '')

# keys must match the parameter names of the model-building function
param_grid = {'EMBEDDING_DIM':EMBEDDING_DIM, 'dropout':dropout, 'optimizer':optimizer}
param_grid

# add all of the LSTM parameter from the LSTM_layer in keras as well

{'EMBEDDING_DIM': [50, 100, 300],
 'dropout': [0.1, 0.3, 0.5, 0.8],
 'optimizer': ['Adam', 'SGD', 'Adamax', 'Nadam']}

In [25]:
from keras.wrappers.scikit_learn import KerasClassifier

model_count = 0

def create_LSTM_model(EMBEDDING_DIM, dropout, optimizer):
    """Build and compile one candidate LSTM classifier.

    Stack: Embedding -> SpatialDropout1D(0.2) -> LSTM(100) -> softmax Dense.
    ``dropout`` is applied as both the LSTM dropout and recurrent dropout.
    The input length and the number of output units are read from the
    module-level ``X`` and ``Y`` at call time, and the vocabulary size from
    ``MAX_NB_WORDS``.

    Side effects: increments the module-level ``model_count`` and prints it.

    Returns the compiled model.
    """
    global model_count

    stack = (
        Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=dropout, recurrent_dropout=dropout),
        Dense(Y.shape[1], activation='softmax'),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # running tally of how many models have been built so far
    model_count = model_count + 1
    print(f'\nmodel_count: {model_count}')
    return model

# Prepare the data for the first category (Atheism) and build the grid search.
df_atheism = generate_training_df_silent(df, categories[0], min_length=0)
df_atheism = df_atheism.reindex(columns=['text', 'label'])
print("tokenizing...")
# raw string: same characters as before, without the invalid '\]' escape
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_atheism['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# X and Y are also read by create_LSTM_model when it sizes its layers
X = tokenizer.texts_to_sequences(df_atheism['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
# one-hot targets: one column per distinct label value
Y = pd.get_dummies(df_atheism['label']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print("X_train.shape,Y_train.shape", X_train.shape,Y_train.shape)
print("X_test.shape,Y_test.shape", X_test.shape,Y_test.shape)

# 3-fold search over param_grid; the scikit-learn wrapper calls
# create_LSTM_model once per fit
LSTM_model = KerasClassifier(build_fn=create_LSTM_model, epochs=epochs)
grid = GridSearchCV(estimator=LSTM_model, param_grid=param_grid, n_jobs=None, cv=3)
print("grid search built...")

tokenizing...
Found 2478 unique tokens.
Shape of data tensor: (513, 250)
Shape of label tensor: (513, 3)
X_train.shape,Y_train.shape (461, 250) (461, 3)
X_test.shape,Y_test.shape (52, 250) (52, 3)
grid search built...


In [27]:
# Run the grid search once per candidate batch size and keep every fitted search.
from sklearn.base import clone

# cp_callback = ModelCheckpoint(filepath='./atheism.checkpoint', save_weights_only=True, verbose=1)
results_per_batchsize = []
model_count = 0
for _batch_size in batch_size:
    print(f"\nSTARTING GRIDSEARCH FOR BATCH_SIZE: {_batch_size}")
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001, restore_best_weights=True)
    # fit() returns the searcher itself, so fitting `grid` directly would leave
    # every list entry pointing at the same object, overwritten by the last
    # batch size. Fit an unfitted clone per batch size instead.
    grid_result = clone(grid).fit(X_train, Y_train, epochs=epochs, batch_size=_batch_size, validation_split=0.1,
                                  callbacks=[early_stopping], verbose=1)
    results_per_batchsize.append(grid_result)
    # fitted attributes on GridSearchCV carry a trailing underscore
    print(f'best_estimator: {grid_result.best_estimator_}\nbest_score: {grid_result.best_score_}\nbest_params: {grid_result.best_params_}')


STARTING GRIDSEARCH FOR BATCH_SIZE: 1

model_count: 1
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

model_count: 2
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

model_count: 3
Train on 277 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

model_count: 4
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

model_count: 5
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

model_count: 6
Train on 277 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

model_count: 7
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

model_count: 8
Train on 276 samples, validate on 31 samples
Epoch 1/20
Epoch 2/

KeyboardInterrupt: 

In [None]:
# Train one classifier per target and plot its learning curves.
# silent=True selects the non-printing data-preparation helper and skips the
# model summary.
for category in categories:
    print(f"\n{category}\n")
    # every run receives its own copy of the corpus frame
    df_copy = df.copy()
    history = train_eval_multiclass_LSTM(df=df_copy, category=category, silent=True)
    visualize_history(history)

In [None]:
# Train one classifier per target and plot its learning curves.
# silent=False keeps the sample counts and the model summary; verbose=False
# leaves out the frame previews and tensor shapes.
for category in categories:
    print(f"\n{category}\n")
    # every run receives its own copy of the corpus frame
    df_copy = df.copy()
    history = train_eval_multiclass_LSTM(df=df_copy, category=category, verbose=False, silent=False)
    visualize_history(history)

In [None]:
# TODO: evaluate against the challenge test set instead of the local held-out
# split, which is really a validation set.

import numpy as np

# single predictions
new_complaint_atheist = ['Better to gather your friends and family around you and to love life now than to mistakenly believe in the \
                 promises of heaven or that God has a monopoly on goodness.']

new_complaint_climate = ['There is an overwhelming consensus among climate scientists that our climate is changing and humans \
                         are to blame. When we burn fossil fuels, we pump heat-trapping gases into our atmosphere that cause \
                         temperatures to rise. And just like the health of our families, climate change should never be a \
                         partisan issue!']

# NOTE(review): `model` is not bound at notebook scope by any cell visible
# here (the training function returns only the fit history) -- confirm where
# it comes from before running this cell.
# NOTE(review): this list is indexed by the argmax of the softmax output, so
# its order has to match the column order of the one-hot targets used in
# training -- verify.
labels = ['AGAINST', 'FAVOR', 'NONE']

# both samples go through the same tokenizer and model
for sample in (new_complaint_atheist, new_complaint_climate):
    seq = tokenizer.texts_to_sequences(sample)
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)
    print(pred, labels[np.argmax(pred)])

In [None]:
# Ideas: the model is overfitting the data... fewer epochs? a different batch size? learning rate?
# Hyperparameter tuning and extra preprocessing should be able to push it to 80% or more.
# Then add more data (at least 10x) and retrain. That will be your best model.