In [1]:
import csv
import os
import pickle
import random
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import spacy
import tensorflow as tf
from tqdm import tqdm


# Files and folders paths (all built by string concatenation on ABSOLUTE_PATH)
ABSOLUTE_PATH = './'
DATASET_PATH = ABSOLUTE_PATH + './datasets/'  # folder from which train.csv is read
X_pkl_file = ABSOLUTE_PATH + './X.pkl'  # pickle cache of the preprocessed (encoded + padded) text
TESTS_FILENAME = ABSOLUTE_PATH + './results/tests.csv'  # log of hyperparameter combinations already tested

# Output dimension of the Embedding layer, i.e. the length of each word vector
NUM_FEATURES_VECTOR = 300

# Vocabulary size: passed to one_hot when encoding words and used as the
# input dimension of the Embedding layer
VOCABULARY_SIZE = 5000

# Number of word indices per text fed to the network: used as maxlen when
# padding sequences and as the shape of the text input
# (longer texts are cut, shorter ones are padded)
WORDS_IN_SENTENCE = 1000

In [2]:
def get_features(documents, features=['percentage_punctuation', 'percentage_uppercase']):
    """
    Compute hand-crafted, character-level features for each document.

    Supported feature names:
      - 'percentage_punctuation': fraction of characters that are in string.punctuation
      - 'percentage_uppercase': fraction of characters for which str.isupper() is true

    Both fractions are 0 for an empty document.

    :param documents: iterable of strings
    :param features: names of the features to compute; the output columns
                     follow exactly this order

    :return: pandas DataFrame with one row per document (fresh 0..n-1 index)
             and one column per requested feature
    :raises ValueError: if a requested feature name is not supported
    """
    # Work on a copy so the shared default list is never aliased or mutated
    feature_names = list(features)

    # Loop-invariant: build the punctuation lookup once, not once per document
    punctuation_chars = set(string.punctuation)

    def _punctuation_ratio(document, num_chars):
        num_punct = sum(1 for char in document if char in punctuation_chars)
        return num_punct / num_chars if num_chars != 0 else 0

    def _uppercase_ratio(document, num_chars):
        num_upper = sum(map(str.isupper, document))
        return num_upper / num_chars if num_chars != 0 else 0

    extractors = {
        'percentage_punctuation': _punctuation_ratio,
        'percentage_uppercase': _uppercase_ratio,
    }

    # Fail early with a clear message instead of a column-count mismatch in pandas
    unknown = [name for name in feature_names if name not in extractors]
    if unknown:
        raise ValueError("Unknown feature(s): " + ', '.join(unknown))

    output = []
    for document in documents:
        # Total number of characters to compute percentages
        num_chars = len(document)

        # Values are produced in the order of `feature_names`, so every value
        # lands under the column carrying its own name
        output.append([extractors[name](document, num_chars) for name in feature_names])

    return pd.DataFrame(output, columns=feature_names)
    

### Load the dataset

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
import string

# Read the training CSV and discard every row that has a missing value
df = pd.read_csv(DATASET_PATH + 'train.csv').dropna(how="any")

# Keep only the first 1000 rows
X_ = df['text'][:1000]
y = df['label'][:1000]

# How many documents carry each label (labels in order of first appearance)
documents_per_class = {}
for label in y.unique():
    documents_per_class[label] = y[y == label].count()

# Hold out 20% of the data, then halve it: 80% train, 10% validation, 10% test
X_train, X_val_test, y_train, y_val_test = train_test_split(X_, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Regroup every split under its part name: raw text, extra features, float labels
raw_splits = (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
)
X_ = {part: texts for part, texts, _ in raw_splits}
X_features = {part: get_features(texts) for part, texts, _ in raw_splits}
y = {part: labels.astype(float) for part, _, labels in raw_splits}

print("Training size:", len(X_['train']))
print("Validation size:", len(X_['val']))
print("Test size:", len(X_['test']))
print("Number of documents for each class:", documents_per_class)

Training size: 800
Validation size: 100
Test size: 100
Number of documents for each class: {1: 430, 0: 570}


### Preprocessing

In [4]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Try loading the already-processed text from the cache file; otherwise
# process the text (it takes a while, that's why it is cached in a file).
# NOTE: pickle.load can execute arbitrary code -- only load a cache file
# that this notebook produced itself.
try:
    with open(X_pkl_file, 'rb') as f:
        X = pickle.load(f)
except FileNotFoundError:
    # The spaCy pipeline used for lemmatization was never created in this
    # notebook, so processing without a cache would fail on `nlp`.
    # NOTE(review): the model name is an assumption -- confirm which spaCy
    # model was used originally.
    nlp = spacy.load('en_core_web_sm')

    X = {}
    for part, documents in X_.items(): # part = ['train', 'val', 'test']
        lemmatized_documents = []
        for document in tqdm(documents):
            # Lowercase (features like number of uppercase have already been computed)
            document = document.lower()

            # Replace each word by its lemma and join the words back into a text
            document = ' '.join(word.lemma_ for word in nlp(document))

            lemmatized_documents.append(document)

        # Encode each word as an integer index below VOCABULARY_SIZE.
        # NOTE(review): Keras' one_hot is hash-based; the word -> index mapping
        # may differ between Python sessions, so models should only be used
        # with the cached encoding they were trained on -- confirm.
        one_hot_rep = [one_hot(document, VOCABULARY_SIZE) for document in lemmatized_documents]

        # Pad if the text has less than WORDS_IN_SENTENCE words
        X[part] = pad_sequences(one_hot_rep, padding='pre', maxlen=WORDS_IN_SENTENCE)

    # Save the cache once, after ALL parts are processed: an interrupted run
    # must not leave a partial cache that is later loaded as if complete
    with open(X_pkl_file, 'wb') as f:
        pickle.dump(X, f)

### Neural network

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Activation
from tensorflow.keras import Model

def build_nn(params):
    """
    Build and compile an LSTM text classifier with the tf.keras functional API.

    The text input goes through an Embedding layer and an LSTM. When extra
    features are requested, they go through two small Dense layers and are
    concatenated with the LSTM output. A dropout layer and a single sigmoid
    unit produce the final prediction.

    Arguments:
    params (dict): A dictionary containing the following parameter data:
                    features (list of strings): The features to use (may be empty)
                    loss (string): The type of loss to optimize ('binary_crossentropy' or 'mse')
                    optimizer (string): The type of optimizer to use while training ('sgd' or 'adam')
                    epochs (int): The number of epochs (not used here; callers pass it to fit)

    Returns:
    model (tf.keras.Model), a compiled model created using the specified parameters.
        With no features the model takes a single input (the text), matching
        how the training and evaluation code feeds it; otherwise it takes
        [text, features].
    """

    # Using a functional model to allow multiple inputs: https://www.tensorflow.org/guide/keras/functional

    num_features = len(params['features'])

    # Text part
    input_text = Input(shape=(WORDS_IN_SENTENCE,))
    embed = Embedding(VOCABULARY_SIZE, NUM_FEATURES_VECTOR, input_length=WORDS_IN_SENTENCE)(input_text)
    lstm = LSTM(300, dropout=0.3, recurrent_dropout=0.3)(embed)

    if num_features == 0:
        # No extra features: do not declare a dangling, zero-width second
        # input that no caller ever feeds
        conc = lstm
        model_inputs = input_text
    else:
        # Features part
        input_features = Input(shape=(num_features,))
        dense_features_1 = Dense(10, activation='relu')(input_features)
        dense_features_2 = Dense(10, activation='relu')(dense_features_1)

        # Concatenation
        conc = Concatenate()([lstm, dense_features_2])
        model_inputs = [input_text, input_features]

    # Final part
    drop = Dropout(0.3)(conc)
    dense = Dense(1)(drop)
    activation = Activation('sigmoid')(dense)

    # Model
    model = Model(model_inputs, activation)
    model.compile(loss=params['loss'], optimizer=params['optimizer'])

    return model

### Train the model + tuning

In [6]:
# Plot model

# from tensorflow.keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(num_words=5000)
# embedding_matrix = np.zeros((30, 100))
# for word, index in tokenizer.word_index.items():
#     embedding_vector = embeddings_dictionary.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[index] = embedding_vector



# from tensorflow.keras.utils import plot_model
# model = build_nn({
#     'loss': 'binary_crossentropy',
#     'optimizer': 'adam',
#     'epochs': 5,
#     'features': ['percentage_punctuation', 'percentage_uppercase']
# })
# plot_model(model, to_file='model_2_features.pdf', show_shapes=True, show_layer_names=True)

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import ParameterGrid
from time import time
from utils import reset_seeds

# df_results contains already done tests to avoid training again the network
# for something that has already been computed
try:
    df_results = pd.read_csv(TESTS_FILENAME)
except FileNotFoundError:
    df_results = pd.DataFrame(columns=['features', 'loss', 'optimizer', 'epochs', 'filename'])

# Make sure the output folders exist before anything is written to them
# (this also creates './results/', the parent of the models folder)
os.makedirs(ABSOLUTE_PATH + './results/models/', exist_ok=True)

param_grid = {
    'loss': ['binary_crossentropy'],
    'optimizer': ['adam'],
    'epochs': [5, 10, 20]
}

# Every subset of extra features to try together with the text input
feature_sets = [
    [],
    ['percentage_punctuation'],
    ['percentage_uppercase'],
    ['percentage_punctuation', 'percentage_uppercase']
]

for features in feature_sets:
    for parameters in ParameterGrid(param_grid):
        parameters['features'] = features
        print(parameters)

        # Check if already evaluated this combination.
        # An empty feature list is written as '' and read back from CSV as NaN.
        if len(features) == 0:
            df_features_mask = ((df_results['features'] == '') | (df_results['features'].isnull()))
        else:
            df_features_mask = ((df_results['features'] == ','.join(features)))

        # If so, then skip this training
        if df_results[
            df_features_mask &
            (df_results['loss'] == parameters['loss']) &
            (df_results['optimizer'] == parameters['optimizer']) &
            (df_results['epochs'] == parameters['epochs'])
        ].shape[0] > 0:
            print("Current combination of features and parameters already exists")
            continue

        # Build the network
        reset_seeds()
        model = build_nn(parameters)

        # Train the network
        reset_seeds()
        input_model_fit = X['train'] if len(features) == 0 else [X['train'], X_features['train'][features]]
        model.fit(input_model_fit, y['train'], epochs=parameters['epochs'])

        # Evaluate the network on validation set
        input_model_predict = X['val'] if len(features) == 0 else [X['val'], X_features['val'][features]]
        y_pred = model.predict(input_model_predict)
        # Flatten the (n, 1) probabilities before thresholding, instead of
        # relying on the truthiness of 1-element arrays
        y_pred = (y_pred.ravel() >= 0.5).astype(int).tolist()

        # Save predictions and true values on a file
        current_time = str(time())
        results_filename = ABSOLUTE_PATH + './results/results_'+ current_time +'.csv'
        pd.DataFrame({'true':y['val'], 'pred':y_pred}).to_csv(results_filename, index=False)

        # Add this result to the dataframe and update the file containing all the hyperparameters tested
        # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
        df_results = pd.concat([df_results, pd.DataFrame([{
            'features': ','.join(features),
            'loss': parameters['loss'],
            'optimizer': parameters['optimizer'],
            'epochs': parameters['epochs'],
            'filename': results_filename,
        }])], ignore_index=True)
        df_results.to_csv(TESTS_FILENAME, index=False)

        # Save model to file
        model.save(ABSOLUTE_PATH + './results/models/'+current_time+'.h5')

{'epochs': 5, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': []}
Current combination of features and parameters already exists
{'epochs': 10, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': []}
Current combination of features and parameters already exists
{'epochs': 20, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': []}
Current combination of features and parameters already exists
{'epochs': 5, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': ['percentage_punctuation']}
Current combination of features and parameters already exists
{'epochs': 10, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': ['percentage_punctuation']}
Current combination of features and parameters already exists
{'epochs': 20, 'loss': 'binary_crossentropy', 'optimizer': 'adam', 'features': ['percentage_punctuation']}
Current combination of features and parameters already exists
{'epochs': 5, 'loss': 'binary_crossentropy', 'optimizer': 'a

### Evaluate the best model on test set

We found the best hyperparameters among the ones we tried: great! Now we take the hyperparameters that performed best on the validation set and use the test set to evaluate our model. If the metrics differ widely from the ones we got on the validation set, we have to be careful, because it is very likely that we overfitted on the validation set.

In [8]:
# One entry per feature subset, holding the configuration to evaluate on the test set.
# Keys:
#   'features'  - extra features fed to the network together with the text
#   'time'      - timestamp string used to build the saved model's filename
#                 ('./results/models/<time>.h5')
#   'loss', 'optimizer' - passed to build_nn if the model has to be rebuilt
#   'epochs'    - number of training epochs if the model has to be retrained
# NOTE(review): presumably these are the best runs on the validation set,
# copied by hand from the results files -- confirm.
best_models = [
    {
        'features': [],
        'time': '1607114853.1603158',
        'loss': 'binary_crossentropy',
        'optimizer': 'adam',
        'epochs': 20
    },
    {
        'features': ['percentage_punctuation'],
        'time': '1607117869.2251015',
        'loss': 'binary_crossentropy',
        'optimizer': 'adam',
        'epochs': 10
    },
    {
        'features': ['percentage_uppercase'],
        'time': '1607169573.7067382',
        'loss': 'binary_crossentropy',
        'optimizer': 'adam',
        'epochs': 20
    },
    {
        'features': ['percentage_punctuation','percentage_uppercase'],
        'time': '1607178480.4414513',
        'loss': 'binary_crossentropy',
        'optimizer': 'adam',
        'epochs': 20
    },
]

for parameters in best_models:
    selected_features = parameters['features']
    has_extra_features = len(selected_features) != 0
    print("Parameters:", selected_features)

    # Location of the saved model for this configuration
    model_path = ABSOLUTE_PATH + './results/models/' + parameters['time'] + '.h5'

    try:
        model = tf.keras.models.load_model(model_path)
    except OSError:
        # The model could not be loaded from file: rebuild it from scratch
        reset_seeds()
        model = build_nn(parameters)

        # Train on the text alone, or on the text plus the selected features
        reset_seeds()
        if has_extra_features:
            train_inputs = [X['train'], X_features['train'][selected_features]]
        else:
            train_inputs = X['train']
        model.fit(train_inputs, y['train'], epochs=parameters['epochs'])

        # Store the trained model so the next run can load it directly
        model.save(model_path)

    # Predict on the test set and turn probabilities into 0/1 labels
    if has_extra_features:
        test_inputs = [X['test'], X_features['test'][selected_features]]
    else:
        test_inputs = X['test']
    y_pred = model.predict(test_inputs)
    y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred]

    # Accuracy plus per-class precision, recall and F1
    accuracy = accuracy_score(y['test'], y_pred)
    precision, recall, f1_scores, _support = precision_recall_fscore_support(y['test'], y_pred)

    # Report the metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1_scores)

Parameters: []
Accuracy: 0.88
Precision: [0.87931034 0.88095238]
Recall: [0.91071429 0.84090909]
F1: [0.89473684 0.86046512]
Parameters: ['percentage_punctuation']
Accuracy: 0.86
Precision: [0.83870968 0.89473684]
Recall: [0.92857143 0.77272727]
F1: [0.88135593 0.82926829]
Parameters: ['percentage_uppercase']
Accuracy: 0.9
Precision: [0.88333333 0.925     ]
Recall: [0.94642857 0.84090909]
F1: [0.9137931  0.88095238]
Parameters: ['percentage_punctuation', 'percentage_uppercase']
Accuracy: 0.84
Precision: [0.82258065 0.86842105]
Recall: [0.91071429 0.75      ]
F1: [0.86440678 0.80487805]
