### We are gonna implement basic components in a step by step manner in order to create a text classification framework in python. To start with, import all the required libraries.
* Pandas
* Scikit-learn
* XGBoost
* TextBlob
* Keras
### Libraries for dataset preparation, feature engineering, model training

In [42]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline, FeatureUnion

import pandas, xgboost, numpy, textblob, string, scipy, warnings
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from collections.abc import Callable

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

warnings.filterwarnings('always')

In [8]:
# load the dataset
dt = open('corpus', encoding='utf-8')
data = dt.read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

dt.close()

In [9]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

## 2. Feature Engineering
The next step is the feature engineering step. In this step, raw text data will be transformed into feature vectors and new features will be created using the existing dataset. We will implement the following different ideas in order to obtain relevant features from our dataset.

2.1 Count Vectors as features<br>
2.2 TF-IDF Vectors as features<br>
* Word level
* N-Gram level
* Character level

2.3 Word Embeddings as features<br>
2.4 Text / NLP based features<br>
2.5 Topic Models as features<br>

### Implementation of these ideas in detail.

2.1 Count Vectors as features<br>
Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.<br>
Links<br>
* https://towardsdatascience.com/introduction-to-word-embeddings-4cf857b12edc
* https://towardsdatascience.com/natural-language-processing-count-vectorization-with-scikit-learn-e7804269bb5e

In [10]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [13]:
xtrain_count_and_tfidf = scipy.sparse.hstack((xtrain_count, xtrain_tfidf))
xvalid_count_and_tfidf = scipy.sparse.hstack((xvalid_count, xvalid_tfidf))

xtrain_count_and_tfidf_ngram = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_ngram))
xvalid_count_and_tfidf_ngram = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_ngram))

xtrain_count_and_tfidf_ngram_chars = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_ngram_chars))
xvalid_count_and_tfidf_ngram_chars = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_ngram_chars))

xtrain_tfidf_and_ngram = scipy.sparse.hstack((xtrain_tfidf, xtrain_tfidf_ngram))
xvalid_tfidf_and_ngram = scipy.sparse.hstack((xvalid_tfidf, xvalid_tfidf_ngram))

xtrain_tfidf_and_ngram_chars = scipy.sparse.hstack((xtrain_tfidf, xtrain_tfidf_ngram_chars))
xvalid_tfidf_and_ngram_chars = scipy.sparse.hstack((xvalid_tfidf, xvalid_tfidf_ngram_chars))

xtrain_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xtrain_tfidf_ngram, xtrain_tfidf_ngram_chars))
xvalid_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xvalid_tfidf_ngram, xvalid_tfidf_ngram_chars))


xtrain_count_and_tfidf_and_ngram = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_and_ngram))
xvalid_count_and_tfidf_and_ngram = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_and_ngram))

xtrain_count_and_tfidf_and_ngram_chars = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_and_ngram_chars))
xvalid_count_and_tfidf_and_ngram_chars = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_and_ngram_chars))

xtrain_count_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_ngram_and_ngram_chars))
xvalid_count_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_ngram_and_ngram_chars))

xtrain_tfidf_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xtrain_tfidf, xtrain_tfidf_ngram_and_ngram_chars))
xvalid_tfidf_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xvalid_tfidf, xvalid_tfidf_ngram_and_ngram_chars))

xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xtrain_count, xtrain_tfidf_and_tfidf_ngram_and_ngram_chars))
xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars = scipy.sparse.hstack((xvalid_count, xvalid_tfidf_and_tfidf_ngram_and_ngram_chars))

In [14]:
print(type(xtrain_count))
print(xtrain_count.shape)

print(type(xtrain_tfidf))
print(xtrain_tfidf.shape)

print(type(xtrain_tfidf_ngram))
print(xtrain_tfidf_ngram.shape)

print(type(xtrain_tfidf_ngram_chars))
print(xtrain_tfidf_ngram_chars.shape)

print(type(xtrain_count_and_tfidf))
print(xtrain_count_and_tfidf.shape)

print(type(xtrain_count_and_tfidf_ngram))
print(xtrain_count_and_tfidf_ngram.shape)

print(type(xtrain_count_and_tfidf_ngram_chars))
print(xtrain_count_and_tfidf_ngram_chars.shape)

print(type(xtrain_tfidf_and_ngram))
print(xtrain_tfidf_and_ngram.shape)

print(type(xtrain_tfidf_and_ngram_chars))
print(xtrain_tfidf_and_ngram_chars.shape)

print(type(xtrain_tfidf_ngram_and_ngram_chars))
print(xtrain_tfidf_ngram_and_ngram_chars.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(7500, 31666)
<class 'scipy.sparse.csr.csr_matrix'>
(7500, 5000)
<class 'scipy.sparse.csr.csr_matrix'>
(7500, 5000)
<class 'scipy.sparse.csr.csr_matrix'>
(7500, 5000)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 36666)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 36666)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 36666)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 10000)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 10000)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 10000)


In [15]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
dt2 = open('wiki-news-300d-1M.vec', encoding='utf-8')
for i, line in enumerate(dt2):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

dt2.close()

In [16]:
print(type(train_seq_x))
print(train_seq_x.shape)

xtrain_count_and_word_emb = scipy.sparse.hstack((xtrain_count, train_seq_x))
print(type(xtrain_count_and_word_emb))
print(xtrain_count_and_word_emb.shape)

<class 'numpy.ndarray'>
(7500, 70)
<class 'scipy.sparse.coo.coo_matrix'>
(7500, 31736)


In [62]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = [int(round(p[0])) for p in predictions]
        accuracy = metrics.accuracy_score(predictions, valid_y.round())
    else:
        accuracy = metrics.accuracy_score(predictions, valid_y)
    
    return accuracy, metrics.f1_score(predictions, valid_y, average='weighted', labels=numpy.unique(valid_y))

In [18]:
# Naive Bayes on Count Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Word Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

NB, Count Vectors: Accuracy: 0.838 ; F1 score: 0.8382524757930035
NB, WordLevel TF-IDF: Accuracy: 0.8452 ; F1 score: 0.8453507710608293
NB, N-Gram Vectors: Accuracy: 0.8376 ; F1 score: 0.8376332813312533
NB, CharLevel Vectors: Accuracy: 0.812 ; F1 score: 0.8122405385662731


In [19]:
# Naive Bayes on Count Vectors and Word Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf, train_y, xvalid_count_and_tfidf)
print("NB, Count Vectors and Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Count Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_ngram, train_y, xvalid_count_and_tfidf_ngram)
print("NB, Count Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Count Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_chars)
print("NB, Count Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_and_ngram, train_y, xvalid_tfidf_and_ngram)
print("NB, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_and_ngram_chars, train_y, xvalid_tfidf_and_ngram_chars)
print("NB, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_and_ngram_chars, train_y, xvalid_tfidf_ngram_and_ngram_chars)
print("NB, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

NB, Count Vectors and Word Level TF IDF Vectors: Accuracy: 0.84 ; F1 score: 0.8402340111617166
NB, Count Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8432 ; F1 score: 0.8434195575551036
NB, Count Vectors and Character Level TF IDF Vectors: Accuracy: 0.8388 ; F1 score: 0.8389830843560374
NB, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.87 ; F1 score: 0.8700029338369354
NB, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8476 ; F1 score: 0.8476850765885363
NB, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.852 ; F1 score: 0.8519909053899392


In [20]:
# Naive Bayes on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_and_ngram, train_y, xvalid_count_and_tfidf_and_ngram)
print("NB, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_ngram_chars)
print("NB, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("NB, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Naive Bayes on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("NB, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

NB, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8464 ; F1 score: 0.8465701869216408
NB, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.84 ; F1 score: 0.8401862165151733
NB, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8468 ; F1 score: 0.8469572767962685
NB, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8468 ; F1 score: 0.8469572767962685


In [21]:
# Naive Bayes on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars)
print("NB, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

NB, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8488 ; F1 score: 0.8489512197748936


In [22]:
# Naive Bayes on word-embeddings
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), train_seq_x, train_y, valid_seq_x)
print("NB, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

NB, Word Embeddings: Accuracy: 0.4932 ; F1 score: 0.4936417379611732


In [27]:
# Linear Classifier on Count Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

LR, Count Vectors: Accuracy: 0.8696 ; F1 score: 0.8695919869111356
LR, WordLevel TF-IDF: Accuracy: 0.8764 ; F1 score: 0.8764205199287448
LR, N-Gram Vectors: Accuracy: 0.8364 ; F1 score: 0.8364271606823839
LR, CharLevel Vectors: Accuracy: 0.8412 ; F1 score: 0.8412153792835732


In [29]:
# Linear Classifier on Count Vectors and Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf, train_y, xvalid_count_and_tfidf)
print("LR, Count Vectors and Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Count Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_ngram, train_y, xvalid_count_and_tfidf_ngram)
print("LR, Count Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Count Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_chars)
print("LR, Count Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf_and_ngram, train_y, xvalid_tfidf_and_ngram)
print("LR, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf_and_ngram_chars, train_y, xvalid_tfidf_and_ngram_chars)
print("LR, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_tfidf_ngram_and_ngram_chars, train_y, xvalid_tfidf_ngram_and_ngram_chars)
print("LR, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

LR, Count Vectors and Word Level TF IDF Vectors: Accuracy: 0.87 ; F1 score: 0.8699924482841891
LR, Count Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.874 ; F1 score: 0.8739912292198038
LR, Count Vectors and Character Level TF IDF Vectors: Accuracy: 0.8692 ; F1 score: 0.8691934063876521
LR, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8852 ; F1 score: 0.885216264738809
LR, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8764 ; F1 score: 0.8764070629713262
LR, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8788 ; F1 score: 0.8788069258262518


In [30]:
# Linear Classifier on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_and_ngram, train_y, xvalid_count_and_tfidf_and_ngram)
print("LR, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_ngram_chars)
print("LR, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("LR, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Linear Classifier on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("LR, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

LR, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8732 ; F1 score: 0.8731911735323105
LR, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8696 ; F1 score: 0.8695939899217503
LR, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8752 ; F1 score: 0.8751942480232702
LR, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8752 ; F1 score: 0.8751942480232702


In [31]:
# Linear Classifier on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars)
print("LR, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

LR, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8744 ; F1 score: 0.874393166096001


In [32]:
# Linear Classifier on word-embeddings
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='lbfgs', max_iter = 4000), train_seq_x, train_y, valid_seq_x)
print("LR, word-embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

LR, word-embeddings: Accuracy: 0.4888 ; F1 score: 0.4893942240704081


In [33]:
# SVM on Count Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count, train_y, xvalid_count)
print("SVM, Count Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Word Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, Word Level TF ID: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Ngram Level TF IDF Vectors
v = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("SVM, CharLevel Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on word-embeddings
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), train_seq_x, train_y, valid_seq_x)
print("SVM, word-embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

SVM, Count Vectors: Accuracy: 0.8488 ; F1 score: 0.8488232377049179
SVM, Word Level TF ID: Accuracy: 0.88 ; F1 score: 0.8800129088771582
SVM, N-Gram Vectors: Accuracy: 0.88 ; F1 score: 0.8800129088771582
SVM, CharLevel Vectors: Accuracy: 0.8512 ; F1 score: 0.8512265860795023
SVM, word-embeddings: Accuracy: 0.52 ; F1 score: 0.5203183646005222


In [34]:
# SVM on Count Vectors and Word Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf, train_y, xvalid_count_and_tfidf)
print("SVM, Count Vectors and Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Count Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_ngram, train_y, xvalid_count_and_tfidf_ngram)
print("SVM, Count Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Count Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_chars)
print("SVM, Count Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_and_ngram, train_y, xvalid_tfidf_and_ngram)
print("SVM, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_and_ngram_chars, train_y, xvalid_tfidf_and_ngram_chars)
print("SVM, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_ngram_and_ngram_chars, train_y, xvalid_tfidf_ngram_and_ngram_chars)
print("SVM, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

SVM, Count Vectors and Word Level TF IDF Vectors: Accuracy: 0.8504 ; F1 score: 0.8504267290154137
SVM, Count Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8508 ; F1 score: 0.8508247700110738
SVM, Count Vectors and Character Level TF IDF Vectors: Accuracy: 0.8508 ; F1 score: 0.8508285929416616
SVM, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8852 ; F1 score: 0.8852025908036936
SVM, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8768 ; F1 score: 0.8768132531138825
SVM, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8808 ; F1 score: 0.8808079373860308


In [35]:
# SVM on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_and_ngram, train_y, xvalid_count_and_tfidf_and_ngram)
print("SVM, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_ngram_chars)
print("SVM, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("SVM, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# SVM on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("SVM, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

SVM, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.852 ; F1 score: 0.8520227459016394
SVM, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8516 ; F1 score: 0.8516284396283016
SVM, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.852 ; F1 score: 0.8520227459016394
SVM, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.852 ; F1 score: 0.8520227459016394


In [36]:
# SVM on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(svm.SVC(gamma='scale'), xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars)
print("SVM, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

SVM, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8532 ; F1 score: 0.8532207984639124


In [37]:
# Extereme Gradient Boosting on Count Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
print("Xgb, Ngram TF-IDF: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on word-embeddings
accuracy, f1_score = train_model(xgboost.XGBClassifier(), train_seq_x, train_y, valid_seq_x)
print("Xgb, word-embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Xgb, Count Vectors: Accuracy: 0.8116 ; F1 score: 0.811670095577896
Xgb, WordLevel TF-IDF: Accuracy: 0.8092 ; F1 score: 0.8092709885151941
Xgb, Ngram TF-IDF: Accuracy: 0.7436 ; F1 score: 0.7439053147772856
Xgb, CharLevel Vectors: Accuracy: 0.8136 ; F1 score: 0.8135885457073287
Xgb, word-embeddings: Accuracy: 0.5656 ; F1 score: 0.5672662563948182


In [38]:
# Extereme Gradient Boosting on Count Vectors and Word Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf, train_y, xvalid_count_and_tfidf)
print("Xgb, Count Vectors and Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Count Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_ngram, train_y, xvalid_count_and_tfidf_ngram)
print("Xgb, Count Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Count Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_chars)
print("Xgb, Count Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf_and_ngram, train_y, xvalid_tfidf_and_ngram)
print("Xgb, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf_and_ngram_chars, train_y, xvalid_tfidf_and_ngram_chars)
print("Xgb, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_and_ngram_chars, train_y, xvalid_tfidf_ngram_and_ngram_chars)
print("Xgb, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Xgb, Count Vectors and Word Level TF IDF Vectors: Accuracy: 0.8068 ; F1 score: 0.8068592652166948
Xgb, Count Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8184 ; F1 score: 0.8185125396922951
Xgb, Count Vectors and Character Level TF IDF Vectors: Accuracy: 0.8256 ; F1 score: 0.8256150684010946
Xgb, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8124 ; F1 score: 0.8124697979321299
Xgb, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.824 ; F1 score: 0.8240408914866368
Xgb, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.824 ; F1 score: 0.8239891863108525


In [39]:
# Extereme Gradient Boosting on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_and_ngram, train_y, xvalid_count_and_tfidf_and_ngram)
print("Xgb, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_ngram_chars)
print("Xgb, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("Xgb, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Extereme Gradient Boosting on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars)
print("Xgb, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Xgb, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8132 ; F1 score: 0.8132515619561956
Xgb, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.818 ; F1 score: 0.8180215785034829
Xgb, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.826 ; F1 score: 0.8260333264049332
Xgb, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.826 ; F1 score: 0.8260333264049332


In [40]:
# Extereme Gradient Boosting on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
accuracy, f1_score = train_model(xgboost.XGBClassifier(), xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars, train_y, xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars)
print("Xgb, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Xgb, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8212 ; F1 score: 0.8212342457540348


In [43]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ))
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

# Shallow Neural Network on Ngram Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy, f1_score = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print("NN, Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
NN, Ngram Level TF IDF Vectors: Accuracy: 0.8388 ; F1 score: 0.8389034280405616


In [44]:
# Shallow Neural Network on Count Vectors
classifier = create_model_architecture(xtrain_count.shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count, train_y, xvalid_count, is_neural_net=True)
print("NN, Count Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Word Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf.shape[1])
accuracy, f1_score = train_model(classifier, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True)
print("NN, Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf_ngram_chars.shape[1])
accuracy, f1_score = train_model(classifier,  xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars, is_neural_net=True)
print("NN, Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on word-embeddings
classifier = create_model_architecture(train_seq_x.shape[1])
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("NN, word-embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
NN, Count Vectors: Accuracy: 0.8708 ; F1 score: 0.8708125068021316
Epoch 1/1
NN, Word Level TF IDF Vectors: Accuracy: 0.8672 ; F1 score: 0.8676572464676665
Epoch 1/1
NN, Character Level TF IDF Vectors: Accuracy: 0.84 ; F1 score: 0.8400465970399357
Epoch 1/1
NN, word-embeddings: Accuracy: 0.5032 ; F1 score: 0.5042698963463564


In [50]:
print(type(xtrain_count))
print(type(xtrain_count_and_tfidf.tocsr()))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>


In [64]:
# Shallow Neural Network on Count Vectors and Word Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf.tocsr(), train_y, xvalid_count_and_tfidf.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Word Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Count Vectors and Ngram Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_ngram.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_ngram.tocsr(), train_y, xvalid_count_and_tfidf_ngram.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Count Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_ngram_chars.tocsr(), train_y, xvalid_count_and_tfidf_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf_and_ngram.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_tfidf_and_ngram.tocsr(), train_y, xvalid_tfidf_and_ngram.tocsr(), is_neural_net=True)
print("NN, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Word Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_tfidf_and_ngram_chars.tocsr(), train_y, xvalid_tfidf_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_tfidf_ngram_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_tfidf_ngram_and_ngram_chars.tocsr(), train_y, xvalid_tfidf_ngram_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
NN, Count Vectors and Word Level TF IDF Vectors: Accuracy: 0.8712 ; F1 score: 0.8712457563489141
Epoch 1/1
NN, Count Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8748 ; F1 score: 0.8748049078596316
Epoch 1/1
NN, Count Vectors and Character Level TF IDF Vectors: Accuracy: 0.874 ; F1 score: 0.8741098243549879
Epoch 1/1
NN, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8892 ; F1 score: 0.8892025005317878
Epoch 1/1
NN, Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.878 ; F1 score: 0.8779907270727072
Epoch 1/1
NN, Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8764 ; F1 score: 0.8764379411938494


In [65]:
# Shallow Neural Network on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_and_ngram.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_and_ngram.tocsr(), train_y, xvalid_count_and_tfidf_and_ngram.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_and_ngram_chars.tocsr(), train_y, xvalid_count_and_tfidf_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_ngram_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_ngram_and_ngram_chars.tocsr(), train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

# Shallow Neural Network on Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_ngram_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_ngram_and_ngram_chars.tocsr(), train_y, xvalid_count_and_tfidf_ngram_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
NN, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors: Accuracy: 0.8772 ; F1 score: 0.8772027703719893
Epoch 1/1
NN, Count Vectors and Word Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8688 ; F1 score: 0.8689453679955681
Epoch 1/1
NN, Count Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8776 ; F1 score: 0.877749451699539
Epoch 1/1
NN, Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.8828 ; F1 score: 0.8827963424699036


In [63]:
# Shallow Neural Network on Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors
classifier = create_model_architecture(xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars.tocsr().shape[1])
accuracy, f1_score = train_model(classifier, xtrain_count_and_tfidf_and_tfidf_ngram_and_ngram_chars.tocsr(), train_y, xvalid_count_and_tfidf_and_tfidf_ngram_and_ngram_chars.tocsr(), is_neural_net=True)
print("NN, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
NN, Count Vectors and Word Level TF IDF Vectors and Ngram Level TF IDF Vectors and Character Level TF IDF Vectors: Accuracy: 0.878 ; F1 score: 0.8781981353383459


In [66]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("CNN, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
CNN, Word Embeddings: Accuracy: 0.8228 ; F1 score: 0.8228339589092657


In [67]:
def create_rnn_lstm():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_lstm()
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
RNN-LSTM, Word Embeddings: Accuracy: 0.7976 ; F1 score: 0.7976062200209485


In [68]:
def create_rnn_gru():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_gru()
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-GRU, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
RNN-GRU, Word Embeddings: Accuracy: 0.7652 ; F1 score: 0.7724660856614499


In [69]:
def create_bidirectional_rnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_bidirectional_rnn()
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-Bidirectional, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
RNN-Bidirectional, Word Embeddings: Accuracy: 0.792 ; F1 score: 0.79248


In [70]:
def create_rcnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rcnn()
accuracy, f1_score = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RCNN, Word Embeddings: Accuracy:", accuracy, "; F1 score:", f1_score)

Epoch 1/1
RCNN, Word Embeddings: Accuracy: 0.8276 ; F1 score: 0.8276584451606819
