######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">COVID Tweets: Deep Neural Network Models

- The COVID-19 tweets are cleaned in a separate notebook: Covid_tweets_clean_wordcloud.ipynb
    
######  <span style="font-family: Arial; font-weight:bold;font-size:1.0em;color:#35c33a">Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

######  <span style="font-family: Arial; font-weight:bold;font-size:1.0em;color:#35c33a">Load cleaned tweets (Covid_tweets_clean_wordcloud.ipynb)

In [2]:
# Directory holding the cleaned tweet CSVs (written by Covid_tweets_clean_wordcloud.ipynb).
# Kept in one constant so the notebook can be pointed at another machine's data
# folder by editing a single line.
DATA_DIR = '/Users/preethamvignesh/Desktop/Work/ML_EIT/Data/corona_nlpdata'

train_cleaned = pd.read_csv(f'{DATA_DIR}/covidtweets_train_cleaned.csv')
test_cleaned = pd.read_csv(f'{DATA_DIR}/covidtweets_test_cleaned.csv')

# Show the first rows of both splits as a quick sanity check of the load.
display(train_cleaned.head(), test_cleaned.head())

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,London,2020-03-16,menyrbie chrisitv,Neutral
1,UK,2020-03-16,advice talk neighbour family exchange phone nu...,Positive
2,Vagabonds,2020-03-16,coronavirus australia woolworth give elderly d...,Positive
3,,2020-03-16,food stock one empty please dont panic enough ...,Positive
4,,2020-03-16,ready go supermarket outbreak im paranoid food...,Extremely Negative


Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,NYC,2020-03-02,trending new yorkers encounter empty supermark...,Extremely Negative
1,"Seattle, WA",2020-03-02,couldnt find hand sanitizer fred meyer turned ...,Positive
2,,2020-03-02,find protect loved one coronavirus,Extremely Positive
3,Chicagoland,2020-03-02,panic buying hit newyork city anxious shopper ...,Negative
4,"Melbourne, Victoria",2020-03-03,toiletpaper dunnypaper coronavirus coronavirus...,Neutral


In [3]:
# One-hot encode the Sentiment labels.
# Both splits are encoded against the SAME ordered class list (taken from the
# training split). Calling get_dummies independently on each split would give
# fewer / shifted columns whenever a class is missing from one of them, and the
# column order is what ties a softmax output unit to a sentiment.
sentiment_classes = sorted(train_cleaned.Sentiment.dropna().unique())
sentiment_dtype = pd.CategoricalDtype(categories=sentiment_classes)

# dtype is pinned to uint8 so the arrays are numeric 0/1 regardless of the
# pandas version's default dummy dtype.
y_train = pd.get_dummies(train_cleaned.Sentiment.astype(sentiment_dtype), dtype=np.uint8).values
y_test = pd.get_dummies(test_cleaned.Sentiment.astype(sentiment_dtype), dtype=np.uint8).values
y_train

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Modeling Deep Neural Network with Keras

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Count Vectorizer Models:
    
- Prepare the data with CountVectorizer method:

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one column per vocabulary word, value = occurrence count.
# The vocabulary is learned from the training tweets only and then re-used for
# the test tweets.
# float32 is requested instead of the int64 default because the matrices are
# densified below: 4-byte cells need half the memory of 8-byte cells, which
# matters for a matrix with tens of thousands of columns.
vectorizer = CountVectorizer(dtype=np.float32)

X_train = vectorizer.fit_transform(train_cleaned.OriginalTweet.values)
X_test = vectorizer.transform(test_cleaned.OriginalTweet.values)

# The Dense models below are fed plain NumPy arrays, so convert from sparse.
X_train = X_train.toarray()
X_test = X_test.toarray()

In [5]:
# Shapes of the feature matrices and the one-hot targets (train, test, train, test).
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((41134, 52500), (3798, 52500), (41134, 5), (3798, 5))

######  <span style="font-family: Arial; font-weight:bold;font-size:1.0em;color:#c3b235">Simple One Layer Model:

In [7]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate = 0.01)

# Baseline network on the bag-of-words features:
# one 16-unit ReLU hidden layer, then a 5-unit softmax (one unit per target column).
model_simple_count = Sequential()
model_simple_count.add(Dense(16, input_dim = X_train.shape[1], activation = 'relu'))
model_simple_count.add(Dense(5, activation = 'softmax'))

In [8]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_simple_count.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_simple_count.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                840016    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 85        
Total params: 840,101
Trainable params: 840,101
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_simple_count = model_simple_count.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [10]:
#Save models and history
# model_simple_count.save('/Users/preethamvignesh/Downloads/Simple_model_Count.h5')
# np.save('/Users/preethamvignesh/Downloads/history_simple_count.npy',history_simple_count.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Multi layers model:

In [12]:
# Fresh optimizer instance for this model.
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate = 0.01)

# Deeper network on the bag-of-words features:
# 64 -> 32 -> 16 ReLU hidden units, then a 5-unit softmax output.
model_multi_count = Sequential()
model_multi_count.add(Dense(64, input_dim = X_train.shape[1], activation = 'relu'))
model_multi_count.add(Dense(32, activation = 'relu'))
model_multi_count.add(Dense(16, activation = 'relu'))
model_multi_count.add(Dense(5, activation = 'softmax'))

In [13]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_multi_count.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_multi_count.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                3360064   
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_8 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 85        
Total params: 3,362,757
Trainable params: 3,362,757
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_multi_count = model_multi_count.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [15]:
# Save models and history
# model_multi_count.save('/Users/preethamvignesh/Downloads/model_multi_count.h5')
# np.save('/Users/preethamvignesh/Downloads/history_multi_count.npy',history_multi_count.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Embedding models:

In [17]:
# Word count of every cleaned training tweet (whitespace-separated tokens),
# stored as a new column; the maximum guides the choice of padded sequence length.
train_cleaned['num_words'] = [len(tweet.split()) for tweet in train_cleaned.OriginalTweet]
max(train_cleaned['num_words'])

39

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Flatten, GlobalMaxPool1D, Conv1D

# Length every tweet is padded / truncated to. The longest training tweet has
# more words than this, so the longest tweets lose some tokens.
maxlen = 30

# Word -> integer id mapping, learned from the training tweets only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_cleaned.OriginalTweet.values)

# Replace each tweet by its id sequence, then force a fixed length of `maxlen`.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_cleaned.OriginalTweet.values),
    maxlen=maxlen,
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_cleaned.OriginalTweet.values),
    maxlen=maxlen,
)

# +1 leaves room for id 0, which the word index does not use.
# NOTE(review): word_index lists every word seen during fitting, not only the
# `num_words` most frequent ones, so this is larger than the range of ids that
# can appear in X_train / X_test - confirm the Embedding layers are meant to be
# sized this way.
vocab_size = len(tokenizer.word_index) + 1

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Single layer Model:

In [23]:
embedding_dim = 30
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate = 0.01)

# Embedding learned from scratch: each of the `maxlen` token ids becomes an
# `embedding_dim`-sized vector; the vectors are flattened and fed to a
# 16-unit ReLU layer followed by the 5-unit softmax output.
model_simple_embed = Sequential()
model_simple_embed.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model_simple_embed.add(Flatten())
model_simple_embed.add(Dense(16, activation = 'relu'))
model_simple_embed.add(Dense(5, activation = 'softmax'))

In [24]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_simple_embed.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_simple_embed.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 30)            1575780   
_________________________________________________________________
flatten (Flatten)            (None, 900)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                14416     
_________________________________________________________________
dense_11 (Dense)             (None, 5)                 85        
Total params: 1,590,281
Trainable params: 1,590,281
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_simple_embed = model_simple_embed.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [26]:
#Save models and history
# model_simple_embed.save('/Users/preethamvignesh/Downloads/model_simple_embed.h5')
# np.save('/Users/preethamvignesh/Downloads/history_simple_embed.npy',history_simple_embed.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Multi layer Model:

In [27]:
embedding_dim = 30
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate = 0.01)

# Embedding learned from scratch, flattened, then a 64 -> 32 -> 16 ReLU stack
# and the 5-unit softmax output.
multi_model_Embed = Sequential()
multi_model_Embed.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
multi_model_Embed.add(Flatten())
multi_model_Embed.add(Dense(64, activation = 'relu'))
multi_model_Embed.add(Dense(32, activation = 'relu'))
multi_model_Embed.add(Dense(16, activation = 'relu'))
multi_model_Embed.add(Dense(5, activation = 'softmax'))

In [28]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
multi_model_Embed.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
multi_model_Embed.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 30)            1575780   
_________________________________________________________________
flatten_1 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 64)                57664     
_________________________________________________________________
dense_13 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_14 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_15 (Dense)             (None, 5)                 85        
Total params: 1,636,137
Trainable params: 1,636,137
Non-trainable params: 0
____________________________________________

In [29]:
# Train for 3 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_multi_Embed = multi_model_Embed.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [30]:
#Save models and history
# multi_model_Embed.save('/Users/preethamvignesh/Downloads/multi_model_Embed.h5')
# np.save('/Users/preethamvignesh/Downloads/history_multi_Embed.npy',history_multi_Embed.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Glove Dictionary Models:

In [31]:
# Parse the pre-trained GloVe file into {word: vector}.
# Each line is "<word> <v1> <v2> ..."; the numbers are parsed as float32 ("f").
embeddings_index = {}
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can fail or mis-decode the file on systems
# whose default is not UTF-8.
with open('/Users/preethamvignesh/Desktop/Work/ML_EIT/Data/corona_nlpdata/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Split off the first whitespace-separated field (the word); the rest
        # of the line is the vector.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [32]:
num_tokens = len(tokenizer.word_index) + 1
embedding_dim = 50

# Row i of the matrix holds the GloVe vector of the word whose tokenizer id is i.
# Rows the loop never assigns (e.g. words with no GloVe entry) stay all-zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = 0    # words found in the GloVe index
misses = 0  # words absent from the GloVe index
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 25072 words (27453 misses)


######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Simple Glove Model:

In [36]:
embedding_dim = 50
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate=0.01)

# Embedding initialised from the GloVe matrix and frozen (trainable = False),
# so only the Dense layers are trained: 16-unit ReLU, then 5-unit softmax.
model_simple_glove = Sequential()
model_simple_glove.add(Embedding(vocab_size, embedding_dim,input_length=maxlen, weights = [embedding_matrix], trainable = False))
model_simple_glove.add(Flatten())
model_simple_glove.add(Dense(16, activation = 'relu'))
model_simple_glove.add(Dense(5, activation = 'softmax'))

In [37]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_simple_glove.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_simple_glove.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 50)            2626300   
_________________________________________________________________
flatten_3 (Flatten)          (None, 1500)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 16)                24016     
_________________________________________________________________
dense_19 (Dense)             (None, 5)                 85        
Total params: 2,650,401
Trainable params: 24,101
Non-trainable params: 2,626,300
_________________________________________________________________


In [38]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_simple_glove = model_simple_glove.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [40]:
#Save models and history
# model_simple_glove.save('/Users/preethamvignesh/Downloads/model_simple_glove.h5')
# np.save('/Users/preethamvignesh/Downloads/history_simple_glove.npy',history_simple_glove.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Multi layers Glove model:

In [44]:
embedding_dim = 50
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate = 0.01)

# Embedding initialised from the GloVe matrix and frozen (trainable = False),
# flattened, then a 64 -> 32 -> 16 ReLU stack and the 5-unit softmax output.
model_multi_glove = Sequential()
model_multi_glove.add(Embedding(vocab_size, embedding_dim,input_length=maxlen, weights = [embedding_matrix], trainable = False))
model_multi_glove.add(Flatten())
model_multi_glove.add(Dense(64, activation = 'relu'))
model_multi_glove.add(Dense(32, activation = 'relu'))
model_multi_glove.add(Dense(16, activation = 'relu'))
model_multi_glove.add(Dense(5, activation = 'softmax'))

In [45]:
# Compile with the `opti` instance built in the previous cell, as every other
# model in this notebook does. Passing the string 'adam' would create a new
# Adam optimizer with Keras' default learning rate and leave `opti` unused, so
# this model would be trained with different settings than the models it is
# compared against.
model_multi_glove.compile(loss = 'categorical_crossentropy', optimizer = opti, metrics = ['accuracy'])
model_multi_glove.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 30, 50)            2626300   
_________________________________________________________________
flatten_6 (Flatten)          (None, 1500)              0         
_________________________________________________________________
dense_28 (Dense)             (None, 64)                96064     
_________________________________________________________________
dense_29 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_30 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_31 (Dense)             (None, 5)                 85        
Total params: 2,725,057
Trainable params: 98,757
Non-trainable params: 2,626,300
______________________________________

In [46]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_Multi_glove = model_multi_glove.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [47]:
#Save models and history
# model_multi_glove.save('/Users/preethamvignesh/Downloads/model_multi_glove.h5')
# np.save('/Users/preethamvignesh/Downloads/history_Multi_glove.npy',history_Multi_glove.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Convolutional Neural Networks:

In [52]:
from tensorflow.keras.layers import Dropout

embedding_dim = 30
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate=0.01)

# 1-D convolutional text classifier:
# learned embedding -> 20% dropout -> 16 filters of width 3 over the token axis
# -> max over all positions (one value per filter) -> 5-unit softmax output.
model_Conv = Sequential()
model_Conv.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model_Conv.add(Dropout(0.2))
model_Conv.add(Conv1D(16, 3, activation='relu'))
model_Conv.add(GlobalMaxPool1D())
model_Conv.add(Dense(5, activation = 'softmax'))

In [53]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_Conv.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_Conv.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 30, 30)            1575780   
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 30)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 28, 16)            1456      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 16)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 5)                 85        
Total params: 1,577,321
Trainable params: 1,577,321
Non-trainable params: 0
_________________________________________________________________


In [54]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_Conv = model_Conv.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [56]:
# model_Conv.save('/Users/preethamvignesh/Downloads/NN_Models/model_Conv.h5')
# np.save('/Users/preethamvignesh/Downloads/history_Conv.npy',history_Conv.history)

######  <span style="font-family: Arial; font-weight:bold;font-size:1.25em;color:#c3b235">Convolutional with Glove dictionary

In [57]:
embedding_dim = 50
# `learning_rate` is the documented argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
opti = Adam(learning_rate=0.01)

# 1-D convolutional text classifier on top of GloVe-initialised embeddings:
# embedding -> 20% dropout -> 16 filters of width 3 -> global max pooling
# -> 5-unit softmax output.
# NOTE(review): unlike the two Dense GloVe models, `trainable = False` is not
# passed here, so the GloVe weights are updated during training - confirm that
# fine-tuning the embeddings is intended for this model.
model_Conv_glove = Sequential()
model_Conv_glove.add(Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim,
                           weights = [embedding_matrix],
                           input_length=maxlen))
model_Conv_glove.add(Dropout(0.2))
model_Conv_glove.add(Conv1D(16, 3, activation='relu'))
model_Conv_glove.add(GlobalMaxPool1D())
model_Conv_glove.add(Dense(5, activation = 'softmax'))

In [58]:
# Attach loss, optimizer and metric, then print the per-layer parameter table.
model_Conv_glove.compile(
    optimizer=opti,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_Conv_glove.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 30, 50)            2626300   
_________________________________________________________________
dropout_3 (Dropout)          (None, 30, 50)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 28, 16)            2416      
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 16)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 5)                 85        
Total params: 2,628,801
Trainable params: 2,628,801
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Train for 2 epochs in mini-batches of 16. The test split is passed as
# validation data, so its loss/accuracy are evaluated at the end of each epoch.
history_Conv_glove = model_Conv_glove.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=2,
    validation_data=(X_test, y_test),
    verbose=True,
)

Epoch 1/2
Epoch 2/2


In [None]:
# model_Conv_glove.save('/Users/preethamvignesh/Downloads/model_Conv_glove.h5')
# np.save('/Users/preethamvignesh/Downloads/history_Conv_glove.npy',history_Conv_glove.history)


In [None]:
# import re
# import nltk
# import tensorflow as tf
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# from tensorflow import keras
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.layers import LSTM
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.layers import Embedding, Flatten, GlobalMaxPool1D, Conv1D
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adam
# from sklearn.feature_extraction.text import TfidfVectorizer
# from wordcloud import WordCloud
# from sklearn.model_selection import RandomizedSearchCV
# from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')