<a href="https://colab.research.google.com/github/KhosrojerdiA/NLP/blob/main/Twitter_Sentiment_Analysis_using_Spacy_and_ntlk_and_Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import spacy

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
from collections import Counter
import time

In [None]:
# Sample paragraph used by the regex-cleaning and spaCy cells that follow.
# It deliberately contains punctuation, digits, symbols and line breaks.
text = """
Hi! My name is Spencer and I love doing videos related to tech implementation/strategy. If you like finance, tech, or real world
Q&A videos about Data Science, Data Engineering, and Data Analyst related topics, let me know in the comments down below! *psst*
If you made it this far in the video, why don't you subscribe? I upload videos every week on a Sunday ~7:00 PM EST. If you like
this type of content make sure to hit that like button! That really helps out with the growth of this channel. :)
"""

#Process Text

In [None]:
# Delete everything that is neither an ASCII letter nor whitespace.
# Whitespace (including the line breaks inside `text`) has to survive this
# step: deleting a newline glues the last word of one line onto the first
# word of the next one.
pattern = r'[^A-Za-z\s]'
regex = re.compile(pattern)
# split()/join collapses every whitespace run (newlines, and the doubled
# spaces left behind where a whole token such as "~7:00" was removed) into a
# single space and trims the ends.
result = ' '.join(regex.sub('', text).split())
result

'Hi My name is Spencer and I love doing videos related to tech implementationstrategy If you like finance tech or real worldQA videos about Data Science Data Engineering and Data Analyst related topics let me know in the comments down below psstIf you made it this far in the video why dont you subscribe I upload videos every week on a Sunday  PM EST If you likethis type of content make sure to hit that like button That really helps out with the growth of this channel '

In [None]:
# Load the spaCy English pipeline. 'en_core_web_sm' is the small model and
# its package has to be installed already for spacy.load to find it.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(result)

# Materialize the Doc as a plain list holding one spaCy Token per element.
tokens = list(doc)
tokens

[Hi,
 My,
 name,
 is,
 Spencer,
 and,
 I,
 love,
 doing,
 videos,
 related,
 to,
 tech,
 implementationstrategy,
 If,
 you,
 like,
 finance,
 tech,
 or,
 real,
 worldQA,
 videos,
 about,
 Data,
 Science,
 Data,
 Engineering,
 and,
 Data,
 Analyst,
 related,
 topics,
 let,
 me,
 know,
 in,
 the,
 comments,
 down,
 below,
 psstIf,
 you,
 made,
 it,
 this,
 far,
 in,
 the,
 video,
 why,
 do,
 nt,
 you,
 subscribe,
 I,
 upload,
 videos,
 every,
 week,
 on,
 a,
 Sunday,
  ,
 PM,
 EST,
 If,
 you,
 likethis,
 type,
 of,
 content,
 make,
 sure,
 to,
 hit,
 that,
 like,
 button,
 That,
 really,
 helps,
 out,
 with,
 the,
 growth,
 of,
 this,
 channel]

In [None]:
# For every token, print the token itself, spaCy's stop-word flag and the lemma.
for t in tokens:
    print('Token is : ', t,'--- Is this a stop word? ', t.is_stop, '--- Lemmatized token is: ', t.lemma_)

# Keep the lemmas of the tokens that are NOT stop words.
lemmas = [t.lemma_ for t in tokens if not t.is_stop]

Token is :  Hi --- Is this a stop word?  False --- Lemmatized token is:  hi
Token is :  My --- Is this a stop word?  True --- Lemmatized token is:  my
Token is :  name --- Is this a stop word?  True --- Lemmatized token is:  name
Token is :  is --- Is this a stop word?  True --- Lemmatized token is:  be
Token is :  Spencer --- Is this a stop word?  False --- Lemmatized token is:  Spencer
Token is :  and --- Is this a stop word?  True --- Lemmatized token is:  and
Token is :  I --- Is this a stop word?  True --- Lemmatized token is:  I
Token is :  love --- Is this a stop word?  False --- Lemmatized token is:  love
Token is :  doing --- Is this a stop word?  True --- Lemmatized token is:  do
Token is :  videos --- Is this a stop word?  False --- Lemmatized token is:  video
Token is :  related --- Is this a stop word?  False --- Lemmatized token is:  relate
Token is :  to --- Is this a stop word?  True --- Lemmatized token is:  to
Token is :  tech --- Is this a stop word?  False --- Lemma

#Twitter Data

In [None]:
# Load the tweet sentiment data (the target column holds NEGATIVE / POSITIVE).
# The text is described as already cleaned and preprocessed.
# sample(frac=1) shuffles all rows; no seed is given, so the order differs
# from run to run. The old row labels are then replaced by a fresh 0..n-1 index.
df = (
    pd.read_csv('twitter_data.csv')
    .sample(frac=1)
    .reset_index()
    .drop(columns=['index'])
)
df

#Process Data

In [None]:
# Download the NLTK stop-word corpus so stopwords.words() can read it.
nltk.download('stopwords')
# English stop words that preprocess() filters out of every tweet.
stop_words = stopwords.words("english")
# English Snowball stemmer; preprocess() only applies it when stem=True.
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Normalize one tweet into a space-separated string of kept tokens.

    The input is converted with ``str()`` and lower-cased. @mentions, links
    and every run of non-alphanumeric characters are replaced by a space, the
    result is split on whitespace, and tokens found in the module-level
    ``stop_words`` are dropped.

    Args:
        text: The raw tweet. Any object is accepted because it is passed
            through ``str()`` first.
        stem: When true, each kept token is reduced with the module-level
            ``stemmer`` before joining.

    Returns:
        The kept (optionally stemmed) tokens joined by single spaces; an
        empty string when nothing is left.
    """
    # Raw string literal: "\S" inside a normal string is an invalid escape
    # sequence and triggers a warning on current Python versions. The
    # pattern text seen by the regex engine is unchanged.
    cleaned = re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

##Apply

In [None]:
# Clean every tweet; preprocess already takes the raw value as its only
# required argument, so it is handed to apply directly.
df.text = df.text.apply(preprocess)

##Check

In [None]:
# Spot-check the cleaned text stored under row label 0.
df.text[0]

#Split

In [None]:
# Split into train and test dataset: 20% of the rows are held out for testing.
# random_state pins the split for a given row order of df.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

#Word2vec

In [None]:
# Word2Vec is fed one list of tokens per training tweet.
documents = [tweet.split() for tweet in df_train.text]

w2v_model = gensim.models.Word2Vec(
    vector_size=300,  # vector size
    window=7,         # distance between current and predicted word within a sentence
    min_count=10,     # ignores words with total frequency less than the parameter
    workers=8,        # threads
)
# Build the vocabulary from the corpus before any training happens.
w2v_model.build_vocab(documents)
words = w2v_model.wv.index_to_key
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
# Train the word vectors on the tokenized training tweets for 32 epochs.
w2v_model.train(documents, total_examples=len(documents), epochs=32)

In [None]:
# Ask the trained model for the words most similar to "like".
w2v_model.wv.most_similar("like")


# Output recorded from one run:
#[('horrible', 0.3705178201198578),
# ('creepy', 0.3669455945491791),
# ('disgusting', 0.3664887547492981),
# ('understand', 0.3367389440536499),
# ('awful', 0.33474934101104736),
# ('guilty', 0.33130747079849243),
# ('weird', 0.3161929249763489),
# ('tells', 0.30077123641967773),
# ('worse', 0.2977883815765381),
# ('interesting', 0.2937477231025696)]

In [None]:
# Ask the trained model for the words most similar to "comment".
w2v_model.wv.most_similar("comment")
# Output recorded from one run (truncated):
#[('link', 0.4997900128364563),
# ('commented', 0.48193472623825073),
# ('blogtv', 0.45630455017089844),
# ('comments', 0.4551774561405182),
# ('flickr', 0.4323364198207855),
# ('suscribe', 0.43144938349723816), ...]

#Tokenize Text & Create Embedding Layer.
using word2vec

In [None]:
#df = pd.read_csv('twitter_data.csv')
#df = df.sample(frac=1).reset_index()
#df = df.drop(['index'], axis = 1)
#df

# Split into train and test dataset.
#df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
%%time
# Build the word -> integer index from the training tweets only.
tokenizer = Tokenizer() #from Keras
tokenizer.fit_on_texts(df_train.text)

# NOTE: this rebinds vocab_size, which previously held the Word2Vec vocabulary
# size. The +1 presumably accounts for an index (0) that the tokenizer never
# assigns to a word — confirm against the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

#Text to Sequence
New Train and Test

In [None]:
%%time
# Convert each tweet to its list of word indices, then bring every list to
# exactly 300 entries so the batches have one fixed width.
train_sequences = tokenizer.texts_to_sequences(df_train.text)
x_train = pad_sequences(train_sequences, maxlen=300)
test_sequences = tokenizer.texts_to_sequences(df_test.text)
x_test = pad_sequences(test_sequences, maxlen=300)

In [None]:
# Number of padded sequences in the train and test sets.
print(len(x_train), len(x_test))

#Creating an embedding layer
that will act as an input layer for the neural network.

In [None]:
# One 300-wide row per tokenizer index. vocab_size must be the tokenizer-based
# value here. Rows stay all-zero for words Word2Vec holds no vector for.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word not in word_vectors:
        continue
    embedding_matrix[i] = word_vectors[word]
print(embedding_matrix.shape)
# Frozen (trainable=False) embedding layer initialized with the Word2Vec
# vectors; it is added to the model as its first layer.
embedding_layer = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [None]:
# Distinct target values present in the training split.
labels = df_train.target.unique().tolist()
labels

#['NEGATIVE', 'POSITIVE']

#Label encoding of y_train and y_test

In [None]:
# Encode the sentiment labels as integers. The mapping is learned from the
# training targets only and then reused for the test targets.
train_targets = df_train.target.tolist()
test_targets = df_test.target.tolist()

encoder = LabelEncoder()
encoder.fit(train_targets)

# reshape(-1, 1) turns each encoded vector into a single-column 2-D array.
y_train = encoder.transform(train_targets).reshape(-1, 1)
y_test = encoder.transform(test_targets).reshape(-1, 1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)
y_train (80000, 1)
y_test (20000, 1)

#Model

In [None]:
# Layer stack: frozen Word2Vec embeddings -> dropout -> LSTM -> one sigmoid unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    # A BERT-based encoder could be used here instead and may perform better.
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

# Both callbacks use a patience of 5 epochs: one watches validation loss to
# reduce the learning rate, the other watches validation accuracy to stop early.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
early_stop = EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)
callbacks = [lr_on_plateau, early_stop]

In [None]:
%%time
# Train for at most 8 epochs in batches of 1024; validation_split=0.1 sets
# aside 10% of the training data for the val_* metrics the callbacks monitor.
history = model.fit(x_train,
                    y_train,
                    batch_size=1024,
                    epochs=8,
                    validation_split=0.1,
                    verbose=1,
                    callbacks = callbacks)

#Evaluate

In [None]:
# evaluate returns the loss first, followed by the compiled metrics (accuracy).
score = model.evaluate(x_test, y_test, batch_size=32)
test_loss, test_accuracy = score[0], score[1]
print()
print("ACCURACY:",test_accuracy)
print("LOSS:",test_loss)

In [None]:
# Per-epoch curves recorded during model.fit.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(len(acc))

# First figure: training vs. validation accuracy.
plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Second figure: training vs. validation loss.
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

# Author's observation: the model may be overfitting (training loss << validation loss).
plt.show()

#Predict on a sentence

In [None]:
def decode_sentiment(score):
    """Turn a model score into a label: below 0.5 is NEGATIVE, anything else POSITIVE."""
    if score < 0.5:
        return 'NEGATIVE'
    return 'POSITIVE'

def predict(text, include_neutral=True):
    """Classify the sentiment of one piece of text with the trained model.

    Args:
        text: The sentence to classify. It is tokenized with the notebook's
            fitted ``tokenizer`` and padded to 300 entries, the same width
            used for the training data.
        include_neutral: Currently unused; kept so existing calls keep working.

    Returns:
        A dict with ``"label"`` (``'NEGATIVE'`` or ``'POSITIVE'``),
        ``"score"`` (the model output as a Python float) and
        ``"elapsed_time"`` (seconds spent inside this call).
    """
    start_at = time.time()
    # Tokenize text
    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict: take the output row belonging to the single input sentence.
    score = model.predict([padded])[0]
    # That row is an array holding one value. Pull the scalar out explicitly:
    # calling float() on a 1-element array is deprecated in NumPy.
    probability = float(score[0])
    # Decode sentiment
    label = decode_sentiment(probability)

    return {"label": label, "score": probability,
       "elapsed_time": time.time()-start_at}

In [None]:
# Try the trained model on a new sentence.
predict("Leave a like on this video, comment, and subscribe for more!")

# Output recorded from one run:
#{'label': 'POSITIVE',
# 'score': 0.6529418230056763,
# 'elapsed_time': 0.08769869804382324}