In [None]:
# Matplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re    
import os
from collections import Counter
import logging
import time    
import pickle
import itertools

In [None]:
import pandas as pd

# The CSV is read with its own header row; later cells address the columns
# "ids", "date", "flag", "user", "text" and "target" by name.
DATASET_ENCODING = "ISO-8859-1"
path = 'Dataset.csv'

df = pd.read_csv(path, encoding=DATASET_ENCODING)
df.head()

In [None]:
# Keep only the columns the rest of the notebook uses.
# Reassigning with errors='ignore' (instead of an in-place drop) makes this cell
# safe to re-run: the in-place version raised KeyError on a second execution
# because the columns were already gone.
df = df.drop(columns=['ids', 'date', 'flag', 'user'], errors='ignore')
df.head()

In [None]:
# Fetch the NLTK stop-word corpus, then build the two resources used by
# preprocess(): the English stop-word list and an English Snowball stemmer.
nltk.download('stopwords')
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [None]:
# Negation vocabulary: a few stand-alone words followed by every contraction
# stem together with its "'t" form (e.g. 'don' and "don't"). The bare stems and
# 't' are listed because text cleaning strips apostrophes before tokenising.
neg = ['ain', 'no', 'not', 't'] + [
    form
    for stem in (
        'don', 'aren', 'couldn', 'didn',
        'doesn', 'hadn', 'hasn', 'haven', 'isn', 'amn', 'mightn',
        'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren',
        'won', 'wouldn',
    )
    for form in (stem, stem + "'t")
]

In [None]:
# Negation words must survive stop-word removal, so take them out of the list.
stop_words = list(filter(lambda word: word not in neg, stop_words))

In [None]:
# TEXT CLEANING
# Raw string literal: "\S" is not a valid escape sequence in a normal string and
# triggers an invalid-escape warning on recent Python versions. The pattern
# value itself is unchanged.
# Alternatives, in order: @mentions, http/https links, a leftover short
# "http:"/"htt:" form, and any run of characters that are not ASCII letters/digits.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def preprocess(text):
    """Normalise one tweet into a space-separated string of tokens.

    The input is converted with ``str()``, lower-cased, and everything matched
    by ``TEXT_CLEANING_RE`` is replaced with a space. Each remaining token is
    then handled as follows:

    * tokens found in ``stop_words`` are dropped;
    * tokens found in ``neg`` are replaced by the literal ``"not"``;
    * every other token is replaced by ``stemmer.stem(token)``.

    Args:
        text: The raw tweet; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The processed tokens joined by single spaces (``""`` when no
        token survives).

    Note:
        Apostrophes are removed by the cleaning pattern, so a contraction such
        as "don't" reaches the loop as the two tokens "don" and "t"; both are
        in ``neg`` and therefore yield two consecutive "not" tokens.
    """
    # Remove links, user mentions and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        if token in neg:
            # Collapse every negation form into the single marker "not"
            tokens.append("not")
        else:
            # Reduce the word to its stem
            tokens.append(stemmer.stem(token))
    return " ".join(tokens)

In [None]:
%%time
df['TweetText'] = df['text'].apply(lambda x: preprocess(x))
df.head()

In [None]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; random_state is fixed so
# the split is the same on every run.
TRAIN_SIZE = 0.9
df_train, df_test = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

In [None]:
# One token list per training tweet (whitespace split of the cleaned text).
documents = list(map(str.split, df_train.TweetText))

In [None]:
# Create an untrained Word2Vec model; vocabulary and training follow in later
# cells. The vector size of 300 matches the 300-column embedding matrix built
# further down.
# NOTE(review): `size` is the keyword name used here; newer gensim releases may
# expect a different name — confirm against the installed version.
w2v_model = gensim.models.word2vec.Word2Vec(
    min_count=10,
    size=300,
    window=7,
    workers=8,
)

In [None]:
# Build the Word2Vec vocabulary from the tokenised training tweets.
w2v_model.build_vocab(documents)

In [None]:
# Report how many distinct words the Word2Vec vocabulary holds.
# Note: `vocab_size` is reassigned in a later cell to the Keras tokenizer's
# vocabulary size, so this value is only meaningful for the print below.
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
%%time
# Train the embeddings on the tokenised training tweets for 32 epochs.
w2v_model.train(documents, total_examples=len(documents), epochs=32)

In [None]:
# Persist the trained Word2Vec model so it can be reloaded without retraining.
w2v_model.save('w2v.model')

In [None]:
# Reload the Word2Vec model from the file written by the previous cell.
w2v_model=gensim.models.Word2Vec.load('w2v.model')

In [None]:
# Show the learned vector for "like". Word vectors are read through `.wv`
# (as the embedding-matrix cell does); indexing the model object directly is a
# deprecated shortcut that newer gensim releases no longer provide.
w2v_model.wv['like']

In [None]:
# Nearest neighbours of "love" in the embedding space. Queried through `.wv`
# for the same reason as the vector lookup: the method on the model object
# itself is deprecated.
w2v_model.wv.most_similar("love")

In [None]:
%%time
# Fit the Keras tokenizer on the cleaned training tweets only, so the word
# index is derived from training data and not from the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.TweetText)

In [None]:
%%time
# Convert each cleaned tweet to a sequence of word indices and pad to a fixed
# length of 300, the same value the Embedding layer uses as input_length.
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.TweetText), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.TweetText), maxlen=300)

In [None]:
# NEUTRAL was referenced here without being defined in any earlier cell, which
# raised NameError on a fresh kernel.
# NOTE(review): the value below is a placeholder label — confirm it matches the
# label used elsewhere in the project.
NEUTRAL = "NEUTRAL"

# Distinct target values seen in the training split, plus the neutral label.
labels = df_train.target.unique().tolist()
labels.append(NEUTRAL)

In [None]:
# Fit the label encoder on the training targets (fit() returns the encoder).
encoder = LabelEncoder().fit(df_train.target.tolist())

# Encode both splits and reshape each into a single column.
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)

In [None]:
# Size of the embedding table: one row per tokenizer word plus one extra row.
# This overwrites the Word2Vec `vocab_size` computed in an earlier cell.
# NOTE(review): the +1 presumably accounts for index 0, which the tokenizer
# does not assign to any word — confirm.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [None]:
# Row i of the matrix holds the Word2Vec vector of the tokenizer word with
# index i; words the Word2Vec model does not know keep an all-zero row.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

In [None]:
# WORD2VEC
# Same numbers as the literals passed when the Word2Vec model was created.
W2V_SIZE, W2V_WINDOW, W2V_MIN_COUNT = 300, 7, 10

# Frozen embedding layer initialised from the Word2Vec-derived matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [None]:
# Embedding -> dropout -> LSTM -> single sigmoid unit (binary output).
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked so the history and the evaluation cell can report it.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])        

In [None]:
# Training callbacks: lower the learning rate when validation loss stalls, and
# stop early when validation accuracy stops improving.
# NOTE(review): 'val_acc' is only logged under that name by some Keras
# versions (others use 'val_accuracy') — confirm the key exists, otherwise
# EarlyStopping has nothing to monitor.
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

In [None]:
%%time
# Train for up to 8 epochs, holding out 10% of the training data for
# validation; the callbacks defined above may adjust the learning rate or stop
# training early.
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=8,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

In [None]:
%%time
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

In [None]:
# Plot the training curves recorded by model.fit.
# `acc` was previously never assigned, so this cell failed with NameError.
# The accuracy key depends on the Keras version ('acc' on some, 'accuracy' on
# others), so pick whichever one the history actually contains.
history_metrics = history.history
acc_key = 'acc' if 'acc' in history_metrics else 'accuracy'

acc = history_metrics[acc_key]
val_acc = history_metrics['val_' + acc_key]
loss = history_metrics['loss']
val_loss = history_metrics['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Second figure for the loss curves
plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
# Labels and score cut-offs used by decode_sentiment. None of these names was
# defined anywhere in the notebook, so every call raised NameError.
# NOTE(review): the label strings and the (lower, upper) thresholds below are
# assumptions — confirm them against the intended label scheme.
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    Args:
        score: Model output for one sample; compared numerically against the
            cut-offs.
        include_neutral: When True, use the two cut-offs in
            ``SENTIMENT_THRESHOLDS``: scores at or below the first are
            ``NEGATIVE``, scores at or above the second are ``POSITIVE``, and
            anything in between is ``NEUTRAL``. When False, use a single 0.5
            cut-off: below it is ``NEGATIVE``, otherwise ``POSITIVE``.

    Returns:
        One of ``NEGATIVE``, ``NEUTRAL`` or ``POSITIVE``.
    """
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    lower, upper = SENTIMENT_THRESHOLDS
    if score <= lower:
        return NEGATIVE
    if score >= upper:
        return POSITIVE
    return NEUTRAL

In [None]:
# Padding length for inference; it must equal the maxlen used when the training
# sequences were padded (300). The name was previously undefined.
SEQUENCE_LENGTH = 300

def predict(text, include_neutral=True):
    """Score a single piece of text with the trained model.

    The text goes through the same ``preprocess`` step as the training data
    (the tokenizer was fitted on preprocessed tweets, so raw text would not
    line up with its vocabulary), is tokenised and padded, and is then scored
    by the model.

    Args:
        text: The raw text to classify.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        dict: ``{"label": <decoded label>, "score": <float model output>,
        "elapsed_time": <seconds spent in this call>}``.
    """
    start_at = time.time()
    # Clean, tokenise and pad the text exactly like the training inputs
    cleaned = preprocess(text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=SEQUENCE_LENGTH)
    # The model returns one row with one sigmoid output; take it as a plain
    # Python float rather than converting a 1-element array.
    score = float(model.predict(padded)[0][0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
       "elapsed_time": time.time()-start_at}

In [None]:
# Try the classifier on a sentence without any negation word.
predict("I love the music")

In [None]:
# Try the classifier on a sentence containing a contraction ("don't").
predict("i don't know what i'm doing")

In [None]:
# Try the classifier on a second sentence without negation words.
predict("I hate the rain")

In [None]:
%%time
y_pred_1d = []
y_test_1d = list(df_test.target)
scores = model.predict(x_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(score, include_neutral=False) for score in scores]

In [None]:
%%time
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d)
plt.figure(figsize=(12,12))
plot_confusion_matrix(cnf_matrix, classes=df_train.target.unique(), title="Confusion matrix")
plt.show()

In [None]:
# Per-class metrics for the test-set predictions.
print(classification_report(y_test_1d, y_pred_1d))

In [None]:
# Overall accuracy of the test-set predictions.
accuracy_score(y_test_1d, y_pred_1d)

In [None]:
# Output paths for the trained artefacts. These names were never defined in the
# notebook, so this cell raised NameError.
# NOTE(review): the Keras/tokenizer/encoder file names are assumptions —
# confirm them. The Word2Vec path reuses the file name the model was already
# saved to earlier in the notebook.
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "w2v.model"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

model.save(KERAS_MODEL)
w2v_model.save(WORD2VEC_MODEL)

# Context managers make sure the pickle files are flushed and closed.
with open(TOKENIZER_MODEL, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=0)
with open(ENCODER_MODEL, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file, protocol=0)