In [None]:
# Attach Google Drive to the Colab runtime; the dataset and embedding paths
# used later in the notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.layers import SpatialDropout1D, Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, GlobalMaxPooling1D
from keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy



In [None]:
# Load the tab-separated dataset from the mounted Drive folder.
df = pd.read_csv(
    '/content/gdrive/MyDrive/Final_Dataset.tsv',
    sep='\t',
)

In [None]:
# Preview the first rows. A bare last expression is rendered by the notebook
# as a formatted table, which print() would flatten to plain text.
df.head()

In [None]:
# Keep the rows whose Label is strictly above 3; these form the positive group.
# NOTE(review): presumably Label is a star rating - confirm against the dataset.
is_positive = df['Label'] > 3
positive_reviews = df.loc[is_positive]
positive_reviews

In [None]:
# Keep the rows whose Label is strictly below 3; these form the negative group.
# Rows with Label == 3 fall into neither group.
is_negative = df['Label'] < 3
negative_reviews = df.loc[is_negative]
negative_reviews


In [None]:
# Stack the positive and negative subsets into one frame with a fresh 0..n-1 index.
review_frames = (positive_reviews, negative_reviews)
data = pd.concat(review_frames, ignore_index=True)
data.head()

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same
# tokenizer vocabulary and padded length downstream).
df_train, df_test = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Upper bound on the vocabulary handed to the tokenizer.
MAX_VOCAB_SIZE = 250000

# The filter string lists the characters removed from the text before tokenizing.
tokenizer = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)

# The vocabulary is learned from the training texts only; the held-out texts
# are merely encoded with it.
tokenizer.fit_on_texts(df_train['Text'])
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(frame['Text']) for frame in (df_train, df_test)
)

In [None]:
# word -> integer index mapping learned by the tokenizer, and its size.
word2idx, V = tokenizer.word_index, len(tokenizer.word_index)
V

In [None]:
# Pad the training sequences into a rectangular matrix; its second dimension
# is the sequence length T used for the rest of the notebook.
data_train = pad_sequences(sequences_train)
n_train_rows, T = data_train.shape
T

In [None]:
# Pad/truncate the held-out sequences to the training length T. Without
# maxlen, pad_sequences would size the matrix to the longest *test* sequence,
# giving a width that can differ from the one the model is built for.
data_test = pad_sequences(sequences_test, maxlen=T)


In [None]:
import gensim
Embedding_dim=300
file='/content/gdrive/MyDrive/tweet_cbow_300/tweets_cbow_300'
!iconv -f ISO-8859-1 -t UTF-8 /content/gdrive/MyDrive/tweet_cbow_300/tweets_cbow_300> /content/gdrive/MyDrive/tweets_utf8.txt
file_utf8 = '/content/gdrive/MyDrive/tweets_utf8.txt'
# Initialize an empty embeddings index dictionary
EMBEDDINGS_MATRIX = np.zeros ((V+1, Embedding_dim)) #EMBEDDINGS MATRIX-11
model = gensim.models.Word2Vec.load(file)
for word, i in word2idx.items():
  embedding_vector = None
  if word != "<OOV>":
    if word in model.wv:
      embedding_vector = model.wv[word]
EMBEDDINGS_MATRIX[i] = embedding_vector


In [None]:
# Sanitize the embedding table so only finite values reach the embedding
# layer: NaN becomes 0.0 and infinities become large finite numbers (numpy defaults).
EMBEDDINGS_MATRIX = np.nan_to_num(EMBEDDINGS_MATRIX, nan=0.0)

In [None]:
# Convolution + bidirectional LSTM classifier on top of frozen pretrained embeddings.

# The embedding table has V + 1 rows (matching EMBEDDINGS_MATRIX) and is
# initialised from the pretrained vectors; trainable=False keeps it fixed.
embedding_layer = Embedding(
    V + 1,
    Embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(EMBEDDINGS_MATRIX),
    trainable=False,
)

# Built with tf.keras throughout so the container and the layers come from the
# same Keras implementation. The explicit Input fixes the sequence length to T.
# NOTE(review): the head has 3 output units although only two groups
# (Label > 3 and Label < 3) are kept in the data - confirm the intended class count.
model = tf.keras.Sequential([
    Input(shape=(T,), dtype='int32'),
    embedding_layer,
    Conv1D(filters=300, kernel_size=2, activation='relu'),
    Bidirectional(LSTM(128, dropout=0.2, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.2),
    Dense(3, activation='softmax'),
])


In [None]:
# The output layer already applies softmax, so the loss must treat the model
# outputs as probabilities (from_logits=False). With from_logits=True the
# softmax would effectively be applied twice, distorting loss and gradients.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Stop training once the validation loss has not improved for `patience` epochs.
# NOTE(review): with patience=5 this can only trigger on runs longer than 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Sparse categorical cross-entropy expects integer class ids in
# [0, number_of_output_units). The raw Label values of the kept rows are
# either above 3 or below 3, so the positive ones fall outside that range.
# Encode the two groups as 1 (Label > 3) and 0 (Label < 3) instead.
y_train = (df_train['Label'] > 3).astype('int32').to_numpy()
y_test = (df_test['Label'] > 3).astype('int32').to_numpy()

# NOTE(review): the held-out split doubles as the validation set that drives
# early stopping - consider a separate validation split.
history = model.fit(
    data_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(data_test, y_test),
    verbose=1,
    callbacks=[es],
)



In [None]:
# Per-epoch loss curves recorded by model.fit.
loss_train = history.history['loss']
loss_val = history.history['val_loss']

# Derive the x-axis from the number of epochs that actually ran (early stopping
# may cut training short); a hard-coded range would not match the curve length
# and make the plot call fail.
epochs = range(1, len(loss_train) + 1)


In [None]:
# Draw training and validation loss against the epoch number on one set of axes.
fig, ax = plt.subplots()
ax.plot(epochs, loss_train, 'g', label='Training loss')
ax.plot(epochs, loss_val, 'b', label='validation loss')
ax.set_title('Training and Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()