In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer as Tokenizer_tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.python.client import device_lib
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import RegexpTokenizer
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import re

Google Word2Vec Embedding Trained File:
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?pli=1&resourcekey=0-wjGZdNAUop6WykTtMip30g

In [2]:
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9536113936945933956
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14357954560
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 9337607214970451033
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

In [4]:
columns = ['Tweet ID', 'Entity', 'Sentiment', 'Tweet Content']
df = pd.read_csv('Train.csv', names = columns)
df.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df.drop(['Tweet ID', 'Entity'], axis = 1, inplace = True)
df.tail()

Unnamed: 0,Sentiment,Tweet Content
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...
74681,Positive,Just like the windows partition of my Mac is l...


In [6]:
def show_details(dataset):
    missed_values = dataset.isnull().sum()
    duplicated_values = dataset.duplicated().sum()
    info_frame = pd.DataFrame({'Missed_Values' : missed_values ,
                              'Duplicated values' :duplicated_values
                               })
    return info_frame.T

show_details(df)

Unnamed: 0,Sentiment,Tweet Content
Missed_Values,0,686
Duplicated values,4909,4909


In [7]:
df.drop_duplicates(inplace = True)
df.dropna(inplace = True)
show_details(df)

Unnamed: 0,Sentiment,Tweet Content
Missed_Values,0,0
Duplicated values,0,0


In [8]:
def text_cleaner(txt) :
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # EMOTICONS
                           u"\U0001F300-\U0001F5FF"  # SYMBOLS
                           u"\U0001F680-\U0001F6FF"  # TRANSPORT
                           u"\U0001F1E0-\U0001F1FF"  # FLAGS
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags = re.UNICODE)
    text = emoji_pattern.sub(r'', txt)
    text = re.sub(r"won\'t", "would not", text)
    text = re.sub(r"im", "i am", text)
    text = re.sub(r"Im", "I am", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"don\'t", "do not", text)
    text = re.sub(r"shouldn\'t", "should not", text)
    text = re.sub(r"needn\'t", "need not", text)
    text = re.sub(r"hasn\'t", "has not", text)
    text = re.sub(r"haven\'t", "have not", text)
    text = re.sub(r"weren\'t", "were not", text)
    text = re.sub(r"mightn\'t", "might not", text)
    text = re.sub(r"didn\'t", "did not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\!\?\.\@]',' ' , text)
    text = re.sub(r'[!]+' , '!' , text)
    text = re.sub(r'[?]+' , '?' , text)
    text = re.sub(r'[.]+' , '.' , text)
    text = re.sub(r'[@]+' , '@' , text)
    text = re.sub(r'unk' , ' ' , text)
    text = re.sub('\n', '', text)
    text = text.lower()
    text = re.sub(r'[ ]+' , ' ' , text)

    return text

In [9]:
df['Tweet Content'] = df['Tweet Content'].apply(lambda x : text_cleaner(x))
df

Unnamed: 0,Sentiment,Tweet Content
0,Positive,i am getting on borderlands and i will murder ...
1,Positive,i am coming to the borders and i will kill you...
2,Positive,i am getting on borderlands and i will kill yo...
3,Positive,i am coming on borderlands and i will murder y...
4,Positive,i am getting on borderlands 2 and i will murde...
...,...,...
74677,Positive,just realized that the windows partition of my...
74678,Positive,just realized that my mac window partition is ...
74679,Positive,just realized the windows partition of my mac ...
74680,Positive,just realized between the windows partition of...


In [10]:
df['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [11]:
df['Sentiment'].replace({'Positive' : 1 ,  'Negative' : 0 ,'Neutral': 2 , 'Irrelevant' : 2}, inplace = True)
df.head()

Unnamed: 0,Sentiment,Tweet Content
0,1,i am getting on borderlands and i will murder ...
1,1,i am coming to the borders and i will kill you...
2,1,i am getting on borderlands and i will kill yo...
3,1,i am coming on borderlands and i will murder y...
4,1,i am getting on borderlands 2 and i will murde...


In [12]:
Tokenizer = RegexpTokenizer(r'\w+')

df['Tweet Content']= df['Tweet Content'].apply(lambda x : Tokenizer.tokenize(x))
df.tail()

Unnamed: 0,Sentiment,Tweet Content
74677,1,"[just, realized, that, the, windows, partition..."
74678,1,"[just, realized, that, my, mac, window, partit..."
74679,1,"[just, realized, the, windows, partition, of, ..."
74680,1,"[just, realized, between, the, windows, partit..."
74681,1,"[just, like, the, windows, partition, of, my, ..."


In [13]:
nltk.download('stopwords')
stopwords_list = stopwords.words('english')

# nltk.download('wordnet')
# stopwords_list = wordnet.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# lemmatizer = WordNetLemmatizer()
Stemmer = PorterStemmer()

def stopwords_cleaner(text):
    # word = [lemmatizer.lemmatize(letter) for letter in text if letter not in stopwords_list]
    word = [Stemmer.stem(letter) for letter in text if letter not in stopwords_list]
    sentence = ' '.join(word)

    return sentence

df['Tweet Content'] = df['Tweet Content'].apply(lambda x : stopwords_cleaner(x))
df.head()

Unnamed: 0,Sentiment,Tweet Content
0,1,get borderland murder
1,1,come border kill
2,1,get borderland kill
3,1,come borderland murder
4,1,get borderland 2 murder


In [15]:
MAX_LEN = 300
BATCH = 16
HIDDEN = 128
OUTPUT = 3

In [16]:
x_train, x_val, y_train, y_val = train_test_split(df['Tweet Content'], df['Sentiment'], test_size = 0.2, random_state = 42)

In [17]:
Tokenizer = Tokenizer_tf()
Tokenizer.fit_on_texts(df['Tweet Content'].values.tolist())

x_train = Tokenizer.texts_to_sequences(x_train)
x_val = Tokenizer.texts_to_sequences(x_val)

In [18]:
x_train = pad_sequences(x_train, maxlen = MAX_LEN)
x_val = pad_sequences(x_val, maxlen = MAX_LEN)

In [20]:
embedding_path = '/content/drive/MyDrive/Colab Notebooks/Data/Embeddings/glove.6B.300d.txt'
embedding_dim = 300

embedding_matrix = {}
with open(embedding_path, 'r', encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        embedding_matrix[word] = vector

In [21]:
vocab_size = len(embedding_matrix)
embedding_layer = tf.keras.layers.Embedding(
    input_dim = vocab_size,
    output_dim = embedding_dim,
    weights = [np.array(list(embedding_matrix.values()))],
    input_length = MAX_LEN,
    trainable = False)

In [25]:
model = Sequential()

model.add(embedding_layer)
model.add(SimpleRNN(128, activation = 'tanh', use_bias = True))
model.add(Dense(16, activation = 'relu', use_bias = True))
model.add(Dense(16, activation = 'relu', use_bias = True))
model.add(Dense(3, activation = 'softmax'))

ALFA = 0.0005

adam_optimizer = Adam(learning_rate = ALFA)

model.compile(optimizer = adam_optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          120000000 
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               54912     
                                                                 
 dense_3 (Dense)             (None, 16)                2064      
                                                                 
 dense_4 (Dense)             (None, 16)                272       
                                                                 
 dense_5 (Dense)             (None, 3)                 51        
                                                                 
Total params: 120057299 (457.98 MB)
Trainable params: 57299 (223.82 KB)
Non-trainable params: 120000000 (457.76 MB)
_________________________________________________________________


In [None]:
model.fit(x_train, y_train, validation_data = (x_val, y_val), epochs = 10, batch_size = BATCH)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 722/3489 [=====>........................] - ETA: 10:55 - loss: 0.4474 - accuracy: 0.8299