In [42]:
import re
import os 
import sys
from pathlib import Path 
import pickle 

import pandas as pd
import numpy as np

import string
from string import punctuation
from collections import Counter

from google.colab import drive
drive.mount('/content/drive/')
from drive.MyDrive.sentiment.preprocess import * 

import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [12]:
!ls drive/MyDrive/sentiment/

data	    preprocess.py  sentiment_analysis_Welsh.ipynb
lstm.ipynb  __pycache__    WelshSentiment.ipynb


In [48]:
X_data, labels = open_text('train-v2.tsv')
y_train = np.array(labels)
X_data_test, labels_test = open_text('test.tsv')
y_test = np.array(labels_test)

In [49]:
def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation.replace("@","")))

In [50]:
def tokenize(text, delimiter = " "):
    return text.split(" ")

In [51]:
def remove_stopwords(word_list, stopwords_list  = ["@user","{url}"]):
    temp_word_list = []
    for w in word_list: 
        if not (bool(re.match("(^@+\d*\w*\d*)", w)) or 
                bool(re.match("(^\d+[\s]*\w*$)", w)) or 
                bool(re.match("(^\d*[\S]*\w*\d+$)", w))):
            temp_word_list.append(w)
    word_list = [w.lower() for w in temp_word_list if not w.lower() in stopwords_list] 
    return word_list

In [52]:
def process_corpora(corpora_list): 
    processed_text = []
    global_word_list = []
    for text in corpora_list: 
        text = remove_punc(text)
        word_list = tokenize(text)
        word_list = remove_stopwords(word_list)
        if "" in word_list:
            word_list.remove("")
        processed_text.append(word_list)
        global_word_list += word_list
    return processed_text, global_word_list

In [53]:
processed_text, global_word_list = process_corpora(X_data)
processed_text_test, _ = process_corpora(X_data_test)

In [54]:
len(global_word_list)

1075766

In [55]:
len(set(global_word_list))

72210

In [56]:
global_word_list[:2]

['a', 'sicrhau']

In [57]:
def count_words(word_list): 
    return Counter(word_list)

In [58]:
count_dict = Counter(global_word_list)

In [59]:
unique_word_list_train = []
word_frequency_train = [] 
for k, v in count_dict.items():
    unique_word_list_train.append(k)
    word_frequency_train.append(v)   

In [60]:
sorted_word_frequency, sorted_unique_word_list = zip(*sorted(zip(word_frequency_train, unique_word_list_train),reverse= False))

In [61]:
# list(np.array(sorted_word_frequency)<2)

cleaned_unique_word_list = [d for (d, remove) in zip(sorted_unique_word_list, list(np.array(sorted_word_frequency)<5)) if not remove]
cleaned_word_frequency = [d for (d, remove) in zip(sorted_word_frequency, list(np.array(sorted_word_frequency)<5)) if not remove]


In [62]:
print(len(cleaned_word_frequency))
print(len(cleaned_unique_word_list))

12199
12199


In [63]:
index_dict = {}
index_dict["[UKN]"] = 1 
index_dict["[PAD]"] = 0
counter = 2
for w in cleaned_unique_word_list: 
    index_dict[w] = counter 
    counter += 1 
    
# find the maximum number of the sequence 


In [64]:
maximum_val = 0 
for i in range(len(processed_text)):
    if maximum_val < len(processed_text[i]):
        maximum_val = len(processed_text[i])
print(maximum_val)

37


In [65]:
def pad_trim_seq(sequence, maximum_val):
    if len(sequence)> maximum_val: 
        return sequence[:maximum_val]
    elif len(sequence)< maximum_val:
        for _ in range(maximum_val - len(sequence)): 
            sequence.append("[PAD]")
        return sequence 
    else: 
        return sequence 
    

In [66]:
max_input_len = 40 

train_padded=[]
for i in range(len(processed_text)):
    train_padded.append(pad_trim_seq(processed_text[i], max_input_len))
    
test_padded=[]
for i in range(len(processed_text_test)):
    test_padded.append(pad_trim_seq(processed_text_test[i], max_input_len))

In [87]:
print("Print out some sample tokens after padding and trimming:")
print(test_padded[7][15:30])

Print out some sample tokens after padding and trimming:
['ti', 'gyda', 'reading', 'week', 'wthnos', 'nesaxxx', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [68]:
def encode(sequence, dictionary):
    encoded_list = []
    for i in range(len(sequence)):
        try:
            encoded_list.append(dictionary[sequence[i]])
        except:
            encoded_list.append(dictionary["[UKN]"])
    return encoded_list

In [69]:
encoded_train=[]
for i in range(len(train_padded)):
    encoded_train.append(encode(train_padded[i], index_dict))
    
encoded_test=[]
for i in range(len(test_padded)):
    encoded_test.append(encode(test_padded[i], index_dict))

In [70]:
X_train_encoded = np.array(encoded_train)
X_test_encoded = np.array(encoded_test)

In [72]:
words_number = len(index_dict)
words_number

12201

## Defining a simple LSTM model

In [73]:
tf.random.set_seed(11)

words_number = len(index_dict)
embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(words_number, embedding_vecor_length, input_length=max_input_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(32, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train_encoded, y_train, epochs=10, batch_size=64)


scores = model.evaluate(X_test_encoded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 40, 32)            390432    
                                                                 
 lstm_3 (LSTM)               (None, 64)                24832     
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 417,377
Trainable params: 417,377
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 79.42%


In [74]:
scores = model.evaluate(X_test_encoded, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 79.42%
