In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import nltk
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import missingno as msno
from collections import Counter
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### General Configuration

In [116]:
Batch_size = 32
vocab_size = len(vocab) +1
embed_size = 100
hidden_size = 128
output_size = 2 
learning_rate = 0.001
n_epochs = 30
n_epochs = 30

In [117]:
nltk.download('stopword')
from nltk.corpus import stopwords
stopwords=stopwords.words('english')

[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index


In [146]:
cleaned_text[1]

'esi low bk real possibl'

In [118]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [119]:
data['sentiment_numeric'] = data['Sentiment'].map({'positive': 1, 'neutral': 1, 'negative': 0})


### Text Cleaning

In [120]:
class TextPreprocessor:
    def __init__(self, stop_words):
        self.stop_words = stop_words
        self.stemmer = PorterStemmer()

    def to_lowercase(self, text: str) -> str:
        return text.lower()

    def remove_urls(self, text: str) -> str:
        url_pattern = re.compile(r'http\S+|www\S+')
        return url_pattern.sub('', text)
    
    
    def remove_non_english_characters(self, text: str) -> str:
        english_char_pattern = re.compile(r'[^a-zA-Z\s]')
        return english_char_pattern.sub('', text)
    
    def remove_special_characters(self, text: str) -> str:
        special_char_pattern = re.compile(r'[^\w\s\u0600-\u06FF]')
        return special_char_pattern.sub('', text)

    def remove_punctuation(self, text: str) -> str:
        punctuation_pattern = re.compile(r'[^\w\s]')
        return punctuation_pattern.sub('', text)
    
    def remove_numbers(self, text: str) -> str:
        number_pattern = re.compile(r'\d+')
        return number_pattern.sub('', text)


    def remove_stop_words(self, text: str) -> str:
        return " ".join([word for word in text.split() if word not in self.stop_words])

    def remove_consecutive_duplicates(self , text: str) -> str:
        return re.sub(r'\b(\w+)(\s+\1)+\b', r'\1', text)
    
    
    def remove_multiple_spaces(self, text: str) -> str:
        space_pattern = re.compile(r'\s+')
        return space_pattern.sub(' ', text)
    
    def stem_text(self, text: str) -> str:
        return " ".join([self.stemmer.stem(word) for word in text.split()])

In [121]:
preprocessor = TextPreprocessor(stopwords)

In [122]:
cleaned_text = []
for sample in data['Sentence']:
    sample = preprocessor.to_lowercase(sample)
    sample = preprocessor.remove_urls(sample)
    sample = preprocessor.remove_non_english_characters(sample)
    sample = preprocessor.remove_special_characters(sample)
    sample = preprocessor.remove_punctuation(sample)
    sample = preprocessor.remove_numbers(sample)
    sample = preprocessor.remove_stop_words(sample)
    sample= preprocessor.remove_consecutive_duplicates(sample)
    sample = preprocessor.remove_stop_words(sample)
    sample = preprocessor.remove_multiple_spaces(sample)
    sample = preprocessor.stem_text(sample)
    cleaned_text.append(sample)

In [123]:
cleaned_text[50],data['Sentence'][50]

('six breweri record percent growth domest beer sale last year million liter million liter sold',
 'The six breweries recorded a 5.2 percent growth in domestic beer sales last year to 270.21 million liters , from 256.88 million liters sold in 2005 .')

### Building Vocabulary

In [124]:
tok = Tokenizer(oov_token='UNK')
tok.fit_on_texts(cleaned_text)
vocab = tok.word_index

In [125]:
vocab

{'UNK': 1,
 'eur': 2,
 'compani': 3,
 'mn': 4,
 'oper': 5,
 'sale': 6,
 'profit': 7,
 'finnish': 8,
 'share': 9,
 'said': 10,
 'net': 11,
 'million': 12,
 'year': 13,
 'period': 14,
 'market': 15,
 'group': 16,
 'mln': 17,
 'finland': 18,
 'quarter': 19,
 'total': 20,
 'product': 21,
 'servic': 22,
 'new': 23,
 'euro': 24,
 'busi': 25,
 'oyj': 26,
 'first': 27,
 'loss': 28,
 'increas': 29,
 'compar': 30,
 'also': 31,
 'report': 32,
 'today': 33,
 'develop': 34,
 'price': 35,
 'includ': 36,
 'helsinki': 37,
 'contract': 38,
 'correspond': 39,
 'bank': 40,
 'invest': 41,
 'manufactur': 42,
 'unit': 43,
 'expect': 44,
 'solut': 45,
 'per': 46,
 'result': 47,
 'stock': 48,
 'decreas': 49,
 'percent': 50,
 'corpor': 51,
 'order': 52,
 'industri': 53,
 'manag': 54,
 'provid': 55,
 'system': 56,
 'financi': 57,
 'plant': 58,
 'custom': 59,
 'technolog': 60,
 'accord': 61,
 'hel': 62,
 'nokia': 63,
 'project': 64,
 'build': 65,
 'capit': 66,
 'mobil': 67,
 'valu': 68,
 'well': 69,
 'use': 70,


In [126]:
sequences = tok.texts_to_sequences(cleaned_text)

### Padding

In [127]:
padded_sequences = pad_sequences(sequences, maxlen=36, padding='post')

In [128]:
padded_sequences[0]

array([2856,   60, 2310, 1080, 1541,   45,   55,  297,  107, 1542,   60,
        130,  844,  297, 1543, 2311,  508,   23,  223,  393,  248,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])

###  Data Preparing

In [129]:
x = padded_sequences

In [130]:
y = data['sentiment_numeric'] 

In [131]:
x = torch.tensor(x)
y = torch.tensor(y)

In [132]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=42)

In [133]:
print(f'The shape of X train{x_train.shape}')
print(f'The shape of Y train{y_train.shape}')
print(f'The shape of X test{x_test.shape}')
print(f'The shape of Y test{y_test.shape}')

The shape of X traintorch.Size([4673, 36])
The shape of Y traintorch.Size([4673])
The shape of X testtorch.Size([1169, 36])
The shape of Y testtorch.Size([1169])


In [134]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [135]:
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)

In [136]:
train_loader = DataLoader(train_dataset, batch_size=Batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=Batch_size, shuffle=False)

### Model Building 

In [137]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers=2, bidirectional=True):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.rnn = nn.LSTM(embed_size, hidden_size, n_layers, bidirectional=bidirectional, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.dropout(x[:, -1, :])
        x = self.fc(x)
        return x

In [138]:
model = SentimentRNN(vocab_size, embed_size, hidden_size, output_size)

In [139]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [140]:
for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for sentences, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(sentences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    avg_loss = epoch_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss}, Accuracy: {accuracy}')


Epoch 1/30, Loss: 0.4287048636042342, Accuracy: 0.853413224909052
Epoch 2/30, Loss: 0.43034961404038125, Accuracy: 0.853413224909052
Epoch 3/30, Loss: 0.41900701618113484, Accuracy: 0.853413224909052
Epoch 4/30, Loss: 0.4139324803108058, Accuracy: 0.853413224909052
Epoch 5/30, Loss: 0.42820571606256524, Accuracy: 0.853413224909052
Epoch 6/30, Loss: 0.41908180546395635, Accuracy: 0.853413224909052
Epoch 7/30, Loss: 0.41765376646705227, Accuracy: 0.853413224909052
Epoch 8/30, Loss: 0.4173009353024619, Accuracy: 0.853413224909052
Epoch 9/30, Loss: 0.41850297619290905, Accuracy: 0.853413224909052
Epoch 10/30, Loss: 0.4081646597811154, Accuracy: 0.853413224909052
Epoch 11/30, Loss: 0.3924109770529935, Accuracy: 0.853413224909052
Epoch 12/30, Loss: 0.4062117485164785, Accuracy: 0.853413224909052
Epoch 13/30, Loss: 0.4144514066224195, Accuracy: 0.8527712390327413
Epoch 14/30, Loss: 0.40645869735146867, Accuracy: 0.853413224909052
Epoch 15/30, Loss: 0.405981052084034, Accuracy: 0.8534132249090

In [141]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for sentences, labels in test_loader:
        outputs = model(sentences)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8502994011976048
