<a href="https://colab.research.google.com/github/GeorgeShishkanov/university_project/blob/main/Text_classification_with_LSTM_Shishkanov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd                                           
import numpy as np                                            
import matplotlib.pyplot as plt                               
import seaborn as sns                                          
import torch                                                  
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer                
from torchtext import vocab
from sklearn.model_selection import train_test_split     
from sklearn.metrics import accuracy_score                    
from sklearn.preprocessing import LabelEncoder
from nltk import word_tokenize                                
from nltk.tokenize import WordPunctTokenizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import random                                                 
from tqdm.notebook import tqdm                                
import time                                                    
import os                                                      
import re

In [None]:
# Path to a pre-trained GloVe file.
# NOTE(review): the GloVe loader further down reads the 300d file, not this
# 100d path, and this constant is not referenced in the visible cells - confirm
# which file is intended.
GLOVE_TEXT_PATH = '/content/glove.6B.100d.txt'
# NOTE(review): the visible training cell uses its own `n_epochs`; this constant
# appears unused - confirm before relying on it.
EPOCH = 5
SEED = 42

# Seed every random number generator the notebook relies on so that random
# embedding initialisation, model weight initialisation and DataLoader shuffling
# give the same result after "Restart & Run All".
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Data / model hyper-parameters shared by the cells below.
BATCH_SIZE = 64             # samples per mini-batch in the DataLoaders
MAX_VOCAB_SIZE = 250_000    # cap on tokenizer vocabulary and embedding rows
NUM_CLASSES = 5             # number of sentiment classes
debug = 0                   # truthy -> use random embeddings instead of loading GloVe

In [None]:
# Load the raw CSV files; the two reads are independent of each other.
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

In [None]:
# Drop every training row that contains at least one missing value.
# The defaults are spelled out so the row-wise / any-NaN behaviour is explicit.
train_data.dropna(axis=0, how='any', inplace=True)

In [None]:
def remove_special_characters(text, remove_digits=True):
    """Strip URLs and non-alphanumeric characters from ``text``.

    Args:
        text: The string to clean.
        remove_digits: When True (default) digits are removed as well; when
            False only characters other than ASCII letters, digits and
            whitespace are removed.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # Raw string so `\S` is a regex escape rather than an invalid Python escape;
    # `www\.` matches a literal dot instead of any character.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # `A-Z`, not `A-z`: the latter range also spans `[ \ ] ^ _` and the
    # backtick, so those characters used to survive the clean-up.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text).strip()

In [None]:
# Clean every training text with remove_special_characters, then preview the frame.
train_data['Text'] = train_data['Text'].map(remove_special_characters)
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,MeNyrbie Phil_Gahan Chrisitv and and,Neutral
1,1,advice Talk to your neighbours family to excha...,Positive
2,2,Coronavirus Australia Woolworths to give elder...,Positive
3,3,My food stock is not the only one which is emp...,Positive
4,4,Me ready to go at supermarket during the COVID...,Extremely Negative


In [None]:
# Apply the same text cleaning to the test set so both splits share one format.
test_data['Text'] = test_data['Text'].map(remove_special_characters)
test_data.head()

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING New Yorkers encounter empty supermark...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldnt find hand sanitizer at Fred Mey...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Panic buying hits NewYork City as anxious shop...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,toiletpaper dunnypaper coronavirus coronavirus...


In [None]:
# Two-column view of the training data with the columns renamed to Sentence / Label.
sentimental_data = train_data[['Text', 'Sentiment']].rename(
    columns={'Text': 'Sentence', 'Sentiment': 'Label'}
)
sentimental_data.head()

Unnamed: 0,Sentence,Label
0,MeNyrbie Phil_Gahan Chrisitv and and,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia Woolworths to give elder...,Positive
3,My food stock is not the only one which is emp...,Positive
4,Me ready to go at supermarket during the COVID...,Extremely Negative


In [None]:
# Stratified 80/20 train/validation split, reproducible through SEED.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['Text'],
    train_data['Sentiment'],
    test_size=0.2,
    random_state=SEED,
    stratify=train_data['Sentiment'],
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((32924,), (8231,), (32924,), (8231,))

In [None]:
# Build one vocabulary over the train, validation and test texts, then turn each
# split into lists of integer token ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts([*X_train, *X_val, *test_data.Text])
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, test_data.Text)
)

In [None]:
# Sequence length (in tokens) passed as `maxlen` to pad_sequences for every split.
MAX_TEXT_LEN = 280

In [None]:
# Bring every token sequence of each split to the common length MAX_TEXT_LEN.
X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=MAX_TEXT_LEN)
    for sequences in (X_train, X_val, X_test)
)

In [None]:
# Learn the label -> integer mapping on the training labels only, then encode
# both label vectors with that same mapping.
le = LabelEncoder().fit(y_train)
y_train, y_val = le.transform(y_train), le.transform(y_val)
y_train

array([0, 1, 0, ..., 1, 2, 1])

In [None]:
def load_glove(word_index, embedding_file='/content/glove.6B.300d.txt', max_vocab_size=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Args:
        word_index: Mapping of word -> integer row index.
        embedding_file: Path to a GloVe-format text file; every line holds a
            token followed by its space-separated vector components.
        max_vocab_size: Upper bound on the number of rows in the matrix.
            Defaults to the module-level ``MAX_VOCAB_SIZE``.

    Returns:
        A NumPy array with ``min(max_vocab_size, len(word_index) + 1)`` rows and
        one column per embedding dimension. Rows whose word (or its capitalised
        form) is missing from the GloVe file keep their random normal values.

    Raises:
        ValueError: If no vectors could be read from ``embedding_file``.
    """
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB_SIZE

    # Read the vectors inside a context manager so the handle is always closed,
    # and with an explicit encoding so the result is platform independent.
    embeddings_index = {}
    with open(embedding_file, encoding='utf-8') as glove_file:
        for line in glove_file:
            parts = line.rstrip().split(' ')
            if len(parts) < 2:
                continue  # blank or malformed line
            # Keep at most the first 300 components, as before.
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')[:300]

    if not embeddings_index:
        raise ValueError('no embedding vectors found in {}'.format(embedding_file))

    # Only the dimensionality is needed; stacking every vector into one big
    # array just to read its shape wasted memory and relied on passing a dict
    # view to np.stack.
    embed_size = len(next(iter(embeddings_index.values())))
    # Hard-coded mean / std used for the random initialisation of unseen words.
    emb_mean, emb_std = -0.005838499, 0.48782197

    nb_words = min(max_vocab_size, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalised spelling of the word.
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
# In debug mode skip the GloVe load and use a random matrix instead;
# otherwise build the matrix from the tokenizer's vocabulary.
embedding_matrix = (
    np.random.randn(MAX_VOCAB_SIZE,300)
    if debug
    else load_glove(tokenizer.word_index)
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Number of rows (vocabulary entries) and columns (vector size) of the embedding matrix.
max_features, embed_size = embedding_matrix.shape

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pre-trained embeddings.

    The LSTM outputs are average- and max-pooled over dimension 1, concatenated,
    passed through a 64-unit ReLU layer with dropout and projected to one logit
    per class.

    Args:
        embedding_weights: 2-D array-like (e.g. NumPy array) of shape
            (vocab_size, embed_dim). Defaults to the notebook-level
            ``embedding_matrix``.
        n_classes: Number of output classes. Defaults to
            ``len(le.classes_)`` from the notebook-level label encoder.
        hidden_size: Hidden units per LSTM direction.
        dropout: Dropout probability applied after the hidden linear layer.
    """

    def __init__(self, embedding_weights=None, n_classes=None, hidden_size=64, dropout=0.1):
        super().__init__()
        # Fall back to the notebook globals so a bare `BiLSTM()` keeps working.
        if embedding_weights is None:
            embedding_weights = embedding_matrix
        if n_classes is None:
            n_classes = len(le.classes_)

        weights = torch.tensor(embedding_weights, dtype=torch.float32)
        embed_dim = weights.shape[1]

        self.hidden_size = hidden_size
        # Pre-trained vectors are kept frozen (requires_grad=False).
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.lstm = nn.LSTM(embed_dim, self.hidden_size, bidirectional=True, batch_first=True)
        # 2 directions x 2 pooled summaries (mean + max) = 4 * hidden_size inputs.
        self.linear = nn.Linear(self.hidden_size * 4, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(64, n_classes)

    def forward(self, x):
        """Return unnormalised class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; the LSTM is batch_first, so the
                first dimension is the batch and the second the sequence.

        Returns:
            Tensor with one row of logits per input sequence.
        """
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        # Summarise the sequence dimension with both mean and max pooling.
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        conc = torch.cat((avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        return self.out(conc)

In [None]:
# Training setup: epoch count, model, summed cross-entropy loss and an Adam
# optimizer restricted to the parameters that still require gradients.
n_epochs = 3
model = BiLSTM()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=0.001,
)

In [None]:
# Convert the padded id matrices and the encoded labels to int64 tensors.
X_train, X_val, X_test = (
    torch.tensor(ids, dtype=torch.long) for ids in (X_train, X_val, X_test)
)
y_train, y_val = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_val)
)

In [None]:
# Pair each feature tensor with its label tensor.
train, valid = (
    torch.utils.data.TensorDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Mini-batch iterators: training batches are shuffled, validation order is kept
# fixed so stored predictions line up with y_val.
train_loader = torch.utils.data.DataLoader(dataset=train, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = torch.utils.data.DataLoader(dataset=valid, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Per-epoch history of the reported training / validation losses.
train_loss = []
valid_loss = []

# Run every batch on whichever device the model's parameters live on.
device = next(model.parameters()).device

for epoch in range(n_epochs):
    start_time = time.time()

    # ---- training pass ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x_batch), y_batch)
        loss.backward()
        optimizer.step()
        # Reported value is the mean over batches of the per-batch loss.
        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass (no parameter updates) ----
    model.eval()
    avg_val_loss = 0.
    val_preds = np.zeros((len(X_val), len(le.classes_)))
    offset = 0
    # no_grad avoids building autograd graphs that are never used.
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch)
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            # Explicit dim: softmax over the class dimension of each row.
            probs = F.softmax(y_pred, dim=1).cpu().numpy()
            # Store by running offset so the slice does not depend on the
            # loader's batch size matching a global constant.
            val_preds[offset:offset + len(probs)] = probs
            offset += len(probs)

    # Fraction of validation samples whose arg-max class equals the label.
    predicted_classes = torch.as_tensor(val_preds.argmax(axis=1))
    val_accuracy = (predicted_classes == y_val.cpu()).float().mean().item()

    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))

  val_preds[i * BATCH_SIZE : (i+1) * BATCH_SIZE] = F.softmax(y_pred).cpu().numpy()


Epoch 1/3 	 loss=81.8445 	 val_loss=68.4772  	 val_acc=0.5477  	 time=317.63s
Epoch 2/3 	 loss=63.9401 	 val_loss=61.8585  	 val_acc=0.5925  	 time=316.02s
Epoch 3/3 	 loss=54.8411 	 val_loss=56.8303  	 val_acc=0.6494  	 time=311.00s


In [None]:
# Load the submission template; its Sentiment field is overwritten with the
# model's predictions further down.
sample_submission = pd.read_csv('/content/sample_submission.csv')

In [None]:
# Inference on the test set: switch to eval mode explicitly (instead of relying
# on the state left by the training cell), disable autograd, and feed the data
# in BATCH_SIZE chunks so the whole set is never held in one forward pass.
model.eval()
device = next(model.parameters()).device
with torch.no_grad():
    pred = torch.cat([model(chunk.to(device)) for chunk in torch.split(X_test, BATCH_SIZE)])

In [None]:
# Turn logits into per-class probabilities; `dim=1` is given explicitly because
# calling softmax without a dimension is deprecated and emits a warning.
pred = F.softmax(pred, dim=1).cpu().numpy()

  pred = F.softmax(pred).cpu().numpy()


In [None]:
# Index of the highest-probability class for every row.
pred = pred.argmax(axis=1)

In [None]:
# Map the integer class indices back to the label values stored in the LabelEncoder.
pred = le.classes_[pred]

In [None]:
# Use item assignment: attribute-style assignment only updates an existing
# column and would otherwise set a plain Python attribute, leaving the CSV
# without the predictions.
sample_submission['Sentiment'] = pred

In [None]:
# Write the filled-in template to submission.csv without the DataFrame index.
sample_submission.to_csv('submission.csv', index=False)