In [1]:
!pip install datasets



In [11]:
import pandas as pd
import numpy as np

from torchsummary import summary
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm
from datasets import Dataset
import random

from sklearn.metrics import f1_score

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
SEED = 0xDEAD
random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

In [13]:
#!pip install torchsummary

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

#### Data

In [15]:
train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,text,label
0,?ÂThe past 20 years have been a rollercoaster...,Neutral
1,I figured it out COVID 19 is a conspiracy form...,Negative
2,Local law enforcement agencies are working tog...,Neutral
3,This whole stay indoors or no gathering is stu...,Negative
4,@narendramodi Sir I am not panic respect &amp;...,Positive


In [16]:
valid = pd.read_csv('valid_data.csv')
valid.head()

Unnamed: 0,text,label
0,So panic buying of soaps and toilet roll was b...,Extremely Negative
1,I would place a large amount of money this is ...,Neutral
2,"""Saudi Arabia is bracing for an economic downt...",Neutral
3,#Foodsecurity #coronavirus #covid19 \r\r\n1.Sm...,Extremely Positive
4,I d like to know who is stockpiling eggs chees...,Extremely Positive


In [17]:
test = pd.read_csv('test.csv')[['Text']].rename(columns={"Text": "text"})
test.head()

Unnamed: 0,text
0,TRENDING: New Yorkers encounter empty supermar...
1,When I couldn't find hand sanitizer at Fred Me...
2,Find out how you can protect yourself and love...
3,#Panic buying hits #NewYork City as anxious sh...
4,#toiletpaper #dunnypaper #coronavirus #coronav...


#### Preprocess

In [20]:
# Convert the stop words to a set for faster lookup
stopwords_set = set(nltk.corpus.stopwords.words('english'))
# Define a function to clean and process the text
def clean_text(text):
    # Remove all characters that are not Latin letters
    #text = re.sub(r'[^a-zA-Z ]+', '', text)
    # Convert the text to lower case
    text = text.lower()
    # Split the text into words
    #words = text.split()
    # Remove stop words
    #words = [word for word in words if word not in stopwords_set]
    # Join the words back into a single string
    cleaned_text = text# " ".join(words)
    return cleaned_text

def maping(x):
    if x=='Neutral':
        return 0
    elif x=='Positive':
        return 1
    elif x=='Extremely Positive':
        return 2
    elif x=='Negative':
        return 3
    else:
        return 4

In [21]:
# Apply the clean_text function to the "text" column of the articles table
train["text"] = train["text"].apply(clean_text)
train.label = train.label.apply(lambda x: maping(x))

valid["text"] = valid["text"].apply(clean_text)
valid.label = valid.label.apply(lambda x: maping(x))

test["text"] = test["text"].apply(clean_text)

#### Tokenize

In [22]:
tokenizer = nltk.WordPunctTokenizer()
max_token_length = train['text'].apply(lambda x: len(nltk.word_tokenize(x))).mean()
max_token_length

37.292917021018106

In [23]:
MAX_LENGTH = 72 # max_token_length + 2

def tokenize_text(item):
    text = item["text"]
    tokenized_text = tokenizer.tokenize(text)[:MAX_LENGTH]
    return {"tokenized": tokenized_text}

# Convert to Hugging Face Dataset
train = Dataset.from_pandas(train)
valid = Dataset.from_pandas(valid)
test = Dataset.from_pandas(test)

train = train.map(tokenize_text)
valid = valid.map(tokenize_text)
test = test.map(tokenize_text)

Map:   0%|          | 0/32924 [00:00<?, ? examples/s]

Map:   0%|          | 0/8231 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [24]:
train.to_pandas()

Unnamed: 0,text,label,tokenized
0,?âthe past 20 years have been a rollercoaster...,0,"[?, â, , the, past, 20, years, have, been, a,..."
1,i figured it out covid 19 is a conspiracy form...,3,"[i, figured, it, out, covid, 19, is, a, conspi..."
2,local law enforcement agencies are working tog...,0,"[local, law, enforcement, agencies, are, worki..."
3,this whole stay indoors or no gathering is stu...,3,"[this, whole, stay, indoors, or, no, gathering..."
4,@narendramodi sir i am not panic respect &amp;...,1,"[@, narendramodi, sir, i, am, not, panic, resp..."
...,...,...,...
32919,minnesota classifies grocery store workers as ...,3,"[minnesota, classifies, grocery, store, worker..."
32920,us senator @ewarren has asked for information ...,3,"[us, senator, @, ewarren, has, asked, for, inf..."
32921,register for our upcoming @sourcingjournal #we...,1,"[register, for, our, upcoming, @, sourcingjour..."
32922,my wife got laid off yesterday because the sma...,0,"[my, wife, got, laid, off, yesterday, because,..."


#### Digitize

In [25]:
import gensim.downloader as api
word2vec = api.load('glove-wiki-gigaword-200')#('glove-wiki-gigaword-200')#("fasttext-wiki-news-subwords-300")#("glove-twitter-25")#'glove-wiki-gigaword-300')#("glove-twitter-50")#
embed_size = 200

In [26]:
word2idx = {word: idx for idx, word in enumerate(word2vec.index_to_key)}

In [27]:
def encode(word):
    if word in word2idx.keys():
        return word2idx[word]
    return word2idx["unk"]

In [28]:
train = train.map(
    lambda item: {
        "features": [encode(word) for word in item["tokenized"]]
    }, remove_columns=["text", "tokenized"]
)

Map:   0%|          | 0/32924 [00:00<?, ? examples/s]

In [29]:
valid = valid.map(
    lambda item: {
        "features": [encode(word) for word in item["tokenized"]]
    }, remove_columns=["text", "tokenized"]
)

Map:   0%|          | 0/8231 [00:00<?, ? examples/s]

In [30]:
test = test.map(
    lambda item: {
        "features": [encode(word) for word in item["tokenized"]]
    }, remove_columns=["text", "tokenized"]
)

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [31]:
train.to_pandas().head()

Unnamed: 0,label,features
0,0,"[188, 157800, 201534, 0, 341, 324, 82, 33, 51,..."
1,3,"[41, 8154, 20, 66, 201534, 904, 14, 7, 4642, 1..."
2,0,"[250, 264, 2500, 1848, 32, 500, 600, 4, 1508, ..."
3,3,"[37, 1115, 1087, 12944, 46, 84, 3487, 14, 8979..."
4,1,"[17527, 201534, 2699, 41, 913, 36, 6114, 1983,..."


In [32]:
train.set_format(type='torch')#,  device="cuda")
valid.set_format(type='torch')#,  device="cuda")

In [33]:
def custom_collate_fn(batch):
    labels = torch.tensor([item["label"] for item in batch])
    features = [item["features"] for item in batch]

    # Pad sequences to the maximum length in the batch
    max_seq_length = max(len(seq) for seq in features)
    padded_features = torch.zeros(len(features), max_seq_length, dtype=torch.long)

    for i, seq in enumerate(features):
        padded_features[i, :len(seq)] = torch.tensor(seq)

    return {
        "label": labels,
        "features": padded_features
    }

In [43]:
batch_size = 32
train_loader = DataLoader(train, batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=True)
valid_loader = DataLoader(valid, batch_size=batch_size, collate_fn=custom_collate_fn)

loaders = {'train': train_loader, 'valid': valid_loader}

In [44]:
train.shape

(32924, 2)

#### Model

In [79]:
class Seq2SeqClassifier(nn.Module):
    def __init__(self, embedding_matrix, num_classes, hidden_size):
        super(Seq2SeqClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.encoder = nn.LSTM(input_size=embedding_matrix.shape[1], hidden_size=hidden_size, num_layers=2, dropout=0.5, batch_first=True, bidirectional=True)
        self.self_attention = nn.MultiheadAttention(embed_dim=2*hidden_size, num_heads=16)
        self.classifier = nn.Linear(2*hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        encoder_outputs, _ = self.encoder(embedded)
        attn_output, _ = self.self_attention(encoder_outputs.transpose(0, 1), encoder_outputs.transpose(0, 1), encoder_outputs.transpose(0, 1))
        output = self.classifier(attn_output.mean(dim=0))
        return output

# Instantiate the model
embedding_matrix = torch.from_numpy(word2vec.vectors).to(device)
model = Seq2SeqClassifier(embedding_matrix, num_classes=5, hidden_size=256).to(device)


In [80]:
embedding_matrix.shape

torch.Size([400000, 200])

In [81]:
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

In [86]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, amsgrad=False)#RMSprop(model.parameters(), lr=0.001)#
scheduler = StepLR(optimizer, step_size=2, gamma=0.1, verbose=True)
#scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min', patience=0, verbose=True, factor=0.1)
#optim.Adam(model.parameters(), lr=0.001)
#scheduler = ReduceLROnPlateau(optimizer, 'min')

Adjusting learning rate of group 0 to 1.0000e-03.


In [87]:
def freeze_embeddings(model, req_grad=False):
    embeddings = model.embedding
    for c_p in embeddings.parameters():
        c_p.requires_grad = req_grad

In [88]:
# Train the model
num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    train_pbar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
    for batch in train_pbar:
        inputs = batch['features'].to(device)
        labels = batch['label'].to(device)

        if epoch < 13:
            freeze_embeddings(model, True)
        else:
            freeze_embeddings(model)


        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_pbar.set_postfix({'Training Loss': loss.item()})

    # Validation
    model.eval()
    all_preds = []
    all_labels = []
    valid_loss = 0
    valid_pbar = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}")
    with torch.no_grad():
        for batch in valid_pbar:
            inputs = batch['features'].to(device)
            labels = batch['label'].to(device)
            outputs = model(inputs)
            valid_loss += criterion(outputs, labels)
            preds = torch.softmax(outputs,dim=1).argmax(dim=1)#torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    f1 = f1_score(all_labels, all_preds, average='macro')
    average_val_loss=valid_loss / len(valid_loader)

    #scheduler.step(average_val_loss)
    scheduler.step()
    print(f'Epoch {epoch+1}, Validation Loss: {average_val_loss}, Validation F1 Score: {f1}')
    #print(f'Lerning rate: {scheduler.get_last_lr()[0]}')
    #print(optimizer.param_groups[0]['lr'])


  padded_features[i, :len(seq)] = torch.tensor(seq)
Training Epoch 1: 100%|██████████| 1029/1029 [00:20<00:00, 51.43it/s, Training Loss=0.578]
Validation Epoch 1: 100%|██████████| 258/258 [00:01<00:00, 176.95it/s]


Adjusting learning rate of group 0 to 1.0000e-03.
Epoch 1, Validation Loss: 0.48604562878608704, Validation F1 Score: 0.8270981717297372


  padded_features[i, :len(seq)] = torch.tensor(seq)
Training Epoch 2: 100%|██████████| 1029/1029 [00:19<00:00, 51.54it/s, Training Loss=0.412]
Validation Epoch 2: 100%|██████████| 258/258 [00:01<00:00, 174.73it/s]


Adjusting learning rate of group 0 to 1.0000e-04.
Epoch 2, Validation Loss: 0.4974519610404968, Validation F1 Score: 0.8279556154521485


  padded_features[i, :len(seq)] = torch.tensor(seq)
Training Epoch 3: 100%|██████████| 1029/1029 [00:19<00:00, 51.58it/s, Training Loss=0.116]
Validation Epoch 3: 100%|██████████| 258/258 [00:01<00:00, 175.92it/s]


Adjusting learning rate of group 0 to 1.0000e-04.
Epoch 3, Validation Loss: 0.44814857840538025, Validation F1 Score: 0.863437375449789


  padded_features[i, :len(seq)] = torch.tensor(seq)
Training Epoch 4: 100%|██████████| 1029/1029 [00:19<00:00, 51.77it/s, Training Loss=0.139]
Validation Epoch 4: 100%|██████████| 258/258 [00:01<00:00, 175.71it/s]

Adjusting learning rate of group 0 to 1.0000e-05.
Epoch 4, Validation Loss: 0.47709256410598755, Validation F1 Score: 0.8622769774623326





#### Inference

In [89]:
def collate_fn_test(batch):
    max_len = max(len(row["features"]) for row in batch)
    input_embeds = torch.empty((len(batch), max_len), dtype=torch.long, device=device)  # Move the tensor to the GPU

    for idx, row in enumerate(batch):
        to_pad = max_len - len(row["features"])
        input_embeds[idx] = torch.cat((row["features"], torch.zeros(to_pad).to(device=device)))

    return {"features": input_embeds}

In [90]:
device

'cuda'

In [91]:
test.set_format("torch", device=device)
test_loader = DataLoader(test, batch_size=32, collate_fn=collate_fn_test)

In [92]:
predictions = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['features']#.to('cuda')  # Move input_ids to the GPU
        logits = model(input_ids).to('cpu')
        predicted_class = torch.argmax(logits, dim=1)
        predictions.extend(predicted_class.cpu().numpy())

print(predictions)

[3, 1, 2, 3, 0, 0, 1, 3, 4, 2, 1, 4, 4, 2, 1, 3, 1, 0, 1, 2, 3, 2, 2, 1, 4, 1, 3, 4, 3, 1, 3, 2, 4, 4, 3, 4, 3, 3, 3, 2, 1, 4, 3, 1, 1, 2, 2, 3, 4, 4, 4, 0, 3, 1, 3, 3, 3, 2, 1, 0, 4, 4, 2, 4, 1, 1, 2, 2, 1, 1, 4, 2, 1, 3, 0, 4, 3, 3, 1, 1, 4, 4, 2, 3, 1, 3, 3, 0, 3, 1, 3, 3, 1, 3, 3, 2, 0, 2, 4, 0, 1, 0, 0, 1, 0, 2, 1, 3, 3, 3, 3, 1, 2, 0, 2, 3, 3, 2, 3, 0, 1, 1, 3, 3, 4, 4, 1, 2, 3, 4, 3, 1, 4, 4, 4, 1, 2, 3, 2, 3, 2, 2, 0, 3, 1, 1, 3, 3, 3, 1, 3, 2, 1, 3, 0, 3, 3, 3, 1, 4, 1, 1, 1, 0, 0, 1, 3, 1, 1, 2, 3, 2, 0, 1, 2, 1, 0, 1, 3, 0, 0, 2, 0, 1, 3, 2, 0, 1, 1, 3, 2, 1, 1, 2, 3, 1, 0, 1, 2, 3, 3, 1, 1, 2, 1, 3, 3, 2, 0, 1, 4, 3, 0, 1, 1, 3, 3, 0, 0, 4, 0, 1, 3, 3, 1, 1, 1, 4, 1, 4, 4, 1, 3, 0, 0, 1, 2, 0, 0, 1, 2, 1, 3, 3, 3, 4, 0, 3, 0, 3, 0, 0, 1, 2, 1, 1, 2, 1, 2, 4, 3, 2, 4, 2, 3, 0, 1, 3, 1, 1, 0, 2, 0, 4, 2, 2, 3, 4, 2, 3, 2, 0, 1, 1, 4, 1, 1, 0, 4, 4, 3, 3, 3, 4, 3, 2, 1, 0, 1, 0, 0, 4, 4, 3, 3, 2, 4, 4, 4, 3, 3, 1, 3, 3, 4, 3, 1, 4, 2, 1, 0, 3, 4, 3, 3, 2, 2, 3, 3, 1, 0, 1, 0, 

In [93]:
Id = pd.read_csv('test.csv')[['id']]
predictions_ = pd.DataFrame(predictions, columns=['Sentiment'])
predictions = pd.concat([Id, predictions_], axis=1)

def maping(x):
    if x==0:
        return 'Neutral'
    elif x==1:
        return 'Positive'
    elif x==2:
        return 'Extremely Positive'
    elif x==3:
        return 'Negative'
    else:
        return 'Extremely Negative'

predictions.Sentiment = predictions.Sentiment.apply(lambda x: maping(x))

In [95]:
predictions.to_csv('Predictions.csv', index=False)
predictions = pd.read_csv('Predictions.csv')
predictions

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Negative
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Neutral
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Positive
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Negative
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Neutral
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Extremely Negative
