In [15]:
# Pseudocode
# Data : 
    
# Method : 
# Word Embedding : Sentence embedding 
    
# observation
# TODO: Tune the network - validation loss increases while training loss keeps decreasing (overfitting)


In [16]:

import pandas as pd

import torch
from torch import nn, optim
import pytorch_lightning as pl
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import torchmetrics
import os

from sentence_transformers import SentenceTransformer

from tqdm.notebook import tqdm

In [17]:
# Read the tweet CSV, then discard every row that has a missing value.
raw_tweets = pd.read_csv('data/Tweets.csv')
df = raw_tweets.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [18]:
# Preview the first five rows (DataFrame.head defaults to n=5).
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [19]:
# Report how many missing values each column holds.
null_counts = df.isna().sum()
print(null_counts)
# Remove any row that still contains a missing value.
df.dropna(inplace=True)

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64


In [20]:
# Integer id assigned to each sentiment label.
cat_id = dict(neutral=0, negative=2, positive=1)

# Add the numeric target column derived from the sentiment label.
df['class'] = df['sentiment'].map(cat_id)

In [27]:
class TweetDataset(Dataset):
    """Map-style dataset pairing each item of ``data`` with the item of ``target`` at the same index."""

    def __init__(self, data, target):
        # Both containers are stored as given and indexed in parallel.
        self.data, self.target = data, target

    def __len__(self):
        # The dataset size is defined by the feature container.
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        # Return a (features, label) pair for position ``idx``.
        features = self.data[idx]
        label = self.target[idx]
        return features, label

class TweetDataLoader(pl.LightningDataModule):
    """Lightning data module that embeds tweet text and serves train/val/test loaders.

    The ``text`` column of ``df`` is encoded with a SentenceTransformer model and
    the ``class`` column is used as the int64 target. The encoded samples are
    split 60% / 20% / 20% into train / validation / test subsets.

    Args:
        df: DataFrame with a ``text`` column and an integer ``class`` column.
        batch_size: Batch size used by every DataLoader.
        num_workers: Stored on the instance; it is currently NOT forwarded to the
            DataLoaders (they run in the main process).
        seed: Seed for the split generator, so the same split is produced on
            every run and for every stage.
    """

    def __init__(self, df, batch_size, num_workers=4, seed=42):
        super(TweetDataLoader, self).__init__()
        self.data = df['text'].values
        self.target = torch.tensor(df['class'].values, dtype=torch.int64)
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.seed = seed
        self.vectorizer = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        # Filled in by the first call to setup().
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def prepare_data(self):
        """Nothing to download or pre-process on disk."""
        pass

    def setup(self, stage=None):
        """Encode the text and create the train/val/test subsets exactly once.

        Lightning calls this hook for every stage (fit, validate, test, ...).
        Re-running it would re-encode the whole corpus and, with an unseeded
        random split, produce a different partition per stage - letting training
        samples leak into the validation/test subsets. The guard below keeps the
        first split for the lifetime of the object.
        """
        if self.train_dataset is not None:
            return

        embeddings = self.vectorizer.encode(self.data)

        # Split the dataset: 60% train, 20% validation, remainder test.
        n_samples = len(embeddings)
        train_size = int(0.6 * n_samples)
        val_size = int(0.2 * n_samples)
        test_size = n_samples - (train_size + val_size)

        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            TweetDataset(embeddings, self.target),
            [train_size, val_size, test_size],
            # Seeded generator -> reproducible split across runs.
            generator=torch.Generator().manual_seed(self.seed),
        )

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over ``dataset`` with the configured batch size."""
        return DataLoader(
            dataset=dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Shuffled loader over the training subset."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation subset."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Sequential loader over the test subset."""
        return self._make_loader(self.test_dataset, shuffle=False)


In [28]:
# Build the data module that feeds the trainer.
batch_size, num_workers = 128, 8
ds = TweetDataLoader(df, batch_size=batch_size, num_workers=num_workers)

In [29]:
# # Only for inspecting data 
# ds.prepare_data()
# ds.setup('test')
# for data, label in ds.train_dataloader():
#     print(data.shape,label.shape)
#     break


In [30]:
class NN(pl.LightningModule):
    """Fully connected classifier: input -> 1000 -> 500 -> ``output_shape`` classes.

    ``forward`` returns per-class log-probabilities (log-softmax over dim 1).

    Args:
        input_shape: Size of each input feature vector.
        output_shape: Number of target classes.
        weight_decay: Weight decay passed to the Adam optimizer.
        lr: Learning rate passed to the Adam optimizer.
    """

    def __init__(self,input_shape,output_shape,weight_decay=1e-3,lr=1e-3):
        super(NN,self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_shape,1000)
        self.fc2 = nn.Linear(1000,500)
        self.fc3 = nn.Linear(500,output_shape)
        self.accuracy = torchmetrics.Accuracy(task="multiclass",num_classes=output_shape)
        # NOTE(review): the task-wrapper F1Score appears to default to micro
        # averaging, which for single-label multiclass data equals accuracy -
        # consider average="macro"; verify against the installed torchmetrics.
        self.f1_score = torchmetrics.F1Score(task="multiclass",num_classes=output_shape)
        self.weight_decay = weight_decay
        self.lr = lr

    def forward(self,x):
        """Return log-probabilities for each class."""
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.log_softmax(self.fc3(x),dim=1)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log epoch-level metrics."""
        loss, x_hat, y = self._common_step(batch,batch_idx)
        self._log_metrics('train', loss, x_hat, y)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and log epoch-level metrics."""
        loss, x_hat, y = self._common_step(batch,batch_idx)
        self._log_metrics('val', loss, x_hat, y)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute the loss for one test batch and log epoch-level metrics."""
        loss, x_hat, y = self._common_step(batch,batch_idx)
        self._log_metrics('test', loss, x_hat, y)
        return loss

    def _log_metrics(self, stage, loss, x_hat, y):
        """Log ``<stage>_loss``, ``<stage>_accuracy`` and ``<stage>_f1score``."""
        accuracy, f1_score = self.accuracy(x_hat,y), self.f1_score(x_hat,y)
        self.log_dict({f'{stage}_loss':loss,
                       f'{stage}_accuracy':accuracy,
                       f'{stage}_f1score':f1_score},
                      prog_bar=True,on_step=False,on_epoch=True)

    def _common_step(self,batch,batch_index):
        """Run the forward pass on a batch and return ``(loss, log_probs, targets)``."""
        x, y = batch
        x_hat = self.forward(x)
        # forward() already applies log_softmax, so the matching criterion is
        # nll_loss; cross_entropy would apply log_softmax a second time.
        loss = nn.functional.nll_loss(x_hat,y)
        return loss , x_hat, y

    def predict_step(self,batch,batch_idx):
        """Return the predicted class index for every sample in the batch."""
        x, y = batch
        x_hat = self.forward(x)
        pred = torch.argmax(x_hat,dim=1)
        return pred

    def configure_optimizers(self):
        """Adam over all parameters with the configured lr and weight decay."""
        return optim.Adam(self.parameters(),lr=self.lr,weight_decay=self.weight_decay)

In [31]:
# Hyperparameters
input_shape, output_shape = 768, 3   # input feature size, number of classes
batch = 512
num_epoch = 2
learning_rate = 1e-3

# Build the classifier from the dimensions above.
model = NN(input_shape, output_shape)


In [32]:

# TensorBoard logger for this experiment (written under ./log/sent_emb/).
logger = pl.loggers.TensorBoardLogger(save_dir='./log/', name='sent_emb', version=0.1)

# Optional profiler; it only takes effect if passed to the Trainer below.
profiler = pl.profilers.PyTorchProfiler(
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/',),
    schedule=torch.profiler.schedule(skip_first=10, wait=10, warmup=1, active=2)
)

# Keep the single best checkpoint according to "val_loss".
# A loss must be minimised: mode="max" would retain the WORST checkpoint.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    save_top_k=1,
#     save_last=True,
    monitor="val_loss",
    mode="min",
    dirpath="checkpoints/sent_emb/",
    filename="{epoch}-{val_f1score:.3f}",
)


trainer = pl.Trainer(
    logger=logger,
    accelerator='auto',
    devices=[0],
    min_epochs=10,
    max_epochs=500,
    precision='16-mixed',
#     enable_model_summary=True,
#     profiler=profiler,
    callbacks=[checkpoint_callback,
               pl.callbacks.EarlyStopping('val_loss',mode='min',patience=5,verbose=True,min_delta=0.00)],
    enable_checkpointing  = True,
)

# Look for an existing checkpoint to resume from. Only *.ckpt files count
# (the directory may contain unrelated files such as .DS_Store), and the most
# recently written one comes first instead of relying on os.listdir order.
ckpt_dir = checkpoint_callback.dirpath
if os.path.isdir(ckpt_dir):
    best_checkpoint_filename = sorted(
        (name for name in os.listdir(ckpt_dir) if name.endswith('.ckpt')),
        key=lambda name: os.path.getmtime(os.path.join(ckpt_dir, name)),
        reverse=True,
    )
else:
    best_checkpoint_filename = None

if best_checkpoint_filename:
    print('Loading model from checkpoints : ',best_checkpoint_filename[0])
    trainer.fit(model, datamodule=ds, ckpt_path=os.path.join(ckpt_dir, best_checkpoint_filename[0]))
else :
    trainer.fit(model,datamodule=ds)

trainer.validate(model, datamodule=ds);


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name     | Type               | Params
------------------------------------------------
0 | fc1      | Linear             | 769 K 
1 | fc2      | Linear             | 500 K 
2 | fc3      | Linear             | 1.5 K 
3 | accuracy | MulticlassAccuracy | 0     
4 | f1_score | MulticlassF1Score  | 0     
------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.084     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  tp = tp.sum(dim=0 if multidim_average == "global" else 1)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.727


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.717


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 0.709


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.705


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.705. Signaling Trainer to stop.


Validation: 0it [00:00, ?it/s]

In [33]:
# Evaluate the trained model on the held-out test split.
trainer.test(model, datamodule=ds);

[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [14]:
!tensorboard --logdir="./log/ohe/"

I0802 09:45:25.138777 6208024576 plugin.py:429] Monitor runs begin
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
