In [1]:
# Sudo Code 
# Data : 
    
# Method : 
# Word Embedding : Weighted average TF-IDF GLoVe Model
    
# observation


In [2]:

import pandas as pd

import torch
from torch import nn, optim
from torchtext.data.utils import get_tokenizer
import pytorch_lightning as pl
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
import torchmetrics

from torchtext import vocab
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os

from tqdm.notebook import tqdm

In [3]:
## Load pre-trained word embedding vector
glove = vocab.GloVe(name='6B', dim=300)

In [4]:
# load data 
df = pd.read_csv('data/Tweets.csv').dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [5]:
df.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
print(df.isnull().sum())
# drop nan
df.dropna(inplace=True)

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64


In [7]:
# Label target class
cat_id = {'neutral': 0, 
          'negative': 2, 
          'positive': 1}

df['class'] = df['sentiment'].map(cat_id)

In [8]:
import contractions
def expand_contractions(text):
    text = text.replace('`',"'")
    expanded_text = contractions.fix(text)
    return expanded_text


def text_cleanup(text):
    # Remove dates in the format "YYYY-MM-DD" or "DD/MM/YYYY" or "MM/DD/YYYY"
    text = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}/\d{2}/\d{4}\b", "", text)
    text = re.sub(r"\b\d{2}-\d{2}-\d{4}\b", "", text)

    # Remove times in the format "HH:MM" or "HH:MM:SS" or "HH:MM:SS.MS"
    text = re.sub(r"\b\d{2}:\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}:\d{2}:\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}:\d{2}:\d{2}\.\d{2,}\b", "", text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    
    # remove punctuations
    translator = str.maketrans('', '', '"#$%&\'()*+,-./;<=>@[\\]^_`{|}~')
    return text.translate(translator)


In [9]:
# def preprocessed_df(df):
#     tokenizer = get_tokenizer('basic_english')

#     df.text = df.text.map(expand_contractions)\
#                         .map(text_cleanup)\
#                         .map(tokenizer)
#     print(df.text.map(set).map(len).describe())
#     return df
# df[df.text.map(set).map(len) == 0]

In [10]:
class TweetDataset(Dataset):
    def __init__(self, data, target):
        self.data = data
        self.target = target

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.target[idx]

class TweetDataLoader(pl.LightningDataModule):
    def __init__(self, df, batch_size, num_workers=4):
        super(TweetDataLoader, self).__init__()
        self.data = df['text']
        self.target = torch.tensor(df['class'].values, dtype=torch.int64)
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        tokenizer = get_tokenizer('basic_english')

        tokens = self.data.map(expand_contractions)\
                        .map(text_cleanup)\
                        .map(tokenizer)
        tfidf = TfidfVectorizer()
        tfidf_vec = tfidf.fit_transform(tokens.map(' '.join)).toarray()
        features = tfidf.get_feature_names_out()
        tfdif_pd = pd.DataFrame(tfidf_vec,columns=features)
        word_imp = []
        for idx,sent in enumerate(tokens):
            word_imp.append({word : tfdif_pd[word][idx] for word in set(sent) if word in tfdif_pd.columns})
        self.embedding = [sum([value * glove[word] for word,value in document.items()])/sum(document.values()) if sum(document.values()) else glove['unknown_word'] for document in word_imp ]

    def setup(self, stage=None):
        self.prepare_data()
        vectorized_data = torch.stack(self.embedding)
        # Split the dataset
        train_size = int(0.6 * len(vectorized_data))
        val_size = int(0.2 * len(vectorized_data))
        test_size = len(vectorized_data) - (train_size + val_size)

        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            TweetDataset(vectorized_data, self.target),
            [train_size, val_size, test_size]
        )

    def train_dataloader(self):
        return DataLoader(
            dataset=self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
#             num_workers=self.num_workers,
#             pin_memory=True,
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
#             num_workers=self.num_workers,
#             pin_memory=True,
        )

    def test_dataloader(self):
        return DataLoader(
            dataset=self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
#             num_workers=self.num_workers,
#             pin_memory=True,
        )


In [11]:
batch_size = 128
num_workers = 8
ds = TweetDataLoader(df, batch_size, num_workers)

In [12]:
# # Only for inspecting data 
# ds.prepare_data()
# ds.setup('test')
# for data, label in ds.train_dataloader():
#     print(data.shape,label.shape)
#     break

In [20]:
class NN(pl.LightningModule):
    
    def __init__(self,input_shape,output_shape):
        super(NN,self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(input_shape,500)
        self.fc2 = nn.Linear(500,250)
        self.fc3 = nn.Linear(250,output_shape)
        self.accuracy = torchmetrics.Accuracy(task="multiclass",num_classes=output_shape)
        self.f1_score = torchmetrics.F1Score(task="multiclass",num_classes=output_shape)
        
        self.lr= 1e-3
        
    def forward(self,x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.log_softmax(self.fc3(x),dim=1)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss, x_hat, y = self._common_step(batch,batch_idx)
        accuracy,f1_score = self.accuracy(x_hat,y), self.f1_score(x_hat,y)
        
        self.log_dict({'train_loss':loss,
                      'train_accuracy':accuracy,
                      'train_f1score':f1_score},prog_bar=True,on_step=False,on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss, x_hat, y = self._common_step(batch,batch_idx)
        
        accuracy,f1_score = self.accuracy(x_hat,y), self.f1_score(x_hat,y)
        self.log_dict({'val_loss':loss,
                      'val_accuracy':accuracy,
                      'val_f1score':f1_score},prog_bar=True,on_step=False,on_epoch=True)
        
        return loss

    def test_step(self, batch, batch_idx):
        loss, x_hat, y = self._common_step(batch,batch_idx)
        
        accuracy,f1_score = self.accuracy(x_hat,y), self.f1_score(x_hat,y)
        self.log_dict({'test_loss':loss,
                      'test_accuracy':accuracy,
                      'test_f1score':f1_score},prog_bar=True,on_step=False,on_epoch=True)
        return loss

    def _common_step(self,batch,batch_index):
        x, y = batch
#         x = x.flatten(start_dim=1)
        x_hat = self.forward(x)
        loss = nn.functional.cross_entropy(x_hat,y)
        return loss , x_hat, y

    def predict_step(self,batch,batch_idx):
        x, y = batch
#         x = x.flatten(start_dim=1)
        x_hat = self.forward(x)
        pred = torch.argmax(x_hat,dim=1)
        return pred


    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr=self.lr)

In [21]:
# Hyperparametersa
input_shape = 300
output_shape = 3
learning_rate = 1e-3

model = NN(input_shape,output_shape)


In [23]:

logger = pl.loggers.TensorBoardLogger(save_dir='./log/', name='glove', version=0.1)

profiler = pl.profilers.PyTorchProfiler(
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/',),
    schedule=torch.profiler.schedule(skip_first=10, wait=10, warmup=1, active=2)
)

# saves top-K checkpoints based on "val_loss" metric
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    save_top_k=1,
#     save_last=True,
    monitor="val_loss",
    mode="max",
    dirpath="checkpoints/glove/",
    filename="{epoch}-{val_f1score:.3f}",
    verbose = True,
)


trainer = pl.Trainer(
    logger=logger,
    accelerator='auto',
    devices=[0],
    min_epochs=10,
    max_epochs=500,
    precision='16-mixed',
#     enable_model_summary=True,
#     profiler=profiler,
    callbacks=[checkpoint_callback,
               pl.callbacks.EarlyStopping('val_loss',mode='min',patience=20,verbose=True,min_delta=0.00)],
    enable_checkpointing  = True,
)

if os.path.exists(checkpoint_callback.dirpath):
    best_checkpoint_filename = os.listdir(checkpoint_callback.dirpath)
else: 
    best_checkpoint_filename = None

if best_checkpoint_filename:
    print('Loading model from checkpoints : ',best_checkpoint_filename[0])
    trainer.fit(model, ds, ckpt_path=os.path.join(checkpoint_callback.dirpath, best_checkpoint_filename[0]))
else : 
    trainer.fit(model,datamodule=ds)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Loading model from checkpoints :  epoch=10-val_f1score=0.602.ckpt


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
Restoring states from the checkpoint path at /Users/pranavjha/Library/CloudStorage/GoogleDrive-pranajh7@gmail.com/My Drive/Projects/applied_theories/sentiment analysis/checkpoints/glove/epoch=10-val_f1score=0.602.ckpt

  | Name     | Type               | Params
------------------------------------------------
0 | fc1      | Linear             | 150 K 
1 | fc2      | Linear             | 125 K 
2 | fc3      | Linear             | 753   
3 | accuracy | MulticlassAccuracy | 0     
4 | f1_score | MulticlassF1Score  | 0     
------------------------------------------------
276 K     Trainable params
0         Non-trainable params
276 K     Total params
1.106     Total estimated model params size (MB)
Restored all states from the checkpoint at /Users/pranavjha/Library/CloudStorage/GoogleDrive-pranajh7@gmail.com/My Drive/Projects/applied_theories/sentiment analysis/checkpoints/glove/epoch=10-val_f1score=0.602.ckpt


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.196 >= min_delta = 0.0. New best score: 0.611
Epoch 11, global step 1548: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 1677: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 1806: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 1935: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 2064: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 2193: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 2322: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 7 records. Best score: 0.611. Signaling Trainer to stop.
Epoch 18, global step 2451: 'val_loss' was not in top 1


In [None]:
trainer.validate(model, datamodule = ds);

In [31]:
trainer.test(model, datamodule=ds);

  rank_zero_warn(


Testing: 0it [00:00, ?it/s]



In [6]:
!tensorboard --logdir="./log/glove/"

^C
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/pranavjha/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
Traceback (most recent call last):
  File "/Users/pranavjha/anaconda3/bin/tensorboard", line 8, in <module>
    sys.exit(run_main())
  File "/Users/pranavjha/anaconda3/lib/python3.10/site-packages/tensorboard/main.py", line 46, in run_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/pranavjha/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 125, in _main
    prepare(preparation_data)
  File "/Users/pranavjha/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 236, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "/Users/pranavjha/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
    app.run(tensorboard.main, flags_parser=tensorboard.configure)
  File "/Users/pranavjha/anaconda3/lib/python3.10/site-packages/absl

In [26]:
import torch
torch.load('/Users/pranavjha/Library/CloudStorage/GoogleDrive-pranajh7@gmail.com/My Drive/Projects/applied_theories/sentiment analysis/checkpoints/ohe/epoch=9-val_f1score=0.683.ckpt')

{'epoch': 9,
 'global_step': 1290,
 'pytorch-lightning_version': '2.0.3',
 'state_dict': OrderedDict([('fc1.weight',
               tensor([[-1.5610e-02,  2.5858e-03, -5.4323e-32,  ..., -1.9498e-29,
                         4.2148e-29,  1.9831e-04],
                       [-1.5386e-02, -8.3821e-03, -2.2433e-03,  ..., -4.6120e-33,
                         5.3142e-04, -4.2521e-03],
                       [ 5.6590e-04,  7.0596e-06, -5.7698e-18,  ...,  5.5298e-23,
                        -3.3125e-06,  1.4006e-04],
                       ...,
                       [ 1.5866e-04,  2.4055e-10,  3.3845e-23,  ..., -1.9895e-23,
                        -3.0473e-05, -7.0479e-05],
                       [-3.4253e-04,  8.4176e-06,  1.6359e-03,  ...,  3.6487e-16,
                        -1.3511e-14,  6.4651e-04],
                       [-3.5185e-04, -2.2445e-03, -2.8573e-26,  ..., -1.0210e-31,
                         4.5169e-24,  7.6417e-07]], device='mps:0')),
              ('fc1.bias',
           