### Import Modules

In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 4.9 MB 30.4 MB/s 
[K     |████████████████████████████████| 163 kB 75.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 66.6 MB/s 
[?25h

In [None]:
!pip install -q datasets pytorch_lightning 

[K     |████████████████████████████████| 432 kB 25.3 MB/s 
[K     |████████████████████████████████| 708 kB 67.3 MB/s 
[K     |████████████████████████████████| 212 kB 92.7 MB/s 
[K     |████████████████████████████████| 115 kB 75.4 MB/s 
[K     |████████████████████████████████| 127 kB 68.6 MB/s 
[K     |████████████████████████████████| 529 kB 70.1 MB/s 
[K     |████████████████████████████████| 5.9 MB 58.0 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.2+zzzcolab20220929150707 requires tensorboard<2.9,>=2.8, but you have tensorboard 2.10.1 which is incompatible.[0m
[?25h

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data Preparation

In [None]:
### Data Source : https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10/link/0c96051eee4fb1d56e000000/download
!unzip FinancialPhraseBank-v1.0.zip

Archive:  FinancialPhraseBank-v1.0.zip
   creating: FinancialPhraseBank-v1.0/
  inflating: FinancialPhraseBank-v1.0/License.txt  
   creating: __MACOSX/
   creating: __MACOSX/FinancialPhraseBank-v1.0/
  inflating: __MACOSX/FinancialPhraseBank-v1.0/._License.txt  
  inflating: FinancialPhraseBank-v1.0/README.txt  
  inflating: __MACOSX/FinancialPhraseBank-v1.0/._README.txt  
  inflating: FinancialPhraseBank-v1.0/Sentences_50Agree.txt  
  inflating: FinancialPhraseBank-v1.0/Sentences_66Agree.txt  
  inflating: FinancialPhraseBank-v1.0/Sentences_75Agree.txt  
  inflating: FinancialPhraseBank-v1.0/Sentences_AllAgree.txt  


In [None]:
file_path = 'FinancialPhraseBank-v1.0/Sentences_75Agree.txt'
df = pd.read_csv(file_path, delimiter='@',encoding = "ISO-8859-1",index_col=None)
df.columns = ['text','sentiment']
df.head()

Unnamed: 0,text,sentiment
0,With the new production plant the company woul...,positive
1,"For the last quarter of 2010 , Componenta 's n...",positive
2,"In the third quarter of 2010 , net sales incre...",positive
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive


In [None]:
#le = preprocessing.LabelEncoder()
#df['sentiment'] = le.fit_transform(df['sentiment'])

In [None]:
df = df.reset_index(drop=True)

In [None]:
train_df, test_df = train_test_split(df,random_state=20,test_size=0.2) 

In [None]:
train_df = train_df.reset_index(drop=True)

In [None]:
test_df = test_df.reset_index(drop=True)

### Arrow Dataset Preparation

In [None]:
labels = list(set(train_df['sentiment']))
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print(id2label)

{0: 'positive', 1: 'neutral', 2: 'negative'}


### Prepare the dataloader

In [None]:
from transformers import CanineTokenizer

tokenizer = CanineTokenizer.from_pretrained("google/canine-s")



In [None]:
train_ds = train_df['text'].map(lambda examples: tokenizer(examples, padding="max_length", truncation=True))

In [None]:
a = tokenizer.encode(train_df['text'][0])

In [None]:
tokenizer.decode(a)

'[CLS]UPM is talking to Myllykoski+ó s creditor banks -- Nordea ( STO : NDA ), Nordic Investment Bank and Danske Bank+ó s ( CPH : DANSKE ) Sampo Bank -- over a deal, the paper said.[SEP]'

In [None]:
test_ds = test_df['text'].map(lambda examples: tokenizer(examples, padding="max_length", truncation=True))

In [None]:
#train_ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'token_type_ids'])
#test_ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask'])

#train_ds = train_ds.rename_column(original_column_name="label", new_column_name="labels")
#test_ds = test_ds.rename_column(original_column_name="label", new_column_name="labels")

In [None]:
train_ds = train_ds.reset_index(drop=True)

In [None]:
test_ds = test_ds.reset_index(drop=True)

In [None]:
for i in range(len(train_ds)):
  train_ds[i]['label'] = train_df.loc[i,"sentiment"]

In [None]:
for i in range(len(test_ds)):
  test_ds[i]['label'] = test_df.loc[i,"sentiment"]

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_ds ,batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=4)

In [None]:
batch = next(iter(train_dataloader))

In [None]:
tokenizer.decode(batch['label'][2])

ValueError: ignored

In [None]:
batch['label'][2]

'neutral'

### Define Model

In [None]:
import pytorch_lightning as pl
from transformers import CanineForSequenceClassification, AdamW
import torch.nn as nn

class Classifier(pl.LightningModule):
    def __init__(self, num_labels=10):
        super(Classifier, self).__init__()
        self.model = CanineForSequenceClassification.from_pretrained('google/canine-s', 
                                                                     num_labels=len(labels),
                                                                     id2label=id2label,
                                                                     label2id=label2id)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                             labels=labels)

        return outputs
        
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        logits = outputs.logits

        predictions = logits.argmax(-1)
        correct = (predictions == batch['label']).sum().item()
        accuracy = correct/batch['input_ids'].shape[0]

        return loss, accuracy
      
    def training_step(self, batch, batch_idx):
        loss, accuracy = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)
        self.log("training_accuracy", accuracy)

        return loss
    
    def validation_step(self, batch, batch_idx):
        loss, accuracy = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)
        self.log("validation_accuracy", accuracy, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss, accuracy = self.common_step(batch, batch_idx)     

        return loss

    def configure_optimizers(self):
        # We could make the optimizer more fancy by adding a scheduler and specifying which parameters do
        # not require weight_decay but just using AdamW out-of-the-box works fine
        return AdamW(self.parameters(), lr=5e-5)

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return test_dataloader

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping

model = Classifier()

trainer = Trainer(callbacks=[EarlyStopping(monitor='validation_loss')])
trainer.fit(model)

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | CanineForSequenceClassification | 132 M 
----------------------------------------------------------
132 M     Trainable params
0         Non-trainable params
132 M     Total params
528.341   Total estimated model par

Sanity Checking: 0it [00:00, ?it/s]

AttributeError: ignored