### Import Modules

In [None]:
!pip install -q transformers

In [None]:
!pip install -q datasets pytorch_lightning 

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data Preparation

In [None]:
### Data Source : https://www.researchgate.net/publication/251231364_FinancialPhraseBank-v10/link/0c96051eee4fb1d56e000000/download
!unzip FinancialPhraseBank-v1.0.zip

In [42]:
file_path = 'FinancialPhraseBank-v1.0/Sentences_75Agree.txt'
df = pd.read_csv(file_path, delimiter='@',encoding = "ISO-8859-1",index_col=None)
df.columns = ['text','sentiment']
df.head()

Unnamed: 0,text,sentiment
0,With the new production plant the company woul...,positive
1,"For the last quarter of 2010 , Componenta 's n...",positive
2,"In the third quarter of 2010 , net sales incre...",positive
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive


In [43]:
train_df, test_df = train_test_split(df,random_state=20,test_size=0.2) 

### Arrow Dataset Preparation

In [51]:
labels = list(set(train_df['sentiment']))
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print(id2label)

{0: 'neutral', 1: 'negative', 2: 'positive'}


### Prepare the dataloader

In [None]:
from transformers import CanineTokenizer

tokenizer = CanineTokenizer.from_pretrained("google/canine-s")



In [54]:
train_ds = train_df['text'].map(lambda examples: tokenizer(examples, padding="max_length", truncation=True),)

In [56]:
test_ds = test_df['text'].map(lambda examples: tokenizer(examples, padding="max_length", truncation=True),)

In [None]:
train_ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask'])
test_ds.set_format(type="torch", columns=['input_ids', 'token_type_ids', 'attention_mask'])

#train_ds = train_ds.rename_column(original_column_name="label", new_column_name="labels")
#test_ds = test_ds.rename_column(original_column_name="label", new_column_name="labels")