In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers==4.18.0
!pip install numpy==1.19.5
!pip install torch==1.7.1
!pip install scikit-learn==0.20.3

In [None]:
import numpy as np
import pandas as pd 
from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the preprocessed CSV and discard every row that contains a missing value.
df = pd.read_csv('/kaggle/input/gooddata/Preprocessed.csv').dropna()

# Replace the raw values in the 'label' column with integer class ids.
# The fitted encoder is kept so the ids can be mapped back to the original labels.
label_encoder = LabelEncoder().fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
df

In [None]:
# Split dataset into training, validation, and testing sets:
# 10% of all rows are held out for testing, then 10% of the remaining rows for
# validation. The fixed random_state keeps both splits reproducible.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42)

# Load the FinBERT pretrained model.
# The size of the classification head is taken from the fitted label encoder
# instead of being hard-coded, so it always matches the number of distinct
# labels actually present in the data.
model = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-pretrain',
    num_labels=len(label_encoder.classes_),
)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

# Prepare datasets for fine-tuning: wrap each pandas split in a `datasets.Dataset`.
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)


In [None]:
# Tokenize datasets
def tokenize_data(example):
    """Tokenize the 'text' field with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        example: Mapping with a 'text' entry. When used with
            `Dataset.map(..., batched=True)` this is a batch, so 'text'
            holds a list of strings rather than a single string.

    Returns:
        The tokenizer output for the given text(s).
    """
    texts = example['text']
    return tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Apply the tokenizer to each split, processing the rows in batches.
dataset_train, dataset_val, dataset_test = [
    split.map(tokenize_data, batched=True)
    for split in (dataset_train, dataset_val, dataset_test)
]

In [None]:
# Set dataset format
# Columns that are exposed as torch tensors; all other columns are hidden from the output.
model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
for split in (dataset_train, dataset_val, dataset_test):
    # Pass a fresh list each time so the splits do not share one list object.
    split.set_format(type='torch', columns=list(model_columns))

In [None]:
# Define training options and metrics
def compute_metrics(eval_pred):
    """Compute the accuracy reported during evaluation.

    Args:
        eval_pred: A pair `(predictions, labels)`. `predictions` is reduced
            with an argmax over axis 1 -- presumably per-class scores of shape
            (n_samples, n_classes); confirm against the model output.

    Returns:
        dict: A single-entry mapping `{'accuracy': <score>}`. The key matches
        the `metric_for_best_model` value used in the training arguments.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, so the
    # value is unchanged, but the correct order keeps this safe to extend with
    # asymmetric metrics such as precision or recall.
    return {'accuracy': accuracy_score(labels, predicted_ids)}

# Fine-tuning configuration, collected in one mapping and unpacked into TrainingArguments.
training_options = {
    # Where checkpoints are written.
    'output_dir': 'temp/',
    # Evaluate and checkpoint on the same schedule (once per epoch) and reload
    # the checkpoint with the highest 'accuracy' when training finishes.
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    # Optimisation hyperparameters.
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_train_epochs': 5,
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 32,
}
args = TrainingArguments(**training_options)

In [None]:
# Build the Trainer: the model is fitted on the training split and scored on
# the validation split with `compute_metrics`.
trainer_options = dict(
    model=model,
    args=args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()