In [1]:
pip install transformers datasets torch -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers[torch] -q

zsh:1: no matches found: transformers[torch]
Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset

dataset = load_dataset('NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed')
dataset

DatasetDict({
    train: Dataset({
        features: ['symbol', 'publishedDate', 'title', 'image', 'site', 'text', 'url', 'sentiment', 'sentimentScore'],
        num_rows: 142000
    })
})

In [4]:
def preprocess_dataset(dataset):
    dataset = dataset.map(
        lambda example: {'label': example['sentiment'], 'sentence': example['text']},
        remove_columns=['symbol', 'publishedDate', 'title', 'image', 'site', 'url', 'sentimentScore', 'text', 'sentiment']
    )
    return dataset

processed_dataset = preprocess_dataset(dataset)

processed_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'sentence'],
        num_rows: 142000
    })
})

In [5]:
# Define the label mapping
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}

def map_labels(example):
    example["label"] = label_mapping[example["label"]]
    return example

processed_dataset = processed_dataset.map(map_labels)

processed_dataset['train'][0]

{'label': 0, 'sentence': 'RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) -- The law firm of Kessler Topaz Meltzer & Check, LLP (www.ktmc.com) informs investors that a securities class action lawsuit has been filed in the United States District Court for the Northern District of California against Hawaiian Electric Industries, Inc. ("Hawaiian Electric") (NYSE:HE). The action charges Hawaiian Electric with violations of the federal securities laws, including omissions and fraudulent misrepresentations relating to the compa...'}


In [6]:
import pandas as pd

df = pd.DataFrame(processed_dataset['train'])
df

Unnamed: 0,label,sentence
0,0,"RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) --..."
1,2,"PHILADELPHIA, Oct. 04, 2023 (GLOBE NEWSWIRE) -..."
2,2,BlackBerry Ltd (NYSE: BB) shares are trading h...
3,2,"VANCOUVER, British Columbia, Oct. 04, 2023 (GL..."
4,2,"TORONTO, Oct. 04, 2023 (GLOBE NEWSWIRE) -- AGF..."
...,...,...
141995,2,"SAN DIEGO, Aug. 12, 2022 (GLOBE NEWSWIRE) -- a..."
141996,2,"FORT LAUDERDALE, Fla., Aug. 12, 2022 (GLOBE NE..."
141997,2,"CHATTANOOGA, Tenn., Aug. 12, 2022 (GLOBE NEWSW..."
141998,2,"SINGAPORE, Aug. 12, 2022 (GLOBE NEWSWIRE) -- G..."


In [7]:
from datasets import DatasetDict

# Split the dataset into 80% train and 20% test
train_test_split = processed_dataset['train'].train_test_split(test_size=0.2)

# Create a new dataset dictionary
train_dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

train_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'sentence'],
        num_rows: 113600
    })
    test: Dataset({
        features: ['label', 'sentence'],
        num_rows: 28400
    })
})


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [9]:
def preprocess_function(examples):
    return tokenizer(
        examples['sentence'], 
        truncation=True, 
        padding="max_length", 
        max_length=128 
    )

tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/113600 [00:00<?, ? examples/s]

Map:   0%|          | 0/28400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'sentence', 'input_ids', 'attention_mask'],
        num_rows: 113600
    })
    test: Dataset({
        features: ['label', 'sentence', 'input_ids', 'attention_mask'],
        num_rows: 28400
    })
})

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)



In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics
)

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3185,0.294045
2,0.2817,0.298364
3,0.2501,0.302255
4,0.1864,0.38053
5,0.1791,0.409072


TrainOutput(global_step=71000, training_loss=0.2511198636981803, metrics={'train_runtime': 2594.9733, 'train_samples_per_second': 218.885, 'train_steps_per_second': 27.361, 'total_flos': 1.8810706065408e+16, 'train_loss': 0.2511198636981803, 'epoch': 5.0})

In [18]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)  # Convert logits to class predictions
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

results = trainer.evaluate()
results

{'eval_loss': 0.4090724587440491, 'eval_model_preparation_time': 0.0016, 'eval_accuracy': 0.9171126760563381, 'eval_runtime': 19.7016, 'eval_samples_per_second': 1441.506, 'eval_steps_per_second': 180.188}


In [14]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [15]:
model.push_to_hub("msr2903/mrm8488-distilroberta-fine-tuned-financial-sentiment")
tokenizer.push_to_hub("msr2903/mrm8488-distilroberta-fine-tuned-financial-sentiment")

print("Model uploaded to Hugging Face Hub!")

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model uploaded to Hugging Face Hub!
