In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


# Load your dataset
df = pd.read_csv(r"C:\Users\Hannan\Downloads\stock_news.csv")
print(df['label'].unique())
# Example: mapping string labels to integers
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df['label'] = df['label'].map(label_map)

# Keep only headline and label columns
df = df[['headline', 'label']].dropna()

# Split into train/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)



tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

def tokenize(example):
    return tokenizer(example['headline'], truncation=True, padding='max_length', max_length=64)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)



model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", num_labels=3)


# Define compute metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1": f1_score(p.label_ids, preds, average="weighted")
    }

# Training configuration
training_args = TrainingArguments(
    output_dir="./finbert-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs'
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

trainer.save_model("./finbert-finetuned")
tokenizer.save_pretrained("./finbert-finetuned")


['Negative' 'Neutral' 'Positive']


Map:   0%|          | 0/20800 [00:00<?, ? examples/s]

Map:   0%|          | 0/5200 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,0.7894
1000,0.6467
1500,0.6183
2000,0.6186
2500,0.6014
3000,0.4995
3500,0.4744
4000,0.4399
4500,0.4649
5000,0.43


('./finbert-finetuned\\tokenizer_config.json',
 './finbert-finetuned\\special_tokens_map.json',
 './finbert-finetuned\\vocab.txt',
 './finbert-finetuned\\added_tokens.json',
 './finbert-finetuned\\tokenizer.json')

In [2]:
from transformers import pipeline

# Load pipeline using your saved model and tokenizer
sentiment_analyzer = pipeline(
    "text-classification",
    model="./finbert-finetuned",
    tokenizer="./finbert-finetuned"
)
result = sentiment_analyzer("Stock market is crashing due to inflation.")
print(result)
import pandas as pd

# Example dataframe
df = pd.DataFrame({
    'headline': [
        "Nvidia stock surges to new highs",
        "Oil prices drop sharply amid global tensions",
        "Investors cautious as recession fears grow"
    ]
})

# Apply sentiment model to each row
def analyze(text):
    result = sentiment_analyzer(text)[0]
    return pd.Series([result['label'], result['score']])

df[['sentiment', 'confidence']] = df['headline'].apply(analyze)

print(df)


Device set to use cuda:0


[{'label': 'positive', 'score': 0.9957154393196106}]
                                       headline sentiment  confidence
0              Nvidia stock surges to new highs   neutral    0.995116
1  Oil prices drop sharply amid global tensions  positive    0.994545
2    Investors cautious as recession fears grow  positive    0.992186
