<a href="https://colab.research.google.com/github/Ibrahim-Maiga/Datasets/blob/main/Pre_trained_transformer_model_from_Hugging_Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load the data
url = 'https://raw.githubusercontent.com/Ibrahim-Maiga/Datasets/main/stock_data.csv'
data = pd.read_csv(url)

# Clean the text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['Sentiment'] = data['Sentiment'].replace(-1, 0)
data['cleaned_text'] = data['Text'].apply(clean_text)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['Sentiment'], test_size=0.2, random_state=42)

# Initialize the pipeline
pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

# Classify the sentiment of the test data
results = pipe(X_test.to_list())

# Convert results to binary labels
preds = [1 if result['label'] == 'LABEL_1' else 0 for result in results]

# Evaluate the performance
accuracy = accuracy_score(y_test, preds)
print(f'Pipeline Accuracy: {accuracy}')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

# Tokenize the data
train_encodings = tokenizer(X_train.to_list(), truncation=True, padding=True, max_length=50)
test_encodings = tokenizer(X_test.to_list(), truncation=True, padding=True, max_length=50)

# Convert data to torch tensors
train_labels = torch.tensor(y_train.values, dtype=torch.long)
test_labels = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), train_labels)
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), test_labels)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set the model to training mode
model.train()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(3):  # Training for 3 epochs
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Set the model to evaluation mode
model.eval()

# Evaluate the model
preds = []
true_labels = []
eval_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = outputs.loss
        eval_loss += loss.item()
        preds.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

# Calculate accuracy and average loss
accuracy = accuracy_score(true_labels, preds)
avg_loss = eval_loss / len(test_loader)

print(f'Accuracy: {accuracy}')
print(f'Average Loss: {avg_loss}')


Pipeline Accuracy: 0.3684210526315789


100%|██████████| 290/290 [20:28<00:00,  4.23s/it]
100%|██████████| 290/290 [20:19<00:00,  4.20s/it]
100%|██████████| 290/290 [20:17<00:00,  4.20s/it]
100%|██████████| 73/73 [01:26<00:00,  1.18s/it]

Accuracy: 0.8153580672993961
Average Loss: 0.456393631124129





In [18]:
from sklearn.metrics import accuracy_score, recall_score
# Calculate accuracy and average loss
accuracy = accuracy_score(true_labels, preds)
avg_loss = eval_loss / len(test_loader)

# Calculate recall
recall = recall_score(true_labels, preds)

print(f'Accuracy: {accuracy}')
print(f'Average Loss: {avg_loss}')
print(f'Recall: {recall}')

Accuracy: 0.8153580672993961
Average Loss: 0.456393631124129
Recall: 0.8565573770491803
