<a href="https://colab.research.google.com/github/Ibrahim-Maiga/Datasets/blob/main/Pre_trained_transformer_model_from_Hugging_Face0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the data
url = 'https://raw.githubusercontent.com/Ibrahim-Maiga/Datasets/main/stock_data.csv'

data = pd.read_csv(url)
data['Sentiment'] = data['Sentiment'].replace(-1, 0)

# Clean the text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['cleaned_text'] = data['Text'].apply(clean_text)
data['Sentiment'] = data['Sentiment'].replace(-1, 0)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['Sentiment'], test_size=0.2, random_state=42)


In [15]:
data.head(20)

Unnamed: 0,Text,Sentiment,cleaned_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie return for the feageed indicat...
2,user I'd be afraid to short AMZN - they are lo...,1,user id be afraid to short amzn they are look...
3,MNTA Over 12.00,1,mnta over
4,OI Over 21.37,1,oi over
5,PGNX Over 3.04,1,pgnx over
6,AAP - user if so then the current downtrend wi...,0,aap user if so then the current downtrend wil...
7,Monday's relative weakness. NYX WIN TIE TAP IC...,0,mondays relative weakness nyx win tie tap ice ...
8,GOOG - ower trend line channel test & volume s...,1,goog ower trend line channel test volume sup...
9,AAP will watch tomorrow for ONG entry.,1,aap will watch tomorrow for ong entry


In [17]:
# Initialize the pipeline
pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

# Classify the sentiment of the test data
results = pipe(X_test.to_list())

# Convert results to binary labels
preds = [1 if result['label'] == 'LABEL_1' else 0 for result in results]

# Evaluate the performance
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy}')



Accuracy: 0.3684210526315789


In [18]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

# Tokenize the data
train_encodings = tokenizer(X_train.to_list(), truncation=True, padding=True, max_length=50)
test_encodings = tokenizer(X_test.to_list(), truncation=True, padding=True, max_length=50)

# Convert data to torch tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), train_labels)
test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), test_labels)

# Create data loaders
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [19]:
from transformers import AdamW
from tqdm import tqdm

# Set the model to training mode
model.train()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Training for 3 epochs
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Set the model to evaluation mode
model.eval()

# Evaluate the model
preds = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

# Calculate accuracy
accuracy = accuracy_score(true_labels, preds)
print(f'Accuracy: {accuracy}')


100%|██████████| 290/290 [18:47<00:00,  3.89s/it]
100%|██████████| 290/290 [18:53<00:00,  3.91s/it]
100%|██████████| 290/290 [18:37<00:00,  3.85s/it]
100%|██████████| 73/73 [01:18<00:00,  1.07s/it]

Accuracy: 0.8265746333045729



