In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ankurzing/sentiment-analysis-for-financial-news?dataset_version_number=5...


100%|██████████| 903k/903k [00:00<00:00, 14.4MB/s]

Extracting files...
Path to dataset files: /Users/marius/.cache/kagglehub/datasets/ankurzing/sentiment-analysis-for-financial-news/versions/5





In [2]:
import os

print("Path to dataset files:", path)
print("Files in dataset directory:", os.listdir(path))

Path to dataset files: /Users/marius/.cache/kagglehub/datasets/ankurzing/sentiment-analysis-for-financial-news/versions/5
Files in dataset directory: ['FinancialPhraseBank', 'all-data.csv']


In [4]:
import pandas as pd

file_path = os.path.join(path, "all-data.csv")

columns = ["Sentiment", "News Headline"]

full_data = pd.read_csv(file_path, encoding='latin-1', names=columns)

In [8]:
#we are interested in the labels and tweets only

data = full_data[['Sentiment', 'News Headline']].copy()
data.rename(columns={"Sentiment": "label"}, inplace=True)
data.rename(columns={"News Headline": "text"}, inplace=True)

print(data[:10])

      label                                               text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
5  positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6  positive  For the last quarter of 2010 , Componenta 's n...
7  positive  In the third quarter of 2010 , net sales incre...
8  positive  Operating profit rose to EUR 13.1 mn from EUR ...
9  positive  Operating profit totalled EUR 21.1 mn , up fro...


In [9]:
# Preprocessing function without parallelization (on my hardware took ~3min)

''' Uncomment this if you want to run the preprocessing without parallelization!

def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize

    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens) 

data['processed_text'] = data['text'].apply(preprocess_text)
print(data.head())

'''

' Uncomment this if you want to run the preprocessing without parallelization!\n\ndef preprocess_text(text):\n    # Tokenize text\n    tokens = word_tokenize(text.lower())  # Lowercase and tokenize\n\n    # Remove punctuation and non-alphabetic characters\n    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]\n    tokens = [token for token in tokens if token]  # Remove empty strings\n\n    # Remove stopwords\n    stop_words = set(stopwords.words(\'english\'))\n    filtered_tokens = [word for word in tokens if word not in stop_words]\n\n    # Lemmatize tokens\n    lemmatizer = WordNetLemmatizer()\n    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n\n    return \' \'.join(lemmatized_tokens) \n\ndata[\'processed_text\'] = data[\'text\'].apply(preprocess_text)\nprint(data.head())\n\n'

In [10]:
# Preprocessing function with parallelization (on my hardware took ~2min)
# Roughly a 33% speed increase

from joblib import Parallel, delayed
import nltk
import pandas as pd
import re

nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Tokenize text
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize

    # Remove punctuation and non-alphabetic characters
    tokens = [re.sub(r"[^a-zA-Z]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens)  # Return processed text as a single string

def preprocess_and_return(row):
    return preprocess_text(row['text'])

data['processed_text'] = Parallel(n_jobs=-1)(delayed(preprocess_and_return)(row) for index, row in data.iterrows())

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [11]:
first_tweet = data.iloc[0]
print("Original Tweet:", first_tweet['text'])
print("Processed Tweet:", first_tweet['processed_text'])

Original Tweet: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Processed Tweet: according gran company plan move production russia although company growing


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

class SentimentNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SentimentNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # Input to hidden layer
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # Hidden to output layer
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x  # Output logits for multi-class classification

# Hyperparameters
embedding_dim = 50
hidden_dim = 32
output_dim = 3  # Output dimension is 3 for multi-class classification (negative, neutral, positive)

# Assume 'data' is a DataFrame with 'text' and 'label' columns
# Map string labels to numeric
label_map = {"negative": 0, "neutral": 1, "positive": 2}
data['label'] = data['label'].map(label_map)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(train_data['text']).toarray()
X_test_tfidf = vectorizer.transform(test_data['text']).toarray()

# Convert to tensors
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)

y_train_tensor = torch.tensor(train_data['label'].values, dtype=torch.long)
y_test_tensor = torch.tensor(test_data['label'].values, dtype=torch.long)

# Model setup
model = SentimentNN(input_dim=1000, hidden_dim=64, output_dim=3)  # Three output classes

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
batch_size = 64
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Get predictions and compute accuracy
        _, predicted = torch.max(outputs, 1)  # Get class with max probability
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

# Evaluation on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    correct_predictions = (predicted == y_test_tensor).sum().item()
    total_predictions = y_test_tensor.size(0)
    accuracy = correct_predictions / total_predictions

print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/10, Loss: 0.7250, Accuracy: 0.9948
Epoch 2/10, Loss: 0.1144, Accuracy: 1.0000
Epoch 3/10, Loss: 0.0228, Accuracy: 1.0000
Epoch 4/10, Loss: 0.0084, Accuracy: 1.0000
Epoch 5/10, Loss: 0.0037, Accuracy: 1.0000
Epoch 6/10, Loss: 0.0019, Accuracy: 1.0000
Epoch 7/10, Loss: 0.0011, Accuracy: 1.0000
Epoch 8/10, Loss: 0.0007, Accuracy: 1.0000
Epoch 9/10, Loss: 0.0005, Accuracy: 1.0000
Epoch 10/10, Loss: 0.0004, Accuracy: 1.0000
Test Accuracy: 1.0000


In [25]:
def predict_sentiment(input_text, model, vectorizer):
    # Preprocess input (apply same preprocessing as during training)
    # If you had specific preprocessing (like lowercasing, removing stopwords, etc.), apply here
    processed_text = input_text.lower()  # Example preprocessing step

    # Convert text to TF-IDF features using the trained vectorizer
    input_tfidf = vectorizer.transform([processed_text]).toarray()

    # Convert to PyTorch tensor
    input_tensor = torch.tensor(input_tfidf, dtype=torch.float32)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradient calculation since we are not training
    with torch.no_grad():
        # Pass the input through the model
        outputs = model(input_tensor)

        # Get predicted class (index of the highest value)
        _, predicted_class = torch.max(outputs, 1)

        # Map predicted class back to original label
        label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
        predicted_label = label_mapping[predicted_class.item()]

    return predicted_label

# Example usage
input_text = "I don't really mind this, but usually I dont mind this"
predicted_sentiment = predict_sentiment(input_text, model, vectorizer)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: negative
