# Text Classification on zeroshot/twitter-financial-news-sentiment from Hugging Face

Load Dataset

In [None]:
pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset by its Hugging Face repository name. The returned object is
# indexed by split name ("train", "validation") in the cells below.
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")

In [None]:
# Save the loaded dataset to a local directory (the path is relative to the
# current working directory) so it can be reloaded with load_from_disk.
dataset.save_to_disk("zeroshot/twitter-financial-news-sentiment")

In [None]:
from datasets import load_from_disk

# Reload the dataset from the directory written by save_to_disk, replacing the
# in-memory `dataset` variable with the on-disk copy.
dataset = load_from_disk("zeroshot/twitter-financial-news-sentiment")

In [None]:
# Print the first few examples from the training split.
# Slicing a split (dataset["train"][:5]) returns a mapping of column name to a
# list of values, so looping over the slice only prints the column names.
# Indexing one row at a time returns a single example per iteration instead.
train_split = dataset["train"]
for row_index in range(min(5, len(train_split))):
    print(train_split[row_index])

In [None]:
# Print the first few examples from the validation split.
# Slicing a split (dataset["validation"][:5]) returns a mapping of column name
# to a list of values, so looping over the slice only prints the column names.
# Indexing one row at a time returns a single example per iteration instead.
validation_split = dataset["validation"]
for row_index in range(min(5, len(validation_split))):
    print(validation_split[row_index])

In [None]:
# Pick the training split out of the DatasetDict.
specific_dataset = dataset["train"]

# Report the split's size and its feature schema.
print("Dataset loaded successfully!")
for caption, detail in (
    ("Number of examples:", len(specific_dataset)),
    ("Features available:", specific_dataset.features),
):
    print(caption, detail)

In [None]:
# Show the text and label of up to five training examples, numbered from 1.
print("\nFirst few examples:")
train_rows = dataset["train"]
for number in range(1, min(5, len(train_rows)) + 1):
    row = train_rows[number - 1]
    print("Example", number, ":")
    print("Text:", row["text"])
    print("Label:", row["label"])
    print()

In [None]:
# Pick the validation split out of the DatasetDict.
specific_dataset = dataset["validation"]

# Report the split's size and its feature schema.
print("Dataset loaded successfully!")
for caption, detail in (
    ("Number of examples:", len(specific_dataset)),
    ("Features available:", specific_dataset.features),
):
    print(caption, detail)

In [None]:
# Print the first few examples of the validation split.
# The rows must be read from the same split that bounds the loop; reading from
# dataset["train"] here would silently show training tweets instead.
print("\nFirst few examples:")
validation_split = dataset["validation"]
for i in range(min(5, len(validation_split))):
    example = validation_split[i]
    print("Example", i+1, ":")
    print("Text:", example["text"])
    print("Label:", example["label"])
    print()

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Bind the two published splits to short names used by the rest of the notebook.
train_data, val_data = dataset["train"], dataset["validation"]

In [None]:
# Split the published validation split 50/50 into validation and test sets.
# The source is dataset["validation"] rather than val_data: this cell
# overwrites val_data, so splitting val_data itself would halve the data again
# every time the cell is re-run. The fixed random_state keeps the split
# reproducible.
# NOTE(review): scikit-learn indexes the Hugging Face split generically, so both
# halves are presumably plain column mappings ("text"/"label") rather than
# Dataset objects — confirm with the installed library versions.
val_data, test_data = train_test_split(
    dataset["validation"], test_size=0.5, random_state=42
)

# Data Preprocessing:

Tokenization: The text data is split into individual words using CountVectorizer.

In [None]:
# Bag-of-words vectorizer with default settings; it is fitted on the training
# text in the next cell and reused for the validation and test text.
vectorizer = CountVectorizer()


Conversion: Tokenized data is transformed into a format suitable for training by creating sparse matrices (X_train, X_val, X_test) of token counts.

In [None]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vocabulary to encode the validation and test text.
X_train = vectorizer.fit_transform(train_data["text"])
X_val, X_test = (
    vectorizer.transform(part["text"]) for part in (val_data, test_data)
)

# Labels for each of the three sets.
y_train, y_val, y_test = (
    part["label"] for part in (train_data, val_data, test_data)
)

# Model Training:

Classification Model: Multinomial Naive Bayes (MultinomialNB) is chosen for classification.

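Pipeline: A scikit-learn pipeline (make_pipeline) wraps the classifier. Note that tokenization is not part of this pipeline: the CountVectorizer above is applied separately, so the pipeline is trained on the already-vectorized matrices.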



In [None]:
# Single-step pipeline holding only the Multinomial Naive Bayes estimator.
# The CountVectorizer is NOT part of this pipeline: text is vectorized
# separately, so the pipeline is fitted on already-vectorized matrices.
classifier = make_pipeline(MultinomialNB())

# Training:

The pipeline is trained on the training data (X_train, y_train) using the fit() method.

In [None]:
# Fit the pipeline on the count-vectorized training matrix and its labels.
classifier.fit(X_train, y_train)

# Model Evaluation:

Evaluation: The trained model is evaluated on the test set (X_test) to assess its performance.

Metrics: Accuracy and F1 score (weighted average for multiclass classification) are computed to measure performance.

In [None]:
# Predict labels for the held-out test matrix; they are scored in the next cell.
y_pred = classifier.predict(X_test)


Metrics: Accuracy and F1 score (weighted average for multiclass classification) are computed to measure performance

In [None]:
# Score the test-set predictions. The F1 score uses the 'weighted' average
# because the task is multiclass.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Model evaluation results:")
for caption, score in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(caption, score)



Accuracy: The proportion of correctly classified instances among all instances. In this case, it means that 78.81% of the tweets in the test set were correctly classified by the model.

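F1 Score: The harmonic mean of precision and recall, which provides a balance between them. Because this is a multiclass problem, the F1 score is computed for each class and then averaged, with each class weighted by its number of true instances (average='weighted'). A higher F1 score indicates better performance, considering both false positives and false negatives.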

Let's explore different feature engineering and model selection techniques to potentially improve the performance of the model.

 Implementing TF-IDF vectorization along with a Naive Bayes classifier

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

# Data Preprocessing - TF-IDF Vectorization


In [None]:
# Data preprocessing: TF-IDF vectorizer with default settings. It is fitted on
# the training text in the next cell.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Training labels, then the TF-IDF features: the vectorizer is fitted on the
# training text only and that same text is transformed in one step.
y_train = train_data["label"]
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data["text"])

In [None]:
# Encode the validation and test text with the already-fitted TF-IDF
# vectorizer, and collect the matching labels.
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(part["text"]) for part in (val_data, test_data)
)
y_val, y_test = (part["label"] for part in (val_data, test_data))

# Model Training - Naive Bayes Classifier

In [None]:
# Create a Multinomial Naive Bayes classifier and fit it on the TF-IDF
# training features and labels.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Model Evaluation

In [None]:
# Run the fitted Naive Bayes model over the validation and test features.
y_val_pred, y_test_pred = (
    nb_classifier.predict(features)
    for features in (X_val_tfidf, X_test_tfidf)
)

# Calculate performance metrics

In [None]:
# Accuracy and weighted F1 (weighted average for multiclass classification)
# for the validation and test predictions.
val_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)
val_f1, test_f1 = (
    f1_score(truth, predicted, average='weighted')
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)

In [None]:
# Print accuracy and F1 for the validation set, then for the test set.
for heading, accuracy_value, f1_value in (
    ("Validation Set Performance:", val_accuracy, val_f1),
    ("\nTest Set Performance:", test_accuracy, test_f1),
):
    print(heading)
    print("Accuracy:", accuracy_value)
    print("F1 Score:", f1_value)

# Analysis:

Accuracy Evaluation: The accuracy scores for both the validation and test sets are relatively modest, indicating that the model correctly classified approximately 69% and 72% of the tweets in the respective sets.


F1 Score Examination: The F1 scores, which combine precision and recall, provide a more balanced assessment of the model's performance than accuracy alone. With weighted F1 scores of around 60% for the validation set and 63.50% for the test set, the model is only moderately effective at balancing false positives and false negatives across the classes.

To try to improve further, let's try word embeddings (Word2Vec) with Logistic Regression.

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Data Preprocessing - Word Embeddings with Word2Vec

In [None]:
# Whitespace-tokenize every training tweet and train a Word2Vec model on the
# resulting token lists (100-dimensional vectors, context window of 5,
# every token kept via min_count=1, 4 worker threads).
# NOTE(review): no seed is passed, so embeddings may differ between runs.
sentences = [tweet.split() for tweet in train_data["text"]]
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [None]:
# Function to average Word2Vec vectors for each sentence
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one sentence.

    Args:
        words: Iterable of tokens making up the sentence.
        model: Object whose ``wv[token]`` lookup returns that token's vector.
        vocabulary: Container of tokens eligible for lookup; tokens not in it
            are skipped.
        num_features: Length of the returned vector.

    Returns:
        A float64 array of length ``num_features`` holding the average of the
        vectors of all in-vocabulary tokens, or all zeros when no token
        matched.
    """
    # Look up only the tokens the vocabulary knows about.
    matched_vectors = [model.wv[token] for token in words if token in vocabulary]

    # Accumulate in float64, starting from a zero vector.
    total = np.zeros((num_features,), dtype="float64")
    for vector in matched_vectors:
        total = np.add(total, vector)

    # No match: return the zero vector unchanged rather than dividing by zero.
    if not matched_vectors:
        return total
    return np.divide(total, float(len(matched_vectors)))

In [None]:
# Convert text data to Word2Vec embeddings (one averaged vector per tweet).
# The vocabulary is turned into a set once: index_to_key is a list, so testing
# every token against it directly scans the whole vocabulary each time.
known_words = set(word2vec_model.wv.index_to_key)
# Read the embedding width from the trained model instead of repeating the
# literal 100, so it cannot drift from the vector_size used at training time.
embedding_dim = word2vec_model.wv.vector_size

X_train_word2vec, X_val_word2vec, X_test_word2vec = (
    np.array([
        average_word_vectors(text.split(), word2vec_model, known_words, embedding_dim)
        for text in part["text"]
    ])
    for part in (train_data, val_data, test_data)
)
y_train = train_data["label"]
y_val = val_data["label"]
y_test = test_data["label"]

# Model Training - Logistic Regression

In [None]:
# Create a logistic regression classifier (the solver's iteration limit is set
# explicitly to 1000) and fit it on the averaged Word2Vec training features.
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train_word2vec, y_train)

# Model Evaluation

In [None]:
# Run the fitted logistic regression over the validation and test embeddings.
y_val_pred, y_test_pred = (
    logistic_regression.predict(features)
    for features in (X_val_word2vec, X_test_word2vec)
)

In [None]:
# Accuracy and weighted F1 (weighted average for multiclass classification)
# for the validation and test predictions.
val_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)
val_f1, test_f1 = (
    f1_score(truth, predicted, average='weighted')
    for truth, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)

In [None]:
# Print accuracy and F1 for the validation set, then for the test set.
for heading, accuracy_value, f1_value in (
    ("Validation Set Performance:", val_accuracy, val_f1),
    ("\nTest Set Performance:", test_accuracy, test_f1),
):
    print(heading)
    print("Accuracy:", accuracy_value)
    print("F1 Score:", f1_value)

# Analysis:

Accuracy Assessment:
The accuracy scores on both the validation and test sets are relatively low, with approximately 66% accuracy on the validation set and 68% accuracy on the test set. This suggests that the model correctly classified only about two-thirds of the tweets in each set, indicating a modest level of performance.
F1 Score Evaluation:

The F1 scores, which combine precision and recall, provide a more comprehensive evaluation of the model's performance than accuracy alone. With weighted F1 scores of around 56% for the validation set and 58% for the test set, the model is only moderately effective at balancing false positives and false negatives across the classes.

# Model Deployment

After experimenting with various models for sentiment analysis of financial news tweets, we found that the first model — CountVectorizer features with Multinomial Naive Bayes, stored in the `classifier` variable — achieved the highest test accuracy. Therefore, we have decided to deploy this model for practical use.

# Save the trained model

In [None]:
import joblib

# Save the classifier and the fitted CountVectorizer together as one tuple.
# The pipeline holds only the Naive Bayes step, so the vectorizer is needed to
# turn raw text into features at prediction time.
joblib.dump((classifier, vectorizer), 'sentiment_classifier.joblib')

In [None]:
import joblib
# Restore the (classifier, vectorizer) tuple written by joblib.dump.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# model files that come from a trusted source.
classifier, vectorizer = joblib.load('sentiment_classifier.joblib')