In [13]:
%reset

# Sentiment classifier

# Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

import tensorflow as tf
from tensorflow.keras import optimizers, metrics, losses

from wordcloud import WordCloud,STOPWORDS
# from datasets import DatasetDict
import re
import warnings
warnings.filterwarnings('ignore')

ImportError: cannot import name 'DataCollatorWithPadding' from 'transformers' (/Users/maksim/anaconda3/lib/python3.11/site-packages/transformers/__init__.py)

## Settings

In [None]:
# If True, loads weights of model from my_model.h5 file. No need to train again.
MODEL_SAVED = False

%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 8)
plt.style.use("bmh")
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["axes.edgecolor"] = "black"

# Dataset preparing

## Dataset building

In [None]:
# raw_data = pd.read_csv('financial_phrasebank.txt', sep='\n', header=None)
# df = raw_data.copy()

In [None]:
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
path = '/Users/maksim/Documents/VSE/4. Semester/TextAnalysis_1/SeminarWork/'
with open(path + 'financial_phrasebank.txt', 'r', encoding='UTF-8') as file:
    text_string = file.read()

# Keep only non-blank lines. This removes the empty string produced by a
# trailing newline without discarding the last record when the file does
# not end with a newline (an unconditional iloc[:-1] would drop it).
lines = [line for line in text_string.split('\n') if line.strip()]

# Create a DataFrame from the lines
raw_data = pd.DataFrame({'text': lines})
df = raw_data.copy()

Each line follows the `text@label` pattern, so we split it into a text column and a label column at the last "@".

In [None]:
# The first group is greedy, so the split happens at the last "@" of a line.
# The two unnamed capture groups come back as columns 0 and 1.
df = (
    df['text']
    .str.extract(r'(.*)\@(.*)', expand=True)
    .rename(columns={0: 'text', 1: 'label'})
)

In [None]:
# Preview the first rows of the text/label frame.
df.head()

In [None]:
# Count missing values per column (a line that did not match the
# text@label pattern would presumably surface here as NaN).
df.isna().sum()

No missing data

## Mapping label

In [None]:
# Mapping dictionary
mapping = {'positive': 1, 'negative': 0, 'neutral': 2}

# Apply mapping to the 'label' column. Series.map turns every value that is
# not a key of `mapping` into NaN, so check that nothing was lost before
# overwriting the column. This also catches re-running the cell on labels
# that have already been converted to integers.
label_ids = df['label'].map(mapping)
unmapped_labels = df.loc[label_ids.isna(), 'label'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Labels missing from mapping: {unmapped_labels}")
df['label'] = label_ids
df.head()

## Convert as a final dataset

Splitting the data into training, test and validation as 70%, 20% and 10% respectively

In [12]:
# Dataset / DatasetDict come from the Hugging Face `datasets` package;
# `dask.datasets` (imported here before) does not provide them.
import datasets

# Splitting the dataframe into train, test, and validation parts:
# 30% is held out first, then 33% of that hold-out becomes the validation
# set, i.e. roughly 70% / 20% / 10% overall.
train_df, holdout_df = train_test_split(df, test_size=0.3, random_state=42)
test_df, val_df = train_test_split(holdout_df, test_size=0.33, random_state=42)

# Convert the dataframes to datasets
train_dataset = datasets.Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = datasets.Dataset.from_pandas(test_df, preserve_index=False)
val_dataset = datasets.Dataset.from_pandas(val_df, preserve_index=False)

# Create a DatasetDict object
dataset = datasets.DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': val_dataset
})

NameError: name 'df' is not defined

In [None]:
# Display the DatasetDict to inspect the three splits.
dataset

# Data Understanding

## Label distribution

In [None]:
# Horizontal bar chart: the label ids sit on the y-axis and their counts on
# the x-axis, so the axis titles have to be assigned accordingly.
fig, ax = plt.subplots(figsize=(10, 6))
df["label"].value_counts(ascending=True).plot.barh(ax=ax)
ax.set_xlabel('Count')
ax.set_ylabel('Labels')
ax.set_title('Label Distribution')
plt.xticks(rotation=45)
plt.show()

display(df['label'].value_counts())

The labels are not equally distributed: most of the instances are neutral, and the negative label is the least frequent.

## Word Clouds

Positive labels

In [None]:
# Word cloud over every sentence labelled positive (label id 1 in `mapping`).
df_pos = df.loc[df['label'] == 1]
words = ' '.join(df_pos['text'].astype(str))
# Drop tokens that start with "@" before building the cloud.
cleaned_word = ' '.join(token for token in words.split() if not token.startswith('@'))

cloud_builder = WordCloud(width=3000, height=2500,
                          background_color='white', stopwords=STOPWORDS)
wordcloud = cloud_builder.generate(cleaned_word)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Negative labels

In [None]:
# Word cloud over every sentence labelled negative (label id 0 in `mapping`).
df_neg = df.loc[df['label'] == 0]
words = ' '.join(df_neg['text'].astype(str))
# Drop tokens that start with "@" before building the cloud.
kept_tokens = [token for token in words.split() if not token.startswith('@')]
cleaned_word = ' '.join(kept_tokens)

cloud_options = dict(background_color='white', stopwords=STOPWORDS,
                     width=3000, height=2500)
wordcloud = WordCloud(**cloud_options).generate(cleaned_word)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Neutral labels

In [None]:
# Word cloud over every sentence labelled neutral (label id 2 in `mapping`).
is_neutral = df['label'] == 2
df_neu = df[is_neutral]
words = ' '.join(df_neu['text'].astype(str))
# Drop tokens that start with "@" before building the cloud.
cleaned_word = ' '.join(
    token for token in words.split() if not token.startswith('@')
)

wordcloud = WordCloud(
    background_color='white',
    stopwords=STOPWORDS,
    width=3000,
    height=2500,
).generate(cleaned_word)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Data Preprocessing

## Tokenization

In [None]:
# Reset the output format of the dataset before it is tokenized.
dataset.reset_format()

In [None]:
# Checkpoint name used for both the tokenizer and the classification model.
model_ckpt = "distilbert-base-uncased"

In [None]:
# `model_max_length` is the tokenizer setting that caps the sequence length
# applied when truncation is enabled; `max_length` is an argument of the
# tokenizer call, not of from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt, model_max_length=512)

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# batch_size=None hands each split to `tokenize` as a single batch.
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=None)
# `with_format` returns a new object instead of modifying in place, so the
# former bare `tokenized_dataset.with_format('tensorflow')` call changed
# nothing. The TensorFlow pipelines are built further down via
# `to_tf_dataset`, so the dataset is only displayed here.
tokenized_dataset

In [None]:
# Show the first tokenized training example.
print(tokenized_dataset["train"][:1])

## Model initialization

In [None]:
# Batch size shared by the train / validation / test tf.data pipelines.
batch_size = 64
# Collator handed to `to_tf_dataset` as collate_fn; configured to return
# TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
data_collator

In [None]:
def _split_to_tf(split_name, shuffle):
    """Build the tf.data pipeline for one split of `tokenized_dataset`.

    All splits use the same feature columns, label column, batch size and
    collator; only the split name and the shuffle flag differ.
    """
    return tokenized_dataset[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=data_collator,
    )


# Only the training split is shuffled.
tf_train_dataset = _split_to_tf("train", shuffle=True)
tf_valid_dataset = _split_to_tf("validation", shuffle=False)
tf_test_dataset = _split_to_tf("test", shuffle=False)

In [None]:
# Number of output classes (positive, negative, neutral in `mapping`).
num_labels = 3


def create_model():
    """Load the `model_ckpt` sequence classifier and compile it.

    The model is compiled with Adam (learning rate 5e-5), sparse categorical
    cross-entropy computed from logits, and sparse categorical accuracy.

    Returns:
        The compiled model.
    """
    classifier = TFAutoModelForSequenceClassification.from_pretrained(
        model_ckpt, num_labels=num_labels
    )
    adam = optimizers.Adam(learning_rate=5e-5)
    cross_entropy = losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = metrics.SparseCategoricalAccuracy()
    classifier.compile(optimizer=adam, loss=cross_entropy, metrics=accuracy)
    return classifier

# The model is built the same way in both cases; stored weights are loaded
# on top only when MODEL_SAVED is set.
model = create_model()
if MODEL_SAVED:
    model.load_weights('my_model.h5')

In [None]:
# Print the layer / parameter overview of the compiled model.
model.summary()

In [8]:
# Train only when no stored weights were loaded.
if not MODEL_SAVED:
    # Keep the History object so the training curves can be inspected later.
    history = model.fit(tf_train_dataset,
                        validation_data=tf_valid_dataset,
                        epochs=5)
    # Persist the weights under the file name that the MODEL_SAVED = True
    # path loads; without this nothing in the notebook ever writes that file.
    model.save_weights('my_model.h5')

NameError: name 'MODEL_SAVED' is not defined

## Test the model

In [77]:
# Encode the sentence as a batch of one and keep the attention mask. A bare
# list of token ids has no batch dimension, so it is not a valid input for
# sentence-level prediction.
sample_text = "BTC price decreased"
encoded = tokenizer([sample_text], truncation=True, return_tensors="np")
outputs = model.predict(dict(encoded))  # Keras expects a plain dict
# Softmax is monotonic, so the argmax of the raw logits is already the
# predicted class id. The result stays a NumPy integer (it has `.item()`).
label_int = np.argmax(outputs['logits'][0])

def get_mapping_value(value, label_mapping=None):
    """Return the label name whose id equals `value`.

    Args:
        value: Label id to look up.
        label_mapping: Optional dict of label name -> label id. When omitted,
            the notebook-level `mapping` dictionary is used.

    Returns:
        The first key whose mapped id equals `value`, or None if there is
        no such key.
    """
    if label_mapping is None:
        label_mapping = mapping
    return next(
        (key for key, mapped_value in label_mapping.items() if mapped_value == value),
        None,
    )

# Translate the predicted class id back to its label name and print it.
print(get_mapping_value(label_int.item()))

negative


## Model evaluation

In [68]:
results = model.evaluate(tf_test_dataset)
# Index 1 is read as the accuracy, the only metric compiled in create_model
# (index 0 is presumably the loss).
test_accuracy_pct = results[1] * 100
print(f"Test set accuracy: {test_accuracy_pct:.2f}%")

Test set accuracy: 92.22%


We can conclude that the model correctly predicted the label for 92.22% of the test instances. It performs very well.

# Topic modeling

## Corpus preparation

In [69]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words, compared against the lower-cased token.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(sentence):
    """Tokenize a sentence and join the tokens that are not stop words."""
    kept = (token for token in word_tokenize(sentence) if token.lower() not in stop_words)
    return ' '.join(kept)


corpus = df['text'].apply(_drop_stop_words)

# Create a stemmer object
stemmer = PorterStemmer()

# Per document: tokenize again, stem every token, re-join with spaces,
# then strip all digit runs from the stemmed text.
stemmed_corpus = [
    re.sub(r'\d+', '', ' '.join(stemmer.stem(token) for token in word_tokenize(document)))
    for document in corpus
]

print(stemmed_corpus[:2])

['accord gran , compani plan move product russia , although compani grow .', 'new product plant compani would increas capac meet expect increas demand would improv use raw materi therefor increas product profit .']


In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the stemmed corpus. max_df=0.8 and min_df=2 bound
# a term's document frequency from above (as a share of the documents) and
# from below (as a document count).
count_vect = CountVectorizer(max_df=0.8, min_df=2)
doc_term_matrix = count_vect.fit_transform(stemmed_corpus)

We keep only those words that appear in at most 80% of the documents (`max_df=0.8`) and in at least 2 documents (`min_df=2`). Before vectorizing we also removed numbers, performed stemming, and removed all the stop words, as they do not really contribute to topic modeling.



In [71]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

The parameter n_components specifies the number of categories, or topics, that we want our text to be divided into; in our case it is 5. The parameter random_state is set to 42 so that we get the same results on every run.

In [78]:
# Look the vocabulary up once instead of once per printed word.
feature_names = count_vect.get_feature_names_out()
for topic_idx, topic_weights in enumerate(LDA.components_):
    # argsort is ascending, so the last five positions hold the largest
    # weights (printed from smaller to larger).
    top_term_ids = topic_weights.argsort()[-5:]
    print(f'Top 5 words for topic #{topic_idx}:')
    print([feature_names[term_id] for term_id in top_term_ids])
    print('\n')

Top 5 words for topic #0:
['oyj', 'contract', 'said', 'finnish', 'servic']


Top 5 words for topic #1:
['sale', 'net', 'profit', 'mn', 'eur']


Top 5 words for topic #2:
['share', 'oper', 'product', 'also', 'compani']


Top 5 words for topic #3:
['finnish', 'euro', 'compani', 'mln', 'share']


Top 5 words for topic #4:
['nokia', 'finland', 'oper', 'start', 'compani']


We obtain 5 topics and, for each topic, its 5 highest-weighted words.

# Collocation

In [79]:
# Flatten the stemmed corpus into one list of word tokens (runs of \w
# characters), preserving document order.
words = [
    match
    for document in stemmed_corpus
    for match in re.findall(r'\w+', document)
]

print(words[:20])

['accord', 'gran', 'compani', 'plan', 'move', 'product', 'russia', 'although', 'compani', 'grow', 'new', 'product', 'plant', 'compani', 'would', 'increas', 'capac', 'meet', 'expect', 'increas']


In [74]:
# Print 10 bigram collocations found by nltk.Text in the flattened word list.
# NOTE(review): nltk presumably ranks these with an association measure
# rather than by raw frequency; confirm against the nltk docs.
import nltk
colText = nltk.Text(words)
colText.collocations(10)

net sale; oper profit; correspond period; eur million; euro mln; mln
euro; oyj hel; per share; third quarter; omx helsinki


In [75]:
# Materialize every consecutive pair of tokens and report both sizes.
colBigrams = list(nltk.ngrams(colText, 2))
word_count, bigram_count = len(words), len(colBigrams)
print(f"Number of words: {word_count}")
print(f"Number of bigrams: {bigram_count}")

Number of words: 42190
Number of bigrams: 42189


Here we check that the bigram function has gone through and counted the entire text. Having one fewer bigram than words is correct because the bigrams are generated word-by-word from the text above: each bigram pairs a word with its successor, and the last word has none.

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
vectorizer = TfidfVectorizer()

# Compute the TF-IDF matrix for the corpus
tfidf_matrix = vectorizer.fit_transform(stemmed_corpus)

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# For every term keep the highest TF-IDF score it reaches in any document.
# The rows are read straight from the CSR buffers (indptr / indices / data)
# instead of indexing the sparse matrix one element at a time. The visiting
# order (documents in order, stored entries within a document) is unchanged,
# so ties in the ranking below resolve exactly as before.
tfidf_csr = tfidf_matrix.tocsr()
collocations = defaultdict(float)
for row_start, row_end in zip(tfidf_csr.indptr[:-1], tfidf_csr.indptr[1:]):
    row_terms = tfidf_csr.indices[row_start:row_end]
    row_scores = tfidf_csr.data[row_start:row_end]
    for term_idx, score in zip(row_terms, row_scores):
        if score == 0:
            # Mirror nonzero(): explicitly stored zeros are not real entries.
            continue
        term = terms[term_idx]
        collocations[term] = max(collocations[term], score)

# Sort collocations by TF-IDF score in descending order
sorted_collocations = sorted(collocations.items(), key=lambda x: x[1], reverse=True)

# Print the top collocations
top_collocations = sorted_collocations[:25]
for collocation in top_collocations:
    print(collocation)

('forecast', 1.0)
('could', 1.0)
('loan', 1.0)
('welcom', 1.0)
('think', 1.0)
('ls', 0.9461490187934268)
('thousand', 0.9147372572392014)
('sekm', 0.885271235482454)
('capman', 0.885127636876205)
('xa', 0.8571208251401715)
('aspo', 0.8548037030857015)
('catalyst', 0.8535141508484155)
('nd', 0.8377859485034687)
('kemira', 0.8310511568377097)
('billion', 0.8284977537700866)
('mln', 0.8224237824962862)
('onlin', 0.822311325120984)
('digia', 0.8183396283172262)
('dopplr', 0.8175189047653515)
('nokia', 0.8114146298096994)
('appoint', 0.8080702049604045)
('cap', 0.8012149309251502)
('nwc', 0.8012052708494896)
('eurm', 0.8011543501013417)
('sek', 0.8005964749561502)


Here is a list of the top 25 terms, ranked by the highest TF-IDF score each term reaches in any single document.