In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
import re

2022-05-31 18:16:00.242986: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-31 18:16:00.243012: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Put the datasets in a dataframes
# Make sure you put financial-dataset.csv and all-data.csv in your colab space, or locally
df1 = pd.read_csv("datasets/financial-dataset.csv", encoding="ISO-8859-1")
df2 = pd.read_csv("datasets/all-data.csv", encoding="ISO-8859-1")
df3 = pd.read_csv("datasets/stock_data.csv", encoding="ISO-8859-1")

# Pre-process the stock_data dataset to have the same features as the other two datasets (Sentiment, Sentence)
df3 = df3.rename(columns = {'Text': 'Sentence'})
df3['Sentiment'] = df3['Sentiment'].replace(1,'positive')
df3['Sentiment'] = df3['Sentiment'].replace(-1,'negative')
df1 = df1[df1['Sentiment'] != 'neutral'] 
df2 = df2[df2['Sentiment'] != 'neutral']

# Combine dfs
df = pd.concat([df1, df2, df3], ignore_index=True)


In [3]:
# Print important info for understanding data
print('Number of records with each sentiment:\n')
print('Positive:')
print(len(df[df["Sentiment"] == "positive"]), '\n')
print('Negative:')
print(len(df[df["Sentiment"] == "negative"]), '\n')
print('Neutral:')
print(len(df[df["Sentiment"] == "neutral"]), '\n')
print('Data example:\n')
df.tail(10)

Number of records with each sentiment:

Positive:
6900 

Negative:
3570 

Neutral:
0 

Data example:



Unnamed: 0,Sentence,Sentiment
10460,#CoronavirusPandemic \n\nAs bad as #China's ec...,negative
10461,#China #CoronavirusPandemic \n\nThe economy wa...,negative
10462,"Reliance Industries raises Rs 8,500 crore usin...",positive
10463,Goldman Sachs tells investors to go defensive ...,negative
10464,"#TCS #share price jumps 9% on no #layoffs, #di...",positive
10465,Industry body CII said #discoms are likely to ...,negative
10466,"#Gold prices slip below Rs 46,000 as #investor...",negative
10467,Workers at Bajaj Auto have agreed to a 10% wag...,positive
10468,"#Sharemarket LIVE: Sensex off dayâs high, up...",positive
10469,"#Sensex, #Nifty climb off day's highs, still u...",positive


In [4]:
# Get a random sample of different sentiments
df_positive = df[df["Sentiment"] == "positive"].sample(5000)
df_negative = df[df["Sentiment"] == "negative"]
df_neutral = df[df["Sentiment"] == "neutral"]
df_partial = df_positive.append(df_negative).append(df_neutral)

In [5]:
punctuations = string.punctuation
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def clean_sentence(sentence):
    # Remove digits
    sentence = ''.join([char for char in sentence if not char.isdigit()])
    # Remove html text
    sentence = re.sub('<[^<]+?>', '', sentence)
    
    return sentence

def spacy_tokenizer(sentence):
    
    # tokenize
    tokens = nlp(sentence)
    # Get root words and lowercase
    tokens = [word.lemma_.lower().strip() for word in tokens]
    # Remove stop words and punctuations
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations]
    
    return tokens
    

In [6]:
#Split dataset randomly for 80% training and 20% test data
train_sentences, test_sentences, train_tags, test_tags = train_test_split(df_partial["Sentence"],
                                                                      df_partial["Sentiment"],
                                                                      test_size=0.2, 
                                                                      random_state=3,
                                                                      stratify=df_partial["Sentiment"])
# For training
train_tags = train_tags.to_numpy()
train_sentences = train_sentences.apply(clean_sentence)
train_sentences = train_sentences.to_numpy()
# For testing
test_tags = test_tags.to_numpy()
test_sentences = test_sentences.apply(clean_sentence)
test_sentences = test_sentences.to_numpy()

In [7]:
with open("datasets/positive-words.txt", encoding = "ISO-8859-1") as f:
    pos_words = f.readlines()
pos_words = [p[0:len(p)-1] for p in pos_words if p[0].isalpha()]

# print the first 50 words
print('Positive words:','\n')
print(pos_words[:20], '\n')

with open("datasets/negative-words.txt", encoding = "ISO-8859-1") as f:
    neg_words = f.readlines()
neg_words = [p[0:len(p)-1] for p in neg_words if p[0].isalpha()] 

# Print the first 50 negative words
print('Negative words:','\n')
print(neg_words[:20])

Positive words: 

['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade', 'accolades', 'accommodative', 'accomodative', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accurate', 'accurately'] 

Negative words: 

['abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted', 'aborts', 'abrade', 'abrasive', 'abrupt', 'abruptly', 'abscond', 'absence', 'absent-minded', 'absentee', 'absurd', 'absurdity', 'absurdly']


In [8]:
print(train_sentences.shape)
print(test_sentences.shape)

(6856,)
(1714,)


In [9]:
# The CountVectorizer builds a dictionary of all words (count_vect.vocabulary_), 
# and generates a matrix (train_counts), to represent each sentence
# as a set of indices into the dictionary. The words in the dictionary are the words found in train_sentences.

count_vect = CountVectorizer(tokenizer = spacy_tokenizer)
train_counts = count_vect.fit_transform(train_sentences)

In [10]:
# Training the model
clf = MultinomialNB().fit(train_counts, train_tags) 

In [11]:
# Testing on training set
predicted = clf.predict(train_counts)
correct = 0
for tag, pred in zip(train_tags, predicted):   # zip allows to go through two lists simultaneously
    if (tag == pred):
        correct += 1
print("Correctly classified %s total training examples out of %s examples" %(correct, train_tags.size))

Correctly classified 6136 total training examples out of 6856 examples


In [12]:
# Testing on test set
test_counts = count_vect.transform(test_sentences)
test_predicted = clf.predict(test_counts)
correct = 0
for tag, pred in zip(test_tags, test_predicted):
    if (tag == pred):
        correct += 1
print("Correctly classified %s total training examples out of %s examples" %(correct, test_tags.size))

Correctly classified 1369 total training examples out of 1714 examples


In [13]:
# Function for recall 
#
#           TPositive
#     -----------------------
#      TPositive + FNegative

def recall(actual_tags, predictions, class_of_interest):
    true_positive_count, false_negative_count = 0, 0
    for i in range(len(actual_tags)):
        if (actual_tags[i] == class_of_interest):
            if (actual_tags[i] == predictions[i]):
                true_positive_count += 1
            else:
                false_negative_count += 1
    return (true_positive_count / (true_positive_count + false_negative_count))

# Function for precision
#
#           TPositive
#     -----------------------
#      TPositive + FPositive

def precision(actual_tags, predictions, class_of_interest):
    true_positive_count, false_positive_count = 0, 0
    for i in range(len(actual_tags)):
        if (actual_tags[i] == class_of_interest and actual_tags[i] == predictions[i]):
            true_positive_count += 1
        elif (actual_tags[i] != class_of_interest and predictions[i] == class_of_interest):
            false_positive_count += 1
    return (true_positive_count / (true_positive_count + false_positive_count))

# Function for f score
#
#           Precision x Recall
#      2 x ---------------------
#           Precision + Recall

def f_score(precision_score, recall_score):
    return ((2 * precision_score * recall_score) / (precision_score + recall_score))

# Test recall of model
print('Recall:\n')

p_recall = recall(test_tags, test_predicted, "positive")
neg_recall = recall(test_tags, test_predicted, "negative")
#neu_recall = recall(test_tags, test_predicted, "neutral")

print('Positive: ', p_recall)
print('Negative: ', neg_recall)
#print('Neutral: ', neu_recall)
print('\n')

# Test precision of model
print('Precision:\n')

p_precision = precision(test_tags, test_predicted, "positive")
neg_precision = precision(test_tags, test_predicted, "negative")
#neu_precision = precision(test_tags, test_predicted, "neutral")

print('Positive: ', p_precision)
print('Negative: ', neg_precision)
#print('Neutral: ', neu_precision)
print('\n')

# Test f score of model
print('F score:\n')
print('Positive: ', f_score(p_precision, p_recall))
print('Negative: ', f_score(neg_precision, neg_recall))
#print('Neutral: ', f_score(neu_precision, neu_recall))


Recall:

Positive:  0.828
Negative:  0.757703081232493


Precision:

Positive:  0.8271728271728271
Negative:  0.758765778401122


F score:

Positive:  0.8275862068965518
Negative:  0.7582340574632095


In [15]:
# Test example
sentences_test = '(Bloomberg) -- The labor group trying to organize Apple Inc. employees at an Atlanta store is withdrawing its request for an election, citing what it alleges are illegal union-busting tactics by the company.Most Read from BloombergFord Beats Tesla to the Punch With First Electric F-150 DeliveryRussian Wins in Eastern Ukraine Spark Debate Over Course of WarStocks Notch Their Best Week Since November 2020: Markets WrapZilingo’s Fired CEO Responds to Questions of Mystery PaymentsUkraine Latest: US Slams Putin’s Food-for-Sanctions Relief PlanThe Communications Workers of America said it took the step “because Apple’s repeated violations of the National Labor Relations Act have made a free and fair election impossible,” according to an emailed statement Friday. The labor group also cited Covid-19 infections among staff at the store, located at the city’s Cumberland Mall, which it said “have raised concerns about the ability of eligible employees to vote and the safety of in-person voting.”“Apple has conducted a systematic, sophisticated campaign to intimidate them and interfere with their right to form a union,” the CWA said. Under NLRB rules, a union’s choice to withdraw from an election generally means the vote is canceled and the union would have to wait at least six months before petitioning again to represent the same group of workers.The news represents a setback for the nascent efforts by several U.S. unions to organize Apple’s retail stores. In addition to the push in Georgia, workers at stores in New York, Maryland and Kentucky have announced campaigns. The CWA -- a group that won elections this year among Verizon Communications Inc. retail employees, Activision Blizzard Inc. quality-assurance testers and subcontracted Google Fiber staff -- has said it’s been hearing from numerous Apple workers around the country.When asked about the move, Apple said it was “fortunate to have incredible retail team members and we deeply value everything they bring to Apple.”“We are pleased to offer very strong compensation and benefits for full-time and part-time employees, including health care, tuition reimbursement, new parental leave, paid family leave, annual stock grants and many other benefits,” the Cupertino, California-based company said.In complaints filed last week with the National Labor Relations Board, the CWA accused Apple of violating federal labor law by forcing workers in Atlanta and New York City to attend “captive audience” meetings about unionization.Existing precedent allows companies to hold such meetings, but the labor board’s current general counsel, Jennifer Abruzzo, views them as inherently coercive and illegal. And she’s pursuing cases that could change the precedent.Abruzzo, a former CWA attorney, is also trying to resurrect an old doctrine requiring employers to negotiate with a labor group if they have no “good faith doubt” that most employees support the union.In its statement Friday, the CWA said that it had the support of an “overwhelming majority” of the Atlanta store’s workers when it petitioned in April for an election.In the weeks since employees announced their organizing efforts, Apple has moved to boost its pay and warned of potential negative consequences from unionization. In a recent video message, Apple retail chief Deirdre O’Brien told employees, “We have a relationship that is based on an open and collaborative and direct engagement.” She said she worried about “what it would mean to put another organization in the middle of our relationship.”On Wednesday, the company told employees it would hike the minimum pay for its retail staff to $22.The Apple store’s union organizing committee vowed to press on. “We’re going to reset and strengthen our union,” according to an email sent to workers Friday. “We can share our experience with other stores to help them really prepare for what’s coming their way.”The Atlanta workers had been slated to vote June 2 through June 4 in what would have been the first NLRB election at an Apple store. Major unions have sometimes been wary of such elections, because of the leeway federal law provides companies to aggressively campaign against organizing. But in recent months, unions pulled off stunning wins at an Amazon.com Inc. warehouse and dozens of Starbucks Corp. cafes across the country, emboldening workers and organizers elsewhere.“We are dedicated to our work and to supporting each other,” Atlanta employee Derrick Bowles said in a statement shared by the union. “We are on this journey together. We want to create a truly democratic union that aligns with Apple’s public values.”(Updates with union email in 13th paragraph.)Most Read from Bloomberg BusinessweekThe Tech Rout Isn’t Just Cyclical—It’s Well-Earned, and OverdueA New Prediction Market Lets Investors Bet Big on Almost AnythingA Startup Wants to Rescue You From Browser Tab HellElon Musk’s 420-Degree Edgelord Pivot Is Getting StaleMark Zuckerberg Is Blowing Up Instagram to Try and Catch TikTok©2022 Bloomberg L.P.'
sentences_test = pd.Series(sentences_test)
sentences_test = sentences_test.to_numpy()
sentences_test = count_vect.transform(sentences_test)
sentences_test = clf.predict(sentences_test)
print(sentences_test)

['negative']


In [16]:
import pickle
file_name = 'model.pkl'
with open(file_name, 'wb') as output_file:
    pickle.dump((clf, count_vect), output_file)

In [17]:
with open('model.pkl', 'rb') as f:
    clf_test, count_vect_test = pickle.load(f)
sentences_test = '(Bloomberg) -- The labor group trying to organize Apple Inc. employees at an Atlanta store is withdrawing its request for an election, citing what it alleges are illegal union-busting tactics by the company.Most Read from BloombergFord Beats Tesla to the Punch With First Electric F-150 DeliveryRussian Wins in Eastern Ukraine Spark Debate Over Course of WarStocks Notch Their Best Week Since November 2020: Markets WrapZilingo’s Fired CEO Responds to Questions of Mystery PaymentsUkraine Latest: US Slams Putin’s Food-for-Sanctions Relief PlanThe Communications Workers of America said it took the step “because Apple’s repeated violations of the National Labor Relations Act have made a free and fair election impossible,” according to an emailed statement Friday. The labor group also cited Covid-19 infections among staff at the store, located at the city’s Cumberland Mall, which it said “have raised concerns about the ability of eligible employees to vote and the safety of in-person voting.”“Apple has conducted a systematic, sophisticated campaign to intimidate them and interfere with their right to form a union,” the CWA said. Under NLRB rules, a union’s choice to withdraw from an election generally means the vote is canceled and the union would have to wait at least six months before petitioning again to represent the same group of workers.The news represents a setback for the nascent efforts by several U.S. unions to organize Apple’s retail stores. In addition to the push in Georgia, workers at stores in New York, Maryland and Kentucky have announced campaigns. The CWA -- a group that won elections this year among Verizon Communications Inc. retail employees, Activision Blizzard Inc. quality-assurance testers and subcontracted Google Fiber staff -- has said it’s been hearing from numerous Apple workers around the country.When asked about the move, Apple said it was “fortunate to have incredible retail team members and we deeply value everything they bring to Apple.”“We are pleased to offer very strong compensation and benefits for full-time and part-time employees, including health care, tuition reimbursement, new parental leave, paid family leave, annual stock grants and many other benefits,” the Cupertino, California-based company said.In complaints filed last week with the National Labor Relations Board, the CWA accused Apple of violating federal labor law by forcing workers in Atlanta and New York City to attend “captive audience” meetings about unionization.Existing precedent allows companies to hold such meetings, but the labor board’s current general counsel, Jennifer Abruzzo, views them as inherently coercive and illegal. And she’s pursuing cases that could change the precedent.Abruzzo, a former CWA attorney, is also trying to resurrect an old doctrine requiring employers to negotiate with a labor group if they have no “good faith doubt” that most employees support the union.In its statement Friday, the CWA said that it had the support of an “overwhelming majority” of the Atlanta store’s workers when it petitioned in April for an election.In the weeks since employees announced their organizing efforts, Apple has moved to boost its pay and warned of potential negative consequences from unionization. In a recent video message, Apple retail chief Deirdre O’Brien told employees, “We have a relationship that is based on an open and collaborative and direct engagement.” She said she worried about “what it would mean to put another organization in the middle of our relationship.”On Wednesday, the company told employees it would hike the minimum pay for its retail staff to $22.The Apple store’s union organizing committee vowed to press on. “We’re going to reset and strengthen our union,” according to an email sent to workers Friday. “We can share our experience with other stores to help them really prepare for what’s coming their way.”The Atlanta workers had been slated to vote June 2 through June 4 in what would have been the first NLRB election at an Apple store. Major unions have sometimes been wary of such elections, because of the leeway federal law provides companies to aggressively campaign against organizing. But in recent months, unions pulled off stunning wins at an Amazon.com Inc. warehouse and dozens of Starbucks Corp. cafes across the country, emboldening workers and organizers elsewhere.“We are dedicated to our work and to supporting each other,” Atlanta employee Derrick Bowles said in a statement shared by the union. “We are on this journey together. We want to create a truly democratic union that aligns with Apple’s public values.”(Updates with union email in 13th paragraph.)Most Read from Bloomberg BusinessweekThe Tech Rout Isn’t Just Cyclical—It’s Well-Earned, and OverdueA New Prediction Market Lets Investors Bet Big on Almost AnythingA Startup Wants to Rescue You From Browser Tab HellElon Musk’s 420-Degree Edgelord Pivot Is Getting StaleMark Zuckerberg Is Blowing Up Instagram to Try and Catch TikTok©2022 Bloomberg L.P.'
sentences_test = pd.Series(sentences_test)
sentences_test = sentences_test.to_numpy()
sentences_test = count_vect_test.transform(sentences_test)
sentences_test = clf_test.predict(sentences_test)
print(sentences_test)

['negative']
