Applicable to any earnings transcript or news/press release saved as a ".txt" file
- this example uses a recent press release from the SOFI website and the company's most recent 10-K
- returns a sentiment score for a given text file, plus the document tone

In [7]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import lxml
import re
from collections import Counter
import nltk
import string
import matplotlib.pyplot as plt
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

Press Release Text Analysis

In [8]:
# Read the whole press release into memory as one UTF-8 string
with open("Sofi News.txt", mode="r", encoding="utf-8") as news_file:
    sofinews = news_file.read()

In [9]:
# Tokens to discard: NLTK's English stopwords plus every ASCII punctuation character
stopwords = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)

lemma = nltk.WordNetLemmatizer()

# Lower-case and tokenize the press release, keep only purely alphabetic
# tokens that are not stopwords, then reduce each token to its lemma
sofiwords = nltk.tokenize.word_tokenize(sofinews.lower())
sofi_nswords = [token for token in sofiwords if token.isalpha() and token not in stopwords]
sofi_lemmawords = list(map(lemma.lemmatize, sofi_nswords))

sofi_lemmawords

['san',
 'business',
 'wire',
 'technology',
 'nasdaq',
 'sofi',
 'shop',
 'digital',
 'financial',
 'service',
 'today',
 'announced',
 'launch',
 'sofi',
 'crypto',
 'becoming',
 'first',
 'nationally',
 'chartered',
 'bank',
 'consumer',
 'bank',
 'borrow',
 'invest',
 'buy',
 'sell',
 'hold',
 'crypto',
 'one',
 'place',
 'platform',
 'trust',
 'sofi',
 'crypto',
 'member',
 'able',
 'buy',
 'sell',
 'hold',
 'dozen',
 'cryptocurrencies',
 'including',
 'bitcoin',
 'btc',
 'ethereum',
 'eth',
 'solana',
 'sol',
 'platform',
 'built',
 'experienced',
 'crypto',
 'investor',
 'user',
 'sofi',
 'crypto',
 'phased',
 'rollout',
 'start',
 'today',
 'become',
 'available',
 'member',
 'coming',
 'week',
 'today',
 'mark',
 'pivotal',
 'moment',
 'banking',
 'meet',
 'crypto',
 'one',
 'app',
 'trusted',
 'platform',
 'driven',
 'core',
 'mission',
 'help',
 'member',
 'get',
 'money',
 'right',
 'said',
 'anthony',
 'noto',
 'ceo',
 'sofi',
 'believe',
 'blockchain',
 'technology',
 'fu

In [10]:
# Frequency table of the lemmatized press-release tokens
sofifreq = nltk.FreqDist(sofi_lemmawords)

# Every (lemma, count) pair, ordered from most to least frequent
sofifreq.most_common(n=None)

[('sofi', 35),
 ('crypto', 29),
 ('bank', 13),
 ('member', 13),
 ('statement', 12),
 ('financial', 11),
 ('money', 10),
 ('may', 10),
 ('investor', 9),
 ('security', 9),
 ('digital', 8),
 ('new', 8),
 ('future', 8),
 ('asset', 8),
 ('information', 8),
 ('product', 8),
 ('website', 8),
 ('service', 7),
 ('platform', 7),
 ('buy', 6),
 ('one', 6),
 ('account', 6),
 ('channel', 6),
 ('ability', 6),
 ('technology', 5),
 ('launch', 5),
 ('including', 5),
 ('get', 5),
 ('blockchain', 5),
 ('risk', 5),
 ('value', 5),
 ('factor', 5),
 ('business', 4),
 ('invest', 4),
 ('sell', 4),
 ('hold', 4),
 ('app', 4),
 ('change', 4),
 ('people', 4),
 ('regulated', 4),
 ('fdic', 4),
 ('insured', 4),
 ('manage', 4),
 ('demand', 4),
 ('exchange', 4),
 ('cryptocurrency', 4),
 ('result', 4),
 ('fact', 4),
 ('filing', 4),
 ('regarding', 4),
 ('expectation', 4),
 ('today', 3),
 ('consumer', 3),
 ('borrow', 3),
 ('built', 3),
 ('meet', 3),
 ('help', 3),
 ('way', 3),
 ('faster', 3),
 ('better', 3),
 ('operation', 

In [11]:
# Sentiment analyzer; the same instance is reused for the per-sentence scores further down
sentiment = SentimentIntensityAnalyzer()

# Split the original-case text into sentences for the sentence-level analysis
sofi_sentences = nltk.tokenize.sent_tokenize(sofinews)

# Polarity of the whole press release scored as a single string
# (keys: 'neg', 'neu', 'pos', 'compound')
sofiscore = sentiment.polarity_scores(sofinews)
sofiscore

{'neg': 0.026, 'neu': 0.873, 'pos': 0.101, 'compound': 0.9983}

In [12]:
# Per-sentence sentiment:
#   netpos    - positive share minus negative share of each sentence
#   compscore - compound score of each sentence
netpos = []
compscore = []
for s in sofi_sentences:
    # Score each sentence once and reuse the result instead of re-running
    # the analyzer for every key that is read from it
    scores = sentiment.polarity_scores(s)
    netpos.append(scores['pos'] - scores['neg'])
    compscore.append(scores['compound'])

# Average net-positive share across all sentences
np.array(netpos).mean()

np.float64(0.076)

In [13]:
# Average compound score across all sentences of the press release
np.mean(compscore)

np.float64(0.27435)

SOFI 10-K Text Analysis

In [20]:
# Load the 10-K filing and lower-case it; bytes that are not valid UTF-8 are
# silently dropped because of errors="ignore"
with open("SOFI 2024 10k.txt", encoding="utf-8", errors="ignore") as file:
    sofitext = file.read().lower()

# Strip the HTML markup. The parser is named explicitly so the extracted text
# does not depend on which parsers happen to be installed and bs4 does not emit
# GuessedAtParserWarning; lxml is already imported at the top of the notebook.
souptext = BeautifulSoup(sofitext, "lxml").text

# Same cleaning steps as for the press release: tokenize, keep alphabetic
# non-stopword tokens, lemmatize. NOTE: this rebinds sofiwords, sofi_nswords and
# sofi_lemmawords, so from this cell on they describe the 10-K, not the press release.
sofiwords = nltk.tokenize.word_tokenize(souptext)
sofi_nswords = [word for word in sofiwords if word not in stopwords and word.isalpha()]
sofi_lemmawords = [lemma.lemmatize(word) for word in sofi_nswords]

In [21]:
# Load the positive and negative word lists, lower-cased to match the
# lower-cased document tokens. The encoding is stated explicitly, like the other
# file reads in this notebook, so the result does not depend on the platform's
# default locale encoding.
with open('Positive.txt', encoding="utf-8") as file:
    Positive = file.read().lower()
with open('Negative.txt', encoding="utf-8") as file:
    Negative = file.read().lower()

# One token per listed word
positive_bag = nltk.tokenize.word_tokenize(Positive)
negative_bag = nltk.tokenize.word_tokenize(Negative)

In [25]:
# Build sets once so each membership test is a hash lookup instead of a scan
# through the whole word list for every token of the document
positive_vocab = set(positive_bag)
negative_vocab = set(negative_bag)

# Every occurrence of a positive word in the document (duplicates kept, document order)
sofi_poswords = [word for word in sofi_lemmawords if word in positive_vocab]
# Every occurrence of a negative word in the document (duplicates kept, document order)
sofi_negwords = [word for word in sofi_lemmawords if word in negative_vocab]
# NOTE(review): document tokens are lemmatized but the word lists are not, so
# inflected list entries may never match - confirm this is intended.

In [23]:
# Document tone: share of positive tokens minus share of negative tokens,
# both measured against the total number of cleaned tokens
total_tokens = len(sofi_lemmawords)
pctpos = len(sofi_poswords) / total_tokens
pctneg = len(sofi_negwords) / total_tokens

tone = pctpos - pctneg
tone

-0.02622996162312293