In [None]:
import re
import string
import unicodedata
from collections import defaultdict

import nltk
import pandas as pd
import vaderSentiment
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Fetch the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably the tokenizer data behind sent_tokenize/word_tokenize — confirm the
# resource name matches what the installed NLTK version expects.
nltk.download('punkt')

Read Data from CSV File:

In [None]:
# Location of the input CSV that load() reads.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run on another
# machine without editing this line.
path = r'C:\Users\JZ\Desktop\Data\JPMC_data_csv1.csv'

In [None]:
def load(csv_path=None):
    """Read the comment data set into a DataFrame.

    Parameters
    ----------
    csv_path : str, path-like or file-like, optional
        Source handed to ``pd.read_csv``. When omitted, the module-level
        ``path`` is used, which keeps existing ``load()`` calls working.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Notes
    -----
    As a side effect this sets the pandas option ``display.max_colwidth`` to
    ``None`` so long text cells are displayed without truncation.
    """
    pd.set_option("display.max_colwidth", None)
    source = path if csv_path is None else csv_path
    return pd.read_csv(source)

In [None]:
def df_style(val):
    """Return the CSS declaration used for every cell, whatever `val` is."""
    css_rule = "font-weight: normal"
    return css_rule

In [None]:
# Read the CSV via load() and bind a copy of the resulting frame to totalInfo,
# the name the following cells work on.
totalInfo = load().copy()

In [None]:
# Display the frame through a Styler that applies df_style to each cell.
# NOTE(review): Styler.applymap is reportedly deprecated in recent pandas in favour of
# Styler.map — confirm against the installed pandas version.
totalInfo.style.applymap(df_style)

In [None]:
# Replace every missing value (NaN) with 0, in all columns.
# NOTE(review): this applies to text columns too, so rows with missing text will hold the
# integer 0 instead of a string — later string handling must cope with that.
totalInfo = totalInfo.fillna(0)

In [None]:
# Drop noisy rows: keep only rows whose compound score is not zero.
# The score is converted to a number before comparing, so 0, 0.0, '0' and '0.0' all count
# as zero; comparing against the string '0' alone misses numeric zeros (including the 0
# that fillna(0) inserts). Values that cannot be parsed become NaN and are kept.
compound_numeric = pd.to_numeric(totalInfo['compound score '], errors='coerce')
filter_noise = compound_numeric != 0
totalInfo = totalInfo.loc[filter_noise]

In [None]:
# Filter special characters
def filter_special_chars(comment):
    """Strip URLs and unwanted non-ASCII characters from a comment.

    Processing steps:
      1. Non-string input (e.g. the 0 that fillna(0) leaves in empty text
         cells) is converted with ``str()`` instead of raising.
      2. ``http:`` and ``https:`` links are removed.
      3. ASCII characters are kept as they are.
      4. Characters with code point >= 110000 (styled letters such as 𝐴𝑀𝐶)
         are folded to their plain ASCII form via NFKC normalisation so they
         can match ordinary ticker symbols; if no ASCII form exists the
         character is kept unchanged.
      5. Every other non-ASCII character is dropped.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(comment, str):
        comment = str(comment)

    without_urls = re.sub(r"https?:\S+", "", comment)  # remove url links from comment

    kept = []
    for char in without_urls:
        code_point = ord(char)
        if code_point <= 127:  # keep all ascii characters
            kept.append(char)
        elif code_point >= 110000:  # for characters like 𝐴𝑀𝐶 BB  𝑁𝑂𝐾
            folded = unicodedata.normalize("NFKC", char)
            # Only use the normalised form when it is plain ASCII.
            if all(ord(piece) <= 127 for piece in folded):
                kept.append(folded)
            else:
                kept.append(char)

    return "".join(kept)

In [None]:
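# Notes - code-point ranges (inclusive) of the styled letters seen in comments:
#   chr(119886) .. chr(119911) = 𝑎 .. 𝑧
#   chr(119860) .. chr(119885) = 𝐴 .. 𝑍
#   chr(119912) .. chr(119937) = 𝑨 .. 𝒁
#
# Regular characters:
#   chr(97) .. chr(122) = a .. z
#   chr(65) .. chr(90)  = A .. Z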

Flag comments that contain an ellipsis (…), which marks the text as truncated:

In [None]:
def remove_sentence(comment):
    """Tell whether a comment is truncated, i.e. contains the '…' character.

    Parameters
    ----------
    comment : str
        Comment text. Any non-string value (such as the 0 left by fillna(0)
        in empty text cells) is treated as not truncated instead of raising.

    Returns
    -------
    int
        1 if '…' occurs in the comment (the comment should be removed),
        -1 otherwise.
    """
    if isinstance(comment, str) and '…' in comment:
        return 1
    return -1
        

In [None]:
# Flag each row: '1' when remove_sentence reports the text as truncated, '-1' otherwise.
totalInfo['remove'] = [
    '1' if remove_sentence(text) == 1 else '-1'
    for text in totalInfo['text']
]

Filter out sentences that are truncated:

In [None]:
# Keep only the rows flagged '-1' in the 'remove' column (comments that are not truncated).
# .copy() makes the result an independent frame, so adding columns to it later neither
# raises pandas' SettingWithCopyWarning nor risks writing into a slice of totalInfo.
filter_truncated = totalInfo['remove'] == '-1'
full_comments_df = totalInfo.loc[filter_truncated].copy()
full_comments_df

Filter out special characters for full comments:

In [None]:
# Build 'new_text': each comment's 'text' passed through filter_special_chars.
full_comments_df['new_text'] = full_comments_df['text'].map(filter_special_chars)

In [None]:
# Keep the columns below, in this order ('new_text' sits right after the original 'text').
ordered_columns = [
    'Date',
    'text',
    'new_text',
    'favorited',
    'retweeted',
    'lang',
    'quoted_status',
    'Stock_Name',
    'sentiment',
    'sentiment_class',
    'compound score ',
    'Stock Price',
    'remove',
]
full_comments_df = full_comments_df[ordered_columns]

In [None]:
# Display full_comments_df for inspection.
full_comments_df

In [None]:
# Show the original 'text' next to the cleaned 'new_text' to inspect what the cleaning changed.
full_comments_df[['text','new_text']]

Modified vaderSentiment Analyzer:

In [None]:
# All values of the 'Stock_Name' column, as an array.
stocks = totalInfo['Stock_Name'].to_numpy()

# Unique stock names used for lookups in get_sentiment.
all_stocks = set(stocks)
# The literal 'Stock_Name' is a column header, not a stock; discard() removes it when it is
# present and, unlike remove(), does not raise KeyError when it is absent.
all_stocks.discard('Stock_Name')

all_stocks

In [None]:
def get_sentiment(comment):
    """Average VADER compound score per stock mentioned in a comment.

    The comment is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``. Every token found in the
    module-level ``all_stocks`` set is credited with the compound score of the
    sentence it appears in, at most once per sentence however often the name
    is repeated there.

    Parameters
    ----------
    comment : str
        Text of one comment.

    Returns
    -------
    dict
        Maps each mentioned stock name to the mean compound score of the
        sentences that mention it, e.g. a stock mentioned in two sentences
        scoring 0.5 and 0.1 maps to 0.3. Empty when no known stock appears.
    """
    # Built once per call; constructing it for every sentence repeats the same setup work.
    analyzer = SentimentIntensityAnalyzer()

    # stock name -> [number of sentences mentioning it, sum of their compound scores]
    totals = {}

    for sentence in nltk.sent_tokenize(comment):
        # dict.fromkeys de-duplicates repeated mentions within the sentence while
        # keeping first-seen order, so each stock is counted once per sentence.
        mentioned = dict.fromkeys(
            word for word in nltk.word_tokenize(sentence) if word in all_stocks
        )
        if not mentioned:
            continue  # no known stock in this sentence, nothing to credit

        compound = analyzer.polarity_scores(sentence)['compound']
        for stock in mentioned:
            tally = totals.setdefault(stock, [0, 0.0])
            tally[0] += 1
            tally[1] += compound

    # Average compound score for each stock.
    return {
        stock: score_sum / count
        for stock, (count, score_sum) in totals.items()
    }

In [None]:
"""### Sentiment Analyzer"""

def company_scores(sentence):
    """Return the dictionary get_sentiment builds for `sentence` (company name -> compound score)."""
    return get_sentiment(sentence)


def apply_vadersentiment(df):
    """Add a 'modified' column holding company_scores(...) of each row's 'new_text'.

    The column is written onto `df` itself, and that same frame is returned.
    """
    per_row_scores = df['new_text'].apply(company_scores)
    df['modified'] = per_row_scores
    return df

Compute the per-stock sentiment scores for every full comment:

In [None]:
# Add the 'modified' column to full_comments_df and display the returned frame.
apply_vadersentiment(full_comments_df)