In [3]:
import pandas as pd
import flair


In [5]:
model = flair.models.TextClassifier.load('en-sentiment')

2023-02-27 10:54:35,653 loading file /Users/saramaras/.flair/models/sentiment-en-mix-distillbert_4.pt


In [11]:
def get_sentiment(text):
    # tokenize input text
    sentence = flair.data.Sentence(text)
    # make sentiment prediction
    model.predict(sentence)
    # extract sentiment direction and confidence (label and score) object
    sentiment = sentence.labels[0]
    return sentiment

In [8]:
df = pd.read_csv('reddit_investing_ner.csv', sep = '|')
df

Unnamed: 0,name,created_utc,subreddit,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_119u28i,1.677146e+09,investing,Have a general question? Want to offer some c...,0.75,4.0,0.0,4.0,['wiki']
1,t3_119u7zq,1.677147e+09,investing,"Hi everyone, I just created my Roth IRA accoun...",0.89,132.0,0.0,132.0,[]
2,t3_1191h6s,1.677077e+09,investing,https://www.barrons.com/articles/intel-cuts-di...,0.96,1325.0,0.0,1325.0,"['AMD', 'FF', 'Intel', 'gt;Intel']"
3,t3_119ibx7,1.677110e+09,investing,a) Results were in line with revised expectati...,0.89,184.0,0.0,184.0,"['AI', 'Nvidia', 'here.](https://www.reddit.com']"
4,t3_119yxek,1.677162e+09,investing,"I don’t know about you, but I’m sick and tired...",0.62,9.0,0.0,9.0,[]
...,...,...,...,...,...,...,...,...,...
495,t3_1118j6p,1.676296e+09,investing,"Hello,\n\nEvery month I put an equal amount in...",0.52,1.0,0.0,1.0,"['VanEck', 'IWDA', 'Vanguard', 'iShares', 'All..."
496,t3_10zzpsp,1.676154e+09,investing,I went with just the first quarter because IMH...,0.89,783.0,0.0,783.0,[]
497,t3_1114xqh,1.676283e+09,investing,"Hi everyone, I have a quick question. \n\nIf s...",0.56,1.0,0.0,1.0,[]
498,t3_111c2a9,1.676303e+09,investing,You've probably all see then the line chart sh...,0.24,0.0,0.0,0.0,[]


In [12]:
# get sentiment
df['sentiment'] = df['selftext'].apply(get_sentiment)

In [13]:
df.head()

Unnamed: 0,name,created_utc,subreddit,selftext,upvote_ratio,ups,downs,score,organizations,sentiment
0,t3_119u28i,1677146000.0,investing,Have a general question? Want to offer some c...,0.75,4.0,0.0,4.0,['wiki'],"Sentence: ""Have a general question ? Want to o..."
1,t3_119u7zq,1677147000.0,investing,"Hi everyone, I just created my Roth IRA accoun...",0.89,132.0,0.0,132.0,[],"Sentence: ""Hi everyone , I just created my Rot..."
2,t3_1191h6s,1677077000.0,investing,https://www.barrons.com/articles/intel-cuts-di...,0.96,1325.0,0.0,1325.0,"['AMD', 'FF', 'Intel', 'gt;Intel']","Sentence: ""https :// www.barrons.com / article..."
3,t3_119ibx7,1677110000.0,investing,a) Results were in line with revised expectati...,0.89,184.0,0.0,184.0,"['AI', 'Nvidia', 'here.](https://www.reddit.com']","Sentence: ""a ) Results were in line with revis..."
4,t3_119yxek,1677162000.0,investing,"I don’t know about you, but I’m sick and tired...",0.62,9.0,0.0,9.0,[],"Sentence: ""I do n’t know about you , but I ’m ..."


In [14]:
import ast
#
df['organizations'] = df['organizations'].apply(lambda x: ast.literal_eval(x))


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           500 non-null    object 
 1   created_utc    500 non-null    float64
 2   subreddit      500 non-null    object 
 3   selftext       500 non-null    object 
 4   upvote_ratio   500 non-null    float64
 5   ups            500 non-null    float64
 6   downs          500 non-null    float64
 7   score          500 non-null    float64
 8   organizations  500 non-null    object 
 9   sentiment      500 non-null    object 
dtypes: float64(5), object(5)
memory usage: 39.2+ KB


In [15]:
# initialize sentiment dictionary
sentiment = {}

# loop through dataframe and extract org labels and sentiment scores into sentiment dictionary
for i, row in df.iterrows():
    # extract sentiment direction and score
    direction = row['sentiment'].value
    score = row['sentiment'].score
    # loop through each label in organizations column
    for org in row['organizations']:
        # check if org label exists in sentiment dictionary already
        if org not in sentiment.keys():
            # if it doesn't, initialize new entry in dictionary
            sentiment[org] = {'POSITIVE': [], 'NEGATIVE': []}
        # append positive/negative score to respective dictionary entry
        sentiment[org][direction].append(score)

In [24]:
sentiment['APY']

{'POSITIVE': 0.0, 'NEGATIVE': 4.999622702598572}

In [17]:

# initialize sentiment list
avg_sentiment = []

# loop through each organization
for org in sentiment.keys():
    # get number of positive and negative ratings
    freq = len(sentiment[org]['POSITIVE']) + len(sentiment[org]['NEGATIVE'])
    for direction in ['POSITIVE', 'NEGATIVE']:
        # assign to variable for cleaner code
        score = sentiment[org][direction]
        # if there are no entries, set to 0
        if len(score) == 0:
            sentiment[org][direction] = 0.0
        else:
            # otherwise calculate total
            sentiment[org][direction] = sum(score)
    # now calculate total amount
    total = sentiment[org]['POSITIVE'] - sentiment[org]['NEGATIVE']
    # and the average score
    avg = total/freq
    # add to sentiment list
    avg_sentiment.append({
        'entity': org,
        'positive': sentiment[org]['POSITIVE'],
        'negative': sentiment[org]['NEGATIVE'],
        'frequency': freq,
        'score': avg
    })

In [18]:
sentiment_df = pd.DataFrame(avg_sentiment)
sentiment_df.head()

Unnamed: 0,entity,positive,negative,frequency,score
0,wiki,0.0,28.960026,29,-0.998622
1,AMD,0.0,0.995498,1,-0.995498
2,FF,0.0,0.995498,1,-0.995498
3,Intel,0.0,0.995498,1,-0.995498
4,gt;Intel,0.0,0.995498,1,-0.995498


In [19]:
sentiment_df = sentiment_df[sentiment_df['frequency'] > 3]
sentiment_df

Unnamed: 0,entity,positive,negative,frequency,score
0,wiki,0.000000,28.960026,29,-0.998622
5,AI,7.483507,5.717908,14,0.126114
6,Nvidia,3.510059,0.736639,5,0.554684
8,Vanguard,2.018482,13.625739,19,-0.610908
17,Walmart,0.000000,3.974959,5,-0.794992
...,...,...,...,...,...
220,Bard,0.000000,3.999737,4,-0.999934
221,VanEck,2.018482,0.000000,4,0.504621
222,IWDA,2.018482,0.000000,4,0.504621
223,iShares,2.018482,0.000000,4,0.504621


In [21]:
sentiment_df.sort_values('score', ascending=False).head(20)

Unnamed: 0,entity,positive,negative,frequency,score
192,nvidia-microsoft,3.510059,0.0,4,0.877515
202,Tesla,3.510059,0.0,4,0.877515
201,AI /,3.510059,0.0,4,0.877515
190,IBM,3.510059,0.0,4,0.877515
200,8B,3.510059,0.0,4,0.877515
198,"Baidu, Tencent",3.510059,0.0,4,0.877515
196,Google Bard,3.510059,0.0,4,0.877515
191,Bing Chat,3.510059,0.0,4,0.877515
189,Jefferies,3.510059,0.0,4,0.877515
195,Lincoln,3.510059,0.0,4,0.877515
