# **Sentiment Analysis on News/Tweets related to Stock AAPL**

# 1. Import Dependancies

In [2]:
import pandas as pd 
import string 

# Importing Natural Language Processing toolkit 
import nltk

# Downloading the NLTK english stop words
nltk.download('stopwords')

# Downloading the NLTK sentence tokenizer
nltk.download('punkt')

# Downloading the NLTK POS Tagger
nltk.download('averaged_perceptron_tagger')

# Downloading the NLTK Vader Lexicon
nltk.download('vader_lexicon')

# Importing the NLTK english stop words 
from nltk.corpus import stopwords

# Importing frequency distribution from NLTK
from nltk.probability import FreqDist

# Importing VADER dictionary. It is a rule-based sentiment analyzer
from nltk.sentiment import SentimentIntensityAnalyzer

# Importing data visualization modules 
from wordcloud import WordCloud
import plotly.express as px 
import matplotlib.pyplot as plt

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>
[nltk_data] Error loading vader_lexicon: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


# 2. Load Dataset

In [3]:
data_news = pd.read_csv("./data/AAPL_news.csv")

In [4]:
data_news.head()

Unnamed: 0,Publisher,Complete_Title,Abstract,Website,Time,Stock,Exact_Time
0,The Motley Fool,"Apple Stock: Buy, Sell, or Hold After Soaring ...",The tech giant is now worth nearly $2.3 trilli...,https://www.fool.com/investing/2021/01/01/appl...,1/1/2021,AAPL,10:05:00
1,Business Insider,Apple Watch SE favorite purchase of 2020: fitn...,I bought the new Apple Watch SE this fall. ?? ...,https://www.businessinsider.com/apple-watch-se...,1/1/2021,AAPL,8:12:00
2,Forbes,Apple Fitness Plus: 7 Reasons It Should Be You...,Specifically an iPhone and an Apple Watch. You...,https://www.forbes.com/sites/davidphelan/2021/...,1/2/2021,AAPL,7:00:00
3,Forbes,"Huawei Fallout?????Google, Samsung And Apple F...","Put simply, the downside risks from America bl...",https://www.forbes.com/sites/zakdoffman/2021/0...,1/2/2021,AAPL,18:50:00
4,The Motley Fool,Bold Prediction: General Motors Is the Next Ap...,Apple has had an incredible turnaround over th...,https://www.fool.com/investing/2021/01/02/bold...,1/2/2021,AAPL,11:54:00


In [5]:
data_news.shape

(2139, 7)

In [6]:
data_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Publisher       2139 non-null   object
 1   Complete_Title  2139 non-null   object
 2   Abstract        2139 non-null   object
 3   Website         2139 non-null   object
 4   Time            2139 non-null   object
 5   Stock           2139 non-null   object
 6   Exact_Time      2139 non-null   object
dtypes: object(7)
memory usage: 117.1+ KB


In [7]:
data_tweets = pd.read_csv("./data/AAPL_twitter.csv")

In [8]:
data_tweets.head()

Unnamed: 0,date,time,tweet,retweets_count,likes_count,LEN
0,1/1/2021,23:41:18,@peregreine Price target for Apple for Jan/feb...,0,0,3
1,1/1/2021,23:11:12,"$AAPL looks good if it can clear 133 for 135, ...",0,4,1
2,1/1/2021,23:09:40,@tim_cook I ordered an Apple MacBook Air two w...,0,0,1
3,1/1/2021,22:55:00,Apple cost short sellers $7 billion this yr. $...,2,54,2
4,1/1/2021,22:37:39,Will $Aapl hit $250 by 2022,0,0,1


In [9]:
data_tweets.shape

(114653, 6)

In [10]:
data_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114653 entries, 0 to 114652
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   date            114653 non-null  object
 1   time            114653 non-null  object
 2   tweet           114653 non-null  object
 3   retweets_count  114653 non-null  int64 
 4   likes_count     114653 non-null  int64 
 5   LEN             114653 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 5.2+ MB


# 3. Explore Dataset

#### News

In [11]:
data_news.Complete_Title[:5]

0    Apple Stock: Buy, Sell, or Hold After Soaring ...
1    Apple Watch SE favorite purchase of 2020: fitn...
2    Apple Fitness Plus: 7 Reasons It Should Be You...
3    Huawei Fallout?????Google, Samsung And Apple F...
4    Bold Prediction: General Motors Is the Next Ap...
Name: Complete_Title, dtype: object

In [12]:
data_news.Abstract[0]

'The tech giant is now worth nearly $2.3 trillion. 2020 was an astounding year for tech investors. The tech-heavy Nasdaq Composite jumped 43% over this timeframe...'

In [13]:
data_news.Stock.unique()

array(['AAPL'], dtype=object)

In [14]:
data_news['Time']  = pd.to_datetime(data_news['Time'])

In [15]:
data_news

Unnamed: 0,Publisher,Complete_Title,Abstract,Website,Time,Stock,Exact_Time
0,The Motley Fool,"Apple Stock: Buy, Sell, or Hold After Soaring ...",The tech giant is now worth nearly $2.3 trilli...,https://www.fool.com/investing/2021/01/01/appl...,2021-01-01,AAPL,10:05:00
1,Business Insider,Apple Watch SE favorite purchase of 2020: fitn...,I bought the new Apple Watch SE this fall. ?? ...,https://www.businessinsider.com/apple-watch-se...,2021-01-01,AAPL,8:12:00
2,Forbes,Apple Fitness Plus: 7 Reasons It Should Be You...,Specifically an iPhone and an Apple Watch. You...,https://www.forbes.com/sites/davidphelan/2021/...,2021-01-02,AAPL,7:00:00
3,Forbes,"Huawei Fallout?????Google, Samsung And Apple F...","Put simply, the downside risks from America bl...",https://www.forbes.com/sites/zakdoffman/2021/0...,2021-01-02,AAPL,18:50:00
4,The Motley Fool,Bold Prediction: General Motors Is the Next Ap...,Apple has had an incredible turnaround over th...,https://www.fool.com/investing/2021/01/02/bold...,2021-01-02,AAPL,11:54:00
...,...,...,...,...,...,...,...
2134,Bloomberg.com,"Predictions for 2022: Trump, Blockchain, Tom B...",There will be more melting of the Greenland ic...,https://www.bloomberg.com/opinion/articles/202...,2021-12-29,AAPL,9:00:00
2135,CNBC,Apple's new fix-it policy is a drop in the buc...,It was an exciting day at iFixit in November 2...,https://www.cnbc.com/2021/12/30/apples-new-fix...,2021-12-30,AAPL,12:01:00
2136,CNBC,Apple's repair policy is a good but small step...,CNBC first spoke with iFixit CEO Kyle Wiens be...,https://www.cnbc.com/video/2021/12/30/apples-r...,2021-12-30,AAPL,12:00:00
2137,Forbes,Apple Reveals Dazzling Future For Apple Watch,"A future Apple Watch, it's been rumored, will ...",https://www.forbes.com/sites/davidphelan/2021/...,2021-12-30,AAPL,14:00:00


In [16]:
# Find date range
min_date_nw = data_news['Time'].min()
max_date_nw = data_news['Time'].max()

In [17]:
# Print date range
print(f"Date range: {min_date_nw} to {max_date_nw}")

Date range: 2021-01-01 00:00:00 to 2021-12-30 00:00:00


#### Tweets

In [18]:
data_tweets.tweet[0]

'@peregreine Price target for Apple for Jan/feb? Trying to decide between $aapl and $amzn when I trim my $tsla gains'

In [19]:
data_tweets['date']  = pd.to_datetime(data_tweets['date'])

In [20]:
# Find date range
min_date_tw = data_tweets['date'].min()
max_date_tw = data_tweets['date'].max()

In [21]:
# Print date range
print(f"Date range: {min_date_tw} to {max_date_tw}")

Date range: 2021-01-01 00:00:00 to 2021-12-30 00:00:00


# 4. Data Pre-Processing

### 4.1 Prepare for Pre-Processing

In [22]:
dn_copy = data_news.copy()

In [23]:
dt_copy = data_tweets.copy()

In [24]:
# Rename the columns
dn_copy.rename(columns={'Time': 'date', 'Abstract': 'data'}, inplace=True)

In [25]:
# Rename the columns
dt_copy.rename(columns={'tweet': 'data'}, inplace=True)

### 4.2 Filter Required Columns

In [26]:
dn_copy = dn_copy[['date', 'data']]

In [27]:
dn_copy.head(3)

Unnamed: 0,date,data
0,2021-01-01,The tech giant is now worth nearly $2.3 trilli...
1,2021-01-01,I bought the new Apple Watch SE this fall. ?? ...
2,2021-01-02,Specifically an iPhone and an Apple Watch. You...


In [28]:
dt_copy = dt_copy[['date', 'data']]

In [29]:
dt_copy.head(3)

Unnamed: 0,date,data
0,2021-01-01,@peregreine Price target for Apple for Jan/feb...
1,2021-01-01,"$AAPL looks good if it can clear 133 for 135, ..."
2,2021-01-01,@tim_cook I ordered an Apple MacBook Air two w...


### 4.2 Lowercasing

In [30]:
def convert_column_to_lowercase(df, column_name):
    """
    Convert the specified column of a DataFrame to lowercase.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to be modified.
    column_name (str): The name of the column to convert to lowercase.
    
    Returns:
    pd.DataFrame: The DataFrame with the specified column converted to lowercase.
    """
    df[column_name] = df[column_name].str.lower()
    return df

In [31]:
dn_copy = convert_column_to_lowercase(dn_copy, 'data')

In [32]:
dt_copy = convert_column_to_lowercase(dt_copy, 'data')

In [33]:
dn_copy.head(3)

Unnamed: 0,date,data
0,2021-01-01,the tech giant is now worth nearly $2.3 trilli...
1,2021-01-01,i bought the new apple watch se this fall. ?? ...
2,2021-01-02,specifically an iphone and an apple watch. you...


In [34]:
dt_copy.head(3)

Unnamed: 0,date,data
0,2021-01-01,@peregreine price target for apple for jan/feb...
1,2021-01-01,"$aapl looks good if it can clear 133 for 135, ..."
2,2021-01-01,@tim_cook i ordered an apple macbook air two w...


### 4.4 Removing puctuations

In [35]:
def remove_punctuation(df, column_name):
    """
    Remove punctuation from the specified column of a DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the column with punctuation.
    column_name (str): The name of the column from which punctuation will be removed.
    
    Returns:
    pd.DataFrame: The DataFrame with punctuation removed from the specified column.
    """
    df[column_name] = df[column_name].str.translate(str.maketrans('', '', string.punctuation))
    return df

In [36]:
dn_copy = remove_punctuation(dn_copy, 'data')

In [37]:
dt_copy = remove_punctuation(dt_copy, 'data')

In [38]:
dn_copy.to_csv('./outputs/data_news.csv', index=False)

In [39]:
dt_copy.to_csv('./outputs/data_tweets.csv', index=False)

## APPROACH 1 - Using VADER on Aggrigated Dataset

### 4.5 Merge Datasets

In [40]:
merged_data = pd.concat([dn_copy, dt_copy], ignore_index=True)

In [41]:
merged_data.shape

(116792, 2)

In [42]:
merged_data.head()

Unnamed: 0,date,data
0,2021-01-01,the tech giant is now worth nearly 23 trillion...
1,2021-01-01,i bought the new apple watch se this fall the...
2,2021-01-02,specifically an iphone and an apple watch you ...
3,2021-01-02,put simply the downside risks from america bla...
4,2021-01-02,apple has had an incredible turnaround over th...


In [43]:
merged_data.tail()

Unnamed: 0,date,data
116787,2021-12-30,bigsollu investorinstonk lots of unanswered qu...
116788,2021-12-30,goodiegood tsla dip into earnings then rip aap...
116789,2021-12-30,bulltrendz damn i wish i saw this earlier mi...
116790,2021-12-30,aapl apple to release special beats studio bud...
116791,2021-12-30,aapl way oversold


In [44]:
# Find date range
min_date_md = merged_data['date'].min()
max_date_md = merged_data['date'].max()

In [45]:
# Print date range
print(f"Date range: {min_date_tw} to {max_date_tw}")

Date range: 2021-01-01 00:00:00 to 2021-12-30 00:00:00


### 4.6. Aggregate Data

In [46]:
# Group by 'date' and aggregate 'data' values into a single string separated by spaces
grouped_data = merged_data.groupby('date')['data'].apply(lambda x: ' '.join(x)).reset_index()


In [47]:
grouped_data.data[0]

'the tech giant is now worth nearly 23 trillion 2020 was an astounding year for tech investors the techheavy nasdaq composite jumped 43 over this timeframe i bought the new apple watch se this fall  the smart watch was my most useful purchase of 2020  the fitness tools customization options and audio control peregreine price target for apple for janfeb trying to decide between aapl and amzn when i trim my tsla gains aapl looks good if it can clear 133 for 135 137 timcook i ordered an apple macbook air two weeks ago and still hasnt arrived q4 sales must be amazing aapl apple cost short sellers 7 billion this yr aapl tesla bears lost in excess of 38 billion  tsla  lessons   never bring ur hatred for someone to the stockmarket elonmodi or trump  always have an exit plan irrespective of who u are druckenmiler eisman or einhorn investing will aapl hit 250 by 2022 postmarket aapl long rvvtfm aapl m1 chip is a game changer apple car news in 2021 wagersolution hipstertrader realdonaldtrump aap

### 4.7 Tokenizing

In [48]:
def tokenize_column(df, column_name):
    """
    Tokenize words in the specified column of a DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to be tokenized.
    column_name (str): The name of the column to tokenize.
    
    Returns:
    pd.DataFrame: The DataFrame with the tokenized words saved in a new column.
    """
    # Tokenize words in the specified column
    df[column_name + '_tokenized'] = df[column_name].apply(nltk.word_tokenize)
    return df

In [49]:
grouped_data = tokenize_column(grouped_data, 'data')

In [50]:
grouped_data.head(3)

Unnamed: 0,date,data,data_tokenized
0,2021-01-01,the tech giant is now worth nearly 23 trillion...,"[the, tech, giant, is, now, worth, nearly, 23,..."
1,2021-01-02,specifically an iphone and an apple watch you ...,"[specifically, an, iphone, and, an, apple, wat..."
2,2021-01-03,according to apple data linked to you means th...,"[according, to, apple, data, linked, to, you, ..."


### 4.8 Remove Stopwords

In [51]:
en_stopwords = set(nltk.corpus.stopwords.words('english'))

In [52]:
def remove_stopwords(tokens):
    """
    Remove stop words from a list of tokens.
    
    Parameters:
    tokens (list): List of tokens.
    
    Returns:
    list: List of tokens with stop words removed.
    """
    return [t for t in tokens if t not in en_stopwords]

In [53]:
def remove_stopwords_column(df, column_name):
    """
    Remove stop words from the specified column of a DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the column with tokenized words.
    column_name (str): The name of the column containing tokenized words.
    
    Returns:
    pd.DataFrame: The DataFrame with stop words removed and saved in a new column.
    """
    # Remove stop words from the specified column
    df['cleaned_tokens'] = df[column_name].apply(remove_stopwords)
    return df

In [54]:
grouped_data = remove_stopwords_column(grouped_data, 'data_tokenized')

In [55]:
grouped_data.head(3)

Unnamed: 0,date,data,data_tokenized,cleaned_tokens
0,2021-01-01,the tech giant is now worth nearly 23 trillion...,"[the, tech, giant, is, now, worth, nearly, 23,...","[tech, giant, worth, nearly, 23, trillion, 202..."
1,2021-01-02,specifically an iphone and an apple watch you ...,"[specifically, an, iphone, and, an, apple, wat...","[specifically, iphone, apple, watch, additiona..."
2,2021-01-03,according to apple data linked to you means th...,"[according, to, apple, data, linked, to, you, ...","[according, apple, data, linked, means, data, ..."


In [56]:
grouped_data.cleaned_tokens[0][:20]

['tech',
 'giant',
 'worth',
 'nearly',
 '23',
 'trillion',
 '2020',
 'astounding',
 'year',
 'tech',
 'investors',
 'techheavy',
 'nasdaq',
 'composite',
 'jumped',
 '43',
 'timeframe',
 'bought',
 'new',
 'apple']

### 4.9 Stringify Cleaned Data

In [57]:
def clean_token_data(df, column_name):
    df['cleaned_data'] = df[column_name].apply(lambda x: ' '.join(x))
    return df

In [58]:
grouped_data = clean_token_data(grouped_data, 'cleaned_tokens')

In [59]:
grouped_data.head(3)

Unnamed: 0,date,data,data_tokenized,cleaned_tokens,cleaned_data
0,2021-01-01,the tech giant is now worth nearly 23 trillion...,"[the, tech, giant, is, now, worth, nearly, 23,...","[tech, giant, worth, nearly, 23, trillion, 202...",tech giant worth nearly 23 trillion 2020 astou...
1,2021-01-02,specifically an iphone and an apple watch you ...,"[specifically, an, iphone, and, an, apple, wat...","[specifically, iphone, apple, watch, additiona...",specifically iphone apple watch additionally u...
2,2021-01-03,according to apple data linked to you means th...,"[according, to, apple, data, linked, to, you, ...","[according, apple, data, linked, means, data, ...",according apple data linked means data collect...


# 5. Sentiment Analysis - Using VADER

In [60]:
# Initilizing the Sentiment Analyzer
sent = SentimentIntensityAnalyzer()

In [61]:
def polarity_score(data):
    # Extracting the sentiment polarity scores of a review
    scores = sent.polarity_scores(data)
    
    # Getting the compound score
    compound = scores['compound']

    return compound

In [62]:
grouped_data['sentiment'] = grouped_data.cleaned_data.apply(polarity_score)

In [63]:
grouped_data.head()

Unnamed: 0,date,data,data_tokenized,cleaned_tokens,cleaned_data,sentiment
0,2021-01-01,the tech giant is now worth nearly 23 trillion...,"[the, tech, giant, is, now, worth, nearly, 23,...","[tech, giant, worth, nearly, 23, trillion, 202...",tech giant worth nearly 23 trillion 2020 astou...,0.9993
1,2021-01-02,specifically an iphone and an apple watch you ...,"[specifically, an, iphone, and, an, apple, wat...","[specifically, iphone, apple, watch, additiona...",specifically iphone apple watch additionally u...,0.9975
2,2021-01-03,according to apple data linked to you means th...,"[according, to, apple, data, linked, to, you, ...","[according, apple, data, linked, means, data, ...",according apple data linked means data collect...,0.9993
3,2021-01-04,for apple theres also a basic standalone stock...,"[for, apple, theres, also, a, basic, standalon...","[apple, theres, also, basic, standalone, stock...",apple theres also basic standalone stock marke...,0.9996
4,2021-01-05,apple ceo tim cook delivers a keynote during t...,"[apple, ceo, tim, cook, delivers, a, keynote, ...","[apple, ceo, tim, cook, delivers, keynote, eur...",apple ceo tim cook delivers keynote european u...,0.9999


In [64]:
# polarity score range
minr = grouped_data['sentiment'].min()
maxr = grouped_data['sentiment'].max()

In [65]:
# Print date range
print(f"Date range: {minr} to {maxr}")

Date range: 0.9906 to 1.0


In [66]:
df_out = grouped_data[['date','sentiment']]

In [67]:
df_out.head()

Unnamed: 0,date,sentiment
0,2021-01-01,0.9993
1,2021-01-02,0.9975
2,2021-01-03,0.9993
3,2021-01-04,0.9996
4,2021-01-05,0.9999


In [68]:
def sentiment_to_label(value):
    return 1 if value > 0.05 else 0

In [69]:
df_out['trend'] = df_out['sentiment'].apply(sentiment_to_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['trend'] = df_out['sentiment'].apply(sentiment_to_label)


In [70]:
df_out.head()

Unnamed: 0,date,sentiment,trend
0,2021-01-01,0.9993,1
1,2021-01-02,0.9975,1
2,2021-01-03,0.9993,1
3,2021-01-04,0.9996,1
4,2021-01-05,0.9999,1


In [71]:
df_out.to_csv('./outputs/out_vader.csv', index=False)

## APPROACH 2 - Using FinBERT and VADER on Seperate Datasets

In [72]:
df_news = pd.read_csv('./outputs/data_news.csv')

In [73]:
df_news.head(3)

Unnamed: 0,date,data
0,2021-01-01,the tech giant is now worth nearly 23 trillion...
1,2021-01-01,i bought the new apple watch se this fall the...
2,2021-01-02,specifically an iphone and an apple watch you ...


In [74]:
df_tweets = pd.read_csv('./outputs/data_tweets.csv')

In [75]:
df_tweets.head(3)

Unnamed: 0,date,data
0,2021-01-01,peregreine price target for apple for janfeb t...
1,2021-01-01,aapl looks good if it can clear 133 for 135 137
2,2021-01-01,timcook i ordered an apple macbook air two wee...



### 4.7 Tokenizing

In [83]:
df_news = tokenize_column(df_news, 'data')

In [84]:
df_tweets = tokenize_column(df_tweets, 'data')

### 4.8 Removing Stopwords

In [85]:
df_news = remove_stopwords_column(df_news, 'data_tokenized')

In [86]:
df_tweets = remove_stopwords_column(df_tweets, 'data_tokenized')

### 4.9 Stringify Cleaned Tokens

In [87]:
df_news = clean_token_data(df_news, 'cleaned_tokens')

In [88]:
df_tweets = clean_token_data(df_tweets, 'cleaned_tokens')

In [90]:
df_news.head(3)

Unnamed: 0,date,data,data_tokenized,cleaned_tokens,cleaned_data
0,2021-01-01,the tech giant is now worth nearly 23 trillion...,"[the, tech, giant, is, now, worth, nearly, 23,...","[tech, giant, worth, nearly, 23, trillion, 202...",tech giant worth nearly 23 trillion 2020 astou...
1,2021-01-01,i bought the new apple watch se this fall the...,"[i, bought, the, new, apple, watch, se, this, ...","[bought, new, apple, watch, se, fall, smart, w...",bought new apple watch se fall smart watch use...
2,2021-01-02,specifically an iphone and an apple watch you ...,"[specifically, an, iphone, and, an, apple, wat...","[specifically, iphone, apple, watch, additiona...",specifically iphone apple watch additionally u...


In [91]:
df_tweets.head(3)

Unnamed: 0,date,data,data_tokenized,cleaned_tokens,cleaned_data
0,2021-01-01,peregreine price target for apple for janfeb t...,"[peregreine, price, target, for, apple, for, j...","[peregreine, price, target, apple, janfeb, try...",peregreine price target apple janfeb trying de...
1,2021-01-01,aapl looks good if it can clear 133 for 135 137,"[aapl, looks, good, if, it, can, clear, 133, f...","[aapl, looks, good, clear, 133, 135, 137]",aapl looks good clear 133 135 137
2,2021-01-01,timcook i ordered an apple macbook air two wee...,"[timcook, i, ordered, an, apple, macbook, air,...","[timcook, ordered, apple, macbook, air, two, w...",timcook ordered apple macbook air two weeks ag...


# 5. Sentiment Analysis - FinBERT and VADER

In [76]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [81]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [82]:
# Tokenize the text and convert it to tensor format
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    scores = outputs.logits.softmax(dim=1)
    sentiment_score = scores[0].detach().numpy()
    # Return sentiment score for positive class (index 1)
    return sentiment_score[2] - sentiment_score[0]

### 5.1 News - Using FinBERT

In [93]:
# Apply the model to each text
df_news['sentiment'] = df_news['cleaned_data'].apply(predict_sentiment)

In [94]:
out_news = df_news[['date','sentiment']]

In [95]:
# Apply the function to create a new column
out_news['trend'] = out_news['sentiment'].apply(sentiment_to_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_news['trend'] = out_news['sentiment'].apply(sentiment_to_label)


In [96]:
out_news.head()

Unnamed: 0,date,sentiment,trend
0,2021-01-01,-0.997053,0
1,2021-01-01,-0.998911,0
2,2021-01-02,-0.781366,0
3,2021-01-02,0.999997,1
4,2021-01-02,-0.001322,0


In [98]:
out_news['trend'].value_counts()

trend
0    1817
1     322
Name: count, dtype: int64

In [105]:
out_news.to_csv('./outputs/out_news_finbert.csv', index=False)

### 5.2 Tweets - Using VADER

In [99]:
df_tweets['sentiment'] = df_tweets.cleaned_data.apply(polarity_score)

In [100]:
out_tweets = df_tweets[['date','sentiment']]

In [101]:
# Apply the function to create a new column
out_tweets['trend'] = out_tweets['sentiment'].apply(sentiment_to_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_tweets['trend'] = out_tweets['sentiment'].apply(sentiment_to_label)


In [102]:
out_tweets.head()

Unnamed: 0,date,sentiment,trend
0,2021-01-01,0.34,1
1,2021-01-01,0.6705,1
2,2021-01-01,0.5859,1
3,2021-01-01,0.2658,1
4,2021-01-01,0.0,0


In [104]:
out_tweets['trend'].value_counts()

trend
0    67923
1    46730
Name: count, dtype: int64

In [106]:
out_tweets.to_csv('./outputs/out_tweets_vader.csv', index=False)

# 6. Aggregate Outputs

In [107]:
merged_out = pd.concat([out_news, out_tweets], ignore_index=True)

In [108]:
merged_out.head()

Unnamed: 0,date,sentiment,trend
0,2021-01-01,-0.997053,0
1,2021-01-01,-0.998911,0
2,2021-01-02,-0.781366,0
3,2021-01-02,0.999997,1
4,2021-01-02,-0.001322,0


In [110]:
# Group by 'Date' and count the occurrences of each value in 'trend'
grouped_out = merged_out.groupby('date')['trend'].value_counts().unstack(fill_value=0)

In [111]:
# Create 'overall_trend' column based on the count comparison
grouped_out['overall_trend'] = (grouped_out[1] > grouped_out[0]).astype(int)

In [112]:
grouped_out = grouped_out.reset_index().sort_values(by='date')

In [113]:
grouped_out.head()

trend,date,0,1,overall_trend
0,2021-01-01,31,29,0
1,2021-01-02,39,32,0
2,2021-01-03,47,46,0
3,2021-01-04,165,103,0
4,2021-01-05,137,95,0


In [114]:
grouped_out['overall_trend'].value_counts()

overall_trend
0    318
1     46
Name: count, dtype: int64

In [115]:
grouped_out = grouped_out[['date','overall_trend']]

In [117]:
grouped_out.to_csv('./outputs/out_finbert_vader.csv', index=False)

### Based on the analysis, Approach 2 has been selected. The reason for this choice is that Approach 2 offers a more balanced distribution of sentiments.