In [1]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [32]:
from datasets import load_dataset

# Fetch the stock-market tweets corpus through the Hugging Face `datasets` loader
dataset = load_dataset(path="mjw/stock_market_tweets")

# The loader returns a mapping keyed by split name; print it to see splits and columns
print(dataset)

# Materialise the 'train' split as a pandas DataFrame for the rest of the notebook
tweets_df = dataset['train'].to_pandas()


Found cached dataset csv (/home/hossein/.cache/huggingface/datasets/mjw___csv/mjw--stock_market_tweets-996d08df63e7be72/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'tweet_id', 'writer', 'post_date', 'body', 'comment_num', 'retweet_num', 'like_num', 'ticker_symbol'],
        num_rows: 1700641
    })
})


In [33]:
# Persist the full, unfiltered tweet table (row index omitted)
tweets_df.to_csv(path_or_buf='stock_market_tweets.csv', index=False)

In [34]:
# Plain-text preview of the first five rows
print(tweets_df.head(n=5))

   Unnamed: 0            tweet_id           writer   post_date  \
0           0  550441672312512512      KeralaGuy77  2015-01-01   
1           1  550452877466935296     TheTrendIsUp  2015-01-01   
2           2  550456665607122944       t_nathan95  2015-01-01   
3           3  550459042787651584  petergo99037185  2015-01-01   
4           4  550461555423584257       t_nathan95  2015-01-01   

                                                body  comment_num  \
0  Insanity of today weirdo massive selling. $aap...            0   
1  My biggest winner in 2014: Inverse Volatility ...            1   
2  Had a down day of -.66%. Worst performer was $...            0   
3  YR %,  /-, $TSLA  47.85%, $FB  42.77%, $TWTR -...            0   
4  Prediction: $TWTR $GRPN $YELP are acquired as ...            0   

   retweet_num  like_num ticker_symbol  
0            0         0          AAPL  
1            0         0          AAPL  
2            0         0          AAPL  
3            0         0

In [35]:
# Tickers whose tweets are kept for the sentiment study
companies_of_interest = ['AMZN', 'GOOG', 'MSFT']

# Keep only the rows whose ticker_symbol is one of those tickers (boolean mask through .loc)
filtered_tweets_df = tweets_df.loc[tweets_df['ticker_symbol'].isin(companies_of_interest)]

# Plain-text preview of the first five matching rows
print(filtered_tweets_df.head(n=5))

    Unnamed: 0            tweet_id           writer   post_date  \
4            4  550461555423584257       t_nathan95  2015-01-01   
7            7  550463776437174272   The_Dumb_Money  2015-01-01   
11          11  550466945061908482   The_Dumb_Money  2015-01-01   
12          12  550467140688838656    andrewlabutka  2015-01-01   
13          13  550468859879505921  UnderGradStocks  2015-01-01   

                                                 body  comment_num  \
4   Prediction: $TWTR $GRPN $YELP are acquired as ...            0   
7   $goog is roughly fairly valued, not a buy or s...            0   
11  IMHO, $MSFT shockingly remains MILDLY undervalued            1   
12  @A_TRON3000 An idiot could run $AMZN at a prof...            0   
13  @CNNMoney I like $MSFT with him in charge. He ...            0   

    retweet_num  like_num ticker_symbol  
4             0         1          GOOG  
7             0         0          GOOG  
11            0         0          MSFT  
12      

In [36]:
# Persist the ticker-filtered tweets. The frame written here must be
# `filtered_tweets_df`; writing `tweets_df` would store the full, unfiltered
# table under the "filtered" file name.
filtered_tweets_df.to_csv('filtered_stock_market_tweets.csv', index=False)

In [37]:
# Plain-text preview of the first 35 filtered rows
print(filtered_tweets_df.head(n=35))

     Unnamed: 0            tweet_id           writer   post_date  \
4             4  550461555423584257       t_nathan95  2015-01-01   
7             7  550463776437174272   The_Dumb_Money  2015-01-01   
11           11  550466945061908482   The_Dumb_Money  2015-01-01   
12           12  550467140688838656    andrewlabutka  2015-01-01   
13           13  550468859879505921  UnderGradStocks  2015-01-01   
16           16  550471417754845184       SentiQuant  2015-01-01   
23           23  550494675937665026      bullriders1  2015-01-01   
28           28  550513952820432897   TradeInTheZone  2015-01-01   
31           31  550521222492995584           m3pols  2015-01-01   
45           45  550622668156710912    XenophonBoone  2015-01-01   
48           48  550652616670470144       SentiQuant  2015-01-01   
51           51  550667881390698496      garynielson  2015-01-01   
61           61  550683136913588224          CapCube  2015-01-01   
62           63  550694050169237504      lillian

In [38]:
# Work on an independent deep copy so that adding columns later does not
# raise pandas' SettingWithCopyWarning on the filtered slice
filtered_tweets_df_1 = filtered_tweets_df.copy(deep=True)


In [39]:
import pandas as pd
import re

# Precondition: `filtered_tweets_df_1` must already exist in the kernel; the
# statements in this cell add to / overwrite its columns in place.

# Parse `post_date` with pandas so later group-bys operate on datetime values
# (the original note says this is meant to be harmless if it is already datetime).
filtered_tweets_df_1['post_date'] = pd.to_datetime(filtered_tweets_df_1['post_date'])

# Cleaning the tweets text
def clean_tweet_text(text):
    """Strip URLs, @mentions and every non-letter character from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet body. Any non-string value (for example ``None`` or the
        float ``NaN`` pandas uses for a missing cell) is treated as an empty
        tweet instead of letting ``re.sub`` raise ``TypeError``.

    Returns
    -------
    str
        The text reduced to ASCII letters and whitespace, with leading and
        trailing whitespace removed. Interior whitespace is left as is, so
        removed tokens can leave runs of consecutive spaces.
    """
    if not isinstance(text, str):
        # Missing bodies must not abort a whole-column .apply()
        return ''

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove user mentions
    text = re.sub(r'#', '', text)  # Remove the hashtag symbol but keep the text
    # Drop everything that is not an ASCII letter or whitespace ($, digits, punctuation, emoji, ...)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

# Derive a cleaned copy of every tweet body in a new `clean_body` column
filtered_tweets_df_1['clean_body'] = filtered_tweets_df_1['body'].apply(clean_tweet_text)

# Number of tweets for every (post_date, ticker_symbol) pair
daily_sentiment = (
    filtered_tweets_df_1
    .groupby(['post_date', 'ticker_symbol'])
    .size()
    .reset_index(name='tweet_count')
)


In [40]:
# Show the first 35 rows, now including the clean_body column
filtered_tweets_df_1.head(n=35)

Unnamed: 0.1,Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num,ticker_symbol,clean_body
4,4,550461555423584257,t_nathan95,2015-01-01,Prediction: $TWTR $GRPN $YELP are acquired as ...,0,0,1,GOOG,Prediction TWTR GRPN YELP are acquired as big ...
7,7,550463776437174272,The_Dumb_Money,2015-01-01,"$goog is roughly fairly valued, not a buy or s...",0,0,0,GOOG,goog is roughly fairly valued not a buy or sell
11,11,550466945061908482,The_Dumb_Money,2015-01-01,"IMHO, $MSFT shockingly remains MILDLY undervalued",1,0,0,MSFT,IMHO MSFT shockingly remains MILDLY undervalued
12,12,550467140688838656,andrewlabutka,2015-01-01,@A_TRON3000 An idiot could run $AMZN at a prof...,0,0,0,AMZN,An idiot could run AMZN at a profit but only B...
13,13,550468859879505921,UnderGradStocks,2015-01-01,@CNNMoney I like $MSFT with him in charge. He ...,0,0,0,MSFT,I like MSFT with him in charge He seems to enc...
16,16,550471417754845184,SentiQuant,2015-01-01,#SENTISHIFTUP $X $T $GOOGL $AMRN $UPIP $CNAT $...,0,0,0,GOOG,SENTISHIFTUP X T GOOGL AMRN UPIP CNAT GDX DAX ...
23,23,550494675937665026,bullriders1,2015-01-01,Watch Out $AMZN Fresh!... Instacart's $2B home...,0,0,1,AMZN,Watch Out AMZN Fresh Instacarts B home deliver...
28,28,550513952820432897,TradeInTheZone,2015-01-01,Loved that $Googl ad,0,0,0,GOOG,Loved that Googl ad
31,31,550521222492995584,m3pols,2015-01-01,Bizarre Global Times editorial re $GOOG email ...,0,0,0,GOOG,Bizarre Global Times editorial re GOOG email b...
45,45,550622668156710912,XenophonBoone,2015-01-01,MY BOOK $SNE short Close $20.47 52wk High $22....,0,0,1,MSFT,MY BOOK SNE short Close wk High Serious dama...


In [41]:
# Show the first five per-day, per-ticker tweet counts
daily_sentiment.head(n=5)

Unnamed: 0,post_date,ticker_symbol,tweet_count
0,2015-01-01,AMZN,14
1,2015-01-01,GOOG,6
2,2015-01-01,MSFT,8
3,2015-01-02,AMZN,56
4,2015-01-02,GOOG,54


In [28]:
!pip install torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint used for scoring; the classifier and its tokenizer are loaded
# from the same identifier
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to perform sentiment analysis
def sentiment_analysis(texts, model, tokenizer):
    """Predict one class index per input text.

    Parameters
    ----------
    texts : str or iterable of str
        Text(s) to classify. Any non-string iterable (list, tuple, pandas
        Series, generator) is materialised into a list first.
    model : callable
        Called as ``model(**encoded_input)``; the result must expose a
        ``.logits`` tensor whose last axis enumerates the classes.
    tokenizer : callable
        Called as ``tokenizer(texts, padding=True, truncation=True,
        return_tensors='pt')``; must return a mapping of tensors.

    Returns
    -------
    numpy.ndarray
        Integer class indices (argmax over the last logits axis), one per
        text. An empty iterable yields an empty int64 array and neither the
        tokenizer nor the model is invoked.
    """
    if not isinstance(texts, str):
        texts = list(texts)
        if not texts:
            # Nothing to score; skip tokenizer and model entirely
            return torch.empty(0, dtype=torch.long).numpy()

    # Tokenize the texts (padded to the longest item, truncated to the model limit)
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # If the model reports a device, feed it inputs that live on that device
    device = getattr(model, 'device', None)
    if device is not None:
        encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Softmax is monotonic, so the argmax of the raw logits selects the same
    # class; .cpu() makes the NumPy conversion valid for accelerator tensors
    return outputs.logits.argmax(dim=-1).cpu().numpy()

# Smoke test: classify two hand-written sentences and print the class indices
example_texts = [
    "I love this stock, it's going up!",
    "This company is going bankrupt.",
]
sentiments = sentiment_analysis(texts=example_texts, model=model, tokenizer=tokenizer)
print(sentiments)


[2 0]


In [43]:
# Define the sentiment mapping function
def map_sentiment(prediction_index):
    """Translate a predicted class index into a signed sentiment score.

    Index 0 maps to -1, index 1 to 0 and index 2 to 1; any other value
    falls back to 0.
    """
    # NOTE(review): assumes the checkpoint orders its classes as
    # negative / neutral / positive - confirm against the model's label config.
    score_by_class = dict(zip((0, 1, 2), (-1, 0, 1)))
    try:
        return score_by_class[prediction_index]
    except KeyError:
        # Unknown class index: score as neutral
        return 0

# Score every cleaned tweet with the model and map the class indices to scores.
# The tweets are sent through the model in mini-batches: one forward pass per
# tweet (batch size 1) is needlessly slow on a frame of this size.
SENTIMENT_BATCH_SIZE = 64  # tweets per forward pass; lower it if memory is tight

clean_texts = filtered_tweets_df_1['clean_body'].tolist()
predicted_classes = []
for batch_start in range(0, len(clean_texts), SENTIMENT_BATCH_SIZE):
    batch_texts = clean_texts[batch_start:batch_start + SENTIMENT_BATCH_SIZE]
    predicted_classes.extend(sentiment_analysis(batch_texts, model, tokenizer))

# Assigning a plain list is positional, so the frame's (non-contiguous) index is irrelevant here
filtered_tweets_df_1['sentiment_index'] = [map_sentiment(class_index) for class_index in predicted_classes]

# Average the per-tweet scores for every (post_date, ticker_symbol) pair
daily_sentiment_index = (
    filtered_tweets_df_1
    .groupby(['post_date', 'ticker_symbol'])['sentiment_index']
    .mean()
    .reset_index()
)

# Plain-text preview of the first five rows of the daily index
print(daily_sentiment_index.head(n=5))


   post_date ticker_symbol  sentiment_index
0 2015-01-01          AMZN        -0.285714
1 2015-01-01          GOOG         0.000000
2 2015-01-01          MSFT         0.000000
3 2015-01-02          AMZN         0.035714
4 2015-01-02          GOOG         0.000000


In [44]:
# Persist the per-day, per-ticker mean sentiment scores (row index omitted)
daily_sentiment_index.to_csv(path_or_buf='daily_sentiment_index.csv', index=False)

In [45]:
# Persist the tweet-level table, including clean_body and sentiment_index (row index omitted)
filtered_tweets_df_1.to_csv(path_or_buf='filtered_tweets_df_1_sentiment_result.csv', index=False)

In [46]:
# Plain-text preview of the first 35 rows of the daily index
print(daily_sentiment_index.head(n=35))

    post_date ticker_symbol  sentiment_index
0  2015-01-01          AMZN        -0.285714
1  2015-01-01          GOOG         0.000000
2  2015-01-01          MSFT         0.000000
3  2015-01-02          AMZN         0.035714
4  2015-01-02          GOOG         0.000000
5  2015-01-02          MSFT        -0.095238
6  2015-01-03          AMZN         0.285714
7  2015-01-03          GOOG         0.000000
8  2015-01-03          MSFT        -0.142857
9  2015-01-04          AMZN         0.285714
10 2015-01-04          GOOG         0.266667
11 2015-01-04          MSFT         0.000000
12 2015-01-05          AMZN         0.019231
13 2015-01-05          GOOG        -0.042553
14 2015-01-05          MSFT         0.157895
15 2015-01-06          AMZN         0.192308
16 2015-01-06          GOOG         0.042857
17 2015-01-06          MSFT         0.333333
18 2015-01-07          AMZN         0.200000
19 2015-01-07          GOOG         0.000000
20 2015-01-07          MSFT        -0.090909
21 2015-01

In [47]:
# Show the first 35 tweets together with their sentiment_index score
filtered_tweets_df_1.head(n=35)

Unnamed: 0.1,Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num,ticker_symbol,clean_body,sentiment_index
4,4,550461555423584257,t_nathan95,2015-01-01,Prediction: $TWTR $GRPN $YELP are acquired as ...,0,0,1,GOOG,Prediction TWTR GRPN YELP are acquired as big ...,1
7,7,550463776437174272,The_Dumb_Money,2015-01-01,"$goog is roughly fairly valued, not a buy or s...",0,0,0,GOOG,goog is roughly fairly valued not a buy or sell,0
11,11,550466945061908482,The_Dumb_Money,2015-01-01,"IMHO, $MSFT shockingly remains MILDLY undervalued",1,0,0,MSFT,IMHO MSFT shockingly remains MILDLY undervalued,-1
12,12,550467140688838656,andrewlabutka,2015-01-01,@A_TRON3000 An idiot could run $AMZN at a prof...,0,0,0,AMZN,An idiot could run AMZN at a profit but only B...,0
13,13,550468859879505921,UnderGradStocks,2015-01-01,@CNNMoney I like $MSFT with him in charge. He ...,0,0,0,MSFT,I like MSFT with him in charge He seems to enc...,1
16,16,550471417754845184,SentiQuant,2015-01-01,#SENTISHIFTUP $X $T $GOOGL $AMRN $UPIP $CNAT $...,0,0,0,GOOG,SENTISHIFTUP X T GOOGL AMRN UPIP CNAT GDX DAX ...,0
23,23,550494675937665026,bullriders1,2015-01-01,Watch Out $AMZN Fresh!... Instacart's $2B home...,0,0,1,AMZN,Watch Out AMZN Fresh Instacarts B home deliver...,0
28,28,550513952820432897,TradeInTheZone,2015-01-01,Loved that $Googl ad,0,0,0,GOOG,Loved that Googl ad,0
31,31,550521222492995584,m3pols,2015-01-01,Bizarre Global Times editorial re $GOOG email ...,0,0,0,GOOG,Bizarre Global Times editorial re GOOG email b...,-1
45,45,550622668156710912,XenophonBoone,2015-01-01,MY BOOK $SNE short Close $20.47 52wk High $22....,0,0,1,MSFT,MY BOOK SNE short Close wk High Serious dama...,-1
