In [2]:
import numpy as np
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch


# Tweets dataset

* Tweets dataset is downloaded from Kaggle through URL: https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?resource=download
* This dataset is for Tesla stock, containing 80,793 tweets from 2021-09-30 to 2022-09-29.
* The tweets are in text format but contain emojis and URLs, which create problems for sentiment generation using a pretrained Hugging Face NLP model.

In [4]:
# Load the Kaggle tweet export; the relative path is resolved against the
# current working directory.
data_tweets = pd.read_csv(filepath_or_buffer='Tweets_from_Kaggle.csv')
# Report the (rows, columns) size, then preview the last five rows.
print(data_tweets.shape)
data_tweets.tail(n=5)

(80793, 5)


Unnamed: 0.1,Unnamed: 0,Date,Tweet,Stock Name,Company Name
80788,80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,TSLA,XPeng Inc.
80789,80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",TSLA,XPeng Inc.
80790,80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,TSLA,XPeng Inc.
80791,80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",TSLA,XPeng Inc.
80792,80792,2021-09-30 10:22:52+00:00,Why can XPeng P5 deliver outstanding performan...,TSLA,XPeng Inc.


In [None]:
# Preview only the first few raw tweets: materialising the entire column as a
# list floods the notebook output with tens of thousands of strings.
data_tweets['Tweet'].head(10).tolist()

['Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because they make the safest cars &amp; EVERYONE disagreed with me because they heard“they catch on fire &amp; the batteries cost 20k to replace”',
 'Tesla delivery estimates are at around 364k from the analysts. $tsla',
 '3/ Even if I include 63.0M unvested RSUs as of 6/30, additional equity needed for the RSUs is 63.0M x $54.20 = $3.4B. If the deal closed tomorrow at $54.20, Elon would need $2.0B for existing shares plus $3.4B for RSUs, so $5.4B new equity. $twtr $tsla',
 '@RealDanODowd @WholeMarsBlog @Tesla Hahaha why are you still trying to stop Tesla FSD bro! Get your shit together and make something better? Thats how companies work, they competed. Crying big old ass fart clown!',
 '@RealDanODowd @Tesla Stop trying to kill kids, you sad deranged old man',
 '@RealDanODowd @Tesla This is you https://t.co/3Ml1XawSEb',
 'For years @WholeMarsB

# Data Processing

Clean the tweets by removing emoji characters and URLs, preparing them for tokenization by the pretrained NLP model that generates the sentiments.

In [None]:
import re
# Compiled once at definition time instead of on every call.
# Only emoji-related code points are listed. A single catch-all range such as
# U+24C2..U+1F251 would also delete CJK, Hangul, kana, fullwidth forms and
# mathematical-alphanumeric letters, i.e. real tweet text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed alphanumerics/ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation marker)
    "]+"
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; all
    other characters, including non-Latin scripts, are left untouched.
    Surrounding whitespace is not collapsed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the matched emoji characters.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by ``re``).
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_url(text):
    """Strip web links from ``text``.

    A link is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Each link is replaced by the empty
    string; the whitespace around it is kept as-is.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Clean every tweet (emojis first, then URLs) and collect the results in a
# plain Python list.
input_tweets = [
    remove_url(remove_emojis(tweet))
    for tweet in data_tweets['Tweet']
]

# Generate sentiments using pretrained NLP

* The pre-trained model, FinBERT, is downloaded from Hugging Face. Its brief introduction is below:
>FinBERT is a BERT model pre-trained on financial communication text. The purpose is to enhance financial NLP research and practice.
It is trained on total corpora size of 4.9B tokens, and fine-tuned on 10,000 manually annotated (positive, negative, neutral)
sentences from analyst reports. This model achieves superior performance on financial tone analysis task.
* It takes around 6 hours to run the model on the Tweets dataset using Google Colab.
* The result is saved in the same directory as this notebook, preventing rework due to Google Colab disconnection.
* The original idea was to use one more pre-trained Hugging Face model to create another sentiment dataset. However, the lengthy data cleaning and model run time made this impossible within the time constraints.
> model="finiteautomata/bertweet-base-sentiment-analysis"

In [None]:
# FinBERT is a BERT model pre-trained on financial communication text. The purpose is to enhance financial NLP research and practice.
# It is trained on total corpora size of 4.9B tokens, and fine-tuned on 10,000 manually annotated (positive, negative, neutral)
# sentences from analyst reports. This model achieves superior performance on financial tone analysis task.

model_name = "yiyanghkust/finbert-tone"
model_finbert = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer_finbert = AutoTokenizer.from_pretrained(model_name)
classifier_finbert = pipeline(
    "sentiment-analysis",
    model=model_finbert,
    tokenizer=tokenizer_finbert,
    # Use the first GPU when one is available (e.g. on Colab); -1 selects the CPU.
    device=0 if torch.cuda.is_available() else -1,
)
# truncation=True clips inputs that exceed the model's maximum sequence length
# instead of failing partway through the long run.
results_finbert = classifier_finbert(input_tweets, truncation=True)
# The pipeline returns a plain Python list of per-tweet result dicts, which has
# no .to_csv method; wrap it in a DataFrame so the results are actually saved.
pd.DataFrame(results_finbert).to_csv('Finbert_results.csv')
