# Tesla stock price prediction using Elon Musk Tweets

### Implementing the same algorithms as the paper (https://ieeexplore.ieee.org/abstract/document/9566242)

Load packages that are to be used

In [287]:
import numpy as np
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, f1_score
import pandas as pd
import re

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jaych\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Load data from csv

In [288]:
df = pd.read_csv("results.csv")

Preprocess data by removing puntuation, whitespace, URL's, and symbols

In [289]:
for i, row in df.iterrows():
    # Convert from string to array by using quatation regex
    tweetsArray = re.findall("'([^']*)'", row["Tweets"])

    preprocessedTweets = []
    for tweet in tweetsArray:
        # Remove all URL's
        replaced = re.sub(r'http\S+', '', tweet)

        # Remove all punctuation
        replaced = re.sub(r'[^\w\s]', '', replaced)

        # Only use tweet if it is not an empty space
        if replaced != '':
            preprocessedTweets.append(replaced)
    df.at[i, "Tweets"] = preprocessedTweets

Create "Total Likes" column by summing all likes for each row

In [290]:
totalLikes = []
for likes in df["Likes"]:
    likes = likes.replace("[", '').replace("]", '').replace(" ", '')
    split = likes.split(",")
    likes = list(map(int, split))
    totalLikes.append(sum(likes))

# Create new column in dataframe with total likes of tweets for that week
df["Total Likes"] = totalLikes

Create "Total Retweets" column by summing all retweets for each row

In [291]:
totalRetweets = []
for retweets in df["Retweets"]:
    retweets = retweets.replace("[", '').replace("]", '').replace(" ", '')
    split = retweets.split(",")
    retweets = list(map(int, split))
    totalRetweets.append(sum(retweets))

# Create new column in dataframe with total retweets of tweets for that week
df["Total Retweets"] = totalRetweets

Create "Label" column by using the price change. The label will be "1" if the price value rose or stayed the same, and the label will be "-1" if the price value decreased

In [292]:
df["Label"] = np.where(df["Dif"] >= 0, 1, -1)

print(df["Label"].head())

0   -1
1   -1
2    1
3   -1
4    1
Name: Label, dtype: int32


Create VADER SentimentIntensityAnalyzer

In [293]:
sia = SentimentIntensityAnalyzer()

positive_words='buy bull long support undervalued underpriced cheap upward rising trend moon rocket hold breakout call beat support buying holding high profit'
negative_words='sell bear bubble bearish short overvalued overbought overpriced expensive downward falling sold sell low put miss resistance squeeze cover seller '

dictOfpos = { i : 4 for i in positive_words.split(" ") }
dictOfneg = { i : -4 for i in negative_words.split(" ")  }
Financial_Lexicon = {**dictOfpos, **dictOfneg}

sia.lexicon.update(Financial_Lexicon)

Run sentiment analysis on each tweet and create new "Scores" column in dataframe

In [294]:
scores = []
averageScores = []
for tweets in df["Tweets"]:

    compoundScoresCombined = []
    for tweet in tweets:
        polarity_score = sia.polarity_scores(tweet)

        # Ignore the polarity score if it is 0 (neutral)
        if polarity_score["compound"] != 0:
            compoundScoresCombined.append(polarity_score["compound"])

    scores.append(compoundScoresCombined)

    # Get the average compound score, set score to 0 if there are no tweets
    if compoundScoresCombined:
        averageScores.append(statistics.fmean(compoundScoresCombined))
    else:
        averageScores.append(0)

# Create new column in dataframe with array of scores for each tweet
df["Scores"] = scores

# Create new column in dataframe with average score for each row
df["Average Score"] = averageScores

print("Scores:\n", df["Scores"].head())
print("\nAverage scores:\n", df["Average Score"].head())

Scores:
 0                                     [0.5719, 0.5106]
1                                   [-0.2023, -0.3182]
2    [-0.659, 0.7096, 0.7184, 0.891, -0.1027, -0.7264]
3                                            [-0.0516]
4                                     [-0.8555, 0.743]
Name: Scores, dtype: object

Average scores:
 0    0.541250
1   -0.260250
2    0.138483
3   -0.051600
4   -0.056250
Name: Average Score, dtype: float64


Split data into 80/20 training/test split

In [295]:
X = df[["Total Likes", "Total Retweets", "Average Score"]]
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

Run on SVC model

In [296]:
clf = SVC()
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

score = accuracy_score(y_test, y_predict)
print("Accuracy is:", score)

fscore = f1_score(y_test, y_predict)
print("F-score is:", fscore)

fpr, tpr, thresholds = roc_curve(y_test, y_predict)
auc = auc(fpr, tpr)
print("Area Under Curve:", auc)

classificationReport = classification_report(y_test, y_predict, target_names=['Price Drop', 'Price Rise'])
print("\nClassification Report:\n", classificationReport)

Accuracy is: 0.5246636771300448
F-score is: 0.6787878787878787
Area Under Curve: 0.48731972127694057

Classification Report:
               precision    recall  f1-score   support

  Price Drop       0.36      0.05      0.09       102
  Price Rise       0.54      0.93      0.68       121

    accuracy                           0.52       223
   macro avg       0.45      0.49      0.38       223
weighted avg       0.45      0.52      0.41       223

