In [2]:
import numpy as np
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch

Mounted at /content/drive


# Data Processing

### Step 1: read sentiment generation result saved under the same directory

In [115]:
# read sentiment data from
data_read = pd.read_csv('Finbert_results.csv', index_col=0)

### Step 2: data processing
1. Convert string lable to numberic.
2. Change data type of Date field from string to date type.
3. Aggeregate sentiment by date, and categorize into -1, 0, and 1.


In [117]:
data_read['label'] = data_read['label'].map({'Positive':1., "Neutral":0., 'Negative':-1.})
data_read['Date'] = pd.to_datetime(data_tweets['Date'], format='ISO8601').dt.date
data_sentiment = data_read.groupby('Date')[['label']].sum()
data_sentiment['Sentiment'] = data_sentiment['label'].clip(lower=-1, upper=1)  # equal and greater than 1, equals 1 (positive); less and equal to -1, equals -1 (negative)
data_sentiment = data_sentiment.drop(columns=['label'])
data_sentiment_shifted = data_sentiment.shift(periods=-1)

### Step 3: install yfinance

In [79]:
# install yfinance to download "TSLA" stock data from Yahoo Finance.
!pip install yfinance



### Step 4: download stock data

In [118]:
# Download "TSLA" stock data from Yahoo Finance.
import yfinance as yf
stock_data = yf.download(tickers='TSLA', start=data_sentiment.index.min(),
                                         end=data_sentiment.index.max(),
                                         interval='1d')

[*********************100%***********************]  1 of 1 completed


# Using Categorical Features

## Create categorical features

In [119]:
# create categorical features
stock_move = stock_data.pct_change().map(lambda x: 1. if x >= 0.001 else -1. if x <= -0.001 else 0.)   # price change is flat if the |change| <= 0.1%
stock_move = stock_move[1:]    # the 1st record is N/A after applying .pct_change()

# join the two data to obtain sentiment data matching stock data by date
dataset = stock_move.join(data_sentiment_shifted)
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-01,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0
2021-10-04,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2021-10-05,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
2021-10-06,-1.0,-1.0,-1.0,1.0,1.0,-1.0,0.0
2021-10-07,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Analyze relationship between sentiment and stock price movement.

In [120]:
# build confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(dataset['Close'], dataset['Sentiment'])
accuracy = accuracy_score(dataset['Close'], dataset['Sentiment'])
print("Confusion Matrix:", cm, sep="\n")
print("accuracy:", accuracy, sep="\n")

Confusion Matrix:
[[  0 115   0]
 [  0   4   0]
 [  0 131   0]]
accuracy:
0.016


Conclusion:
1. Relationship between sentiment and stock price change is almost obvious.
2. As number of price-up (=131) and price-down (=115) are pretty much equally distributed, the dataset does not suffering from class imbalance issues.
3. The minority class price-flat (=3) is very minor, and is not our focus, so it is ignorable in this exercise, as the cost to address this class imbalance is much higher than its benefit in time-series forecast.
4. the accuracy 0.524 before any model training will be our benchmark for model efficiency evaluation.

## Feature Engineering

1. Preliminary (full) features includes: Open, High, Low, Close, Adj-Close, Volume, and Sentiment.
2. Target: Close (shift by one-day)
3. There is no NA value and outliers (all featurs categorized with value -1, 0, 1), additonal data cleaning or optimizing is not needed.
4. We use RandomForestClassification to estimate features importance.

### Step 1: creating training datasets

In [121]:
# form X, y
stock_move_shifted = stock_move.shift(periods=1)[1:]
X_prelim = stock_move_shifted.join(data_sentiment_shifted)
y = stock_move['Close'][1:]
print(len(X_prelim), len(y))
X_prelim.head()

249 249


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-04,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0
2021-10-05,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2021-10-06,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
2021-10-07,-1.0,-1.0,-1.0,1.0,1.0,-1.0,0.0
2021-10-08,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [122]:
# split dataset for training and test
# full features = [Open, High, Low, Close, Adj-Close, Volume, Sentiment]
from sklearn.model_selection import train_test_split
X_prelim_train, X_prelim_test, y_train, y_test = train_test_split(X_prelim, y, test_size=0.2, random_state=42)

###Step 2: feature importance analysis, using RandomForestClassifier

In [123]:
# Feature importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# model training
rnd_clf = RandomForestClassifier(n_estimators=200, max_leaf_nodes=2, n_jobs=-1, random_state=42)
rnd_clf.fit(X_prelim_train, y_train)
y_pred = rnd_clf.predict(X_prelim_test)
cm = confusion_matrix(y_test, y_pred)
print("accuracy:", rnd_clf.score(X_prelim_test, y_test), sep="\n")
print("Confusion Matrix:", cm, sep="\n")

# feature importance analysis
print("\n", '-------------------feature importance-------------------')
for score, name in sorted(zip(rnd_clf.feature_importances_, X_prelim_train.columns)):
    print(round(score, 2), name)


accuracy:
0.54
Confusion Matrix:
[[ 0  0 22]
 [ 0  0  1]
 [ 0  0 27]]

 -------------------feature importance-------------------
0.0 Sentiment
0.12 Adj Close
0.14 Close
0.16 Volume
0.16 Low
0.21 Open
0.22 High


In [124]:
# coefficient correlation analysis
print(round(np.corrcoef(X_prelim['Adj Close'], y)[0,1],4),'Adj Close', sep=" ")
print(round(np.corrcoef(X_prelim['Close'], y)[0,1],4),'Close', sep=" ")
print(round(np.corrcoef(X_prelim['Volume'], y)[0,1],4),'Volume', sep=" ")
print(round(np.corrcoef(X_prelim['Low'], y)[0,1],4),'Low', sep=" ")
print(round(np.corrcoef(X_prelim['Sentiment'], y)[0,1],4),'Sentiment', sep=" ")
print(round(np.corrcoef(X_prelim['High'], y)[0,1],4),'High', sep=" ")
print(round(np.corrcoef(X_prelim['Open'], y)[0,1],4),'Open', sep=" ")

0.0533 Adj Close
0.0533 Close
0.017 Volume
0.021 Low
nan Sentiment
-0.0172 High
-0.0022 Open


  c /= stddev[:, None]
  c /= stddev[None, :]


In [125]:
# features = ['High', 'Open', 'Sentiment']
feature_selection = ['High', 'Open', 'Sentiment']
X_selected = X_prelim[feature_selection]
X_selected_train, X_selected_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

rnd_clf = RandomForestClassifier(n_estimators=200, max_leaf_nodes=2, n_jobs=-1, random_state=42)
rnd_clf.fit(X_selected_train, y_train)
y_pred = rnd_clf.predict(X_selected_test)
cm = confusion_matrix(y_test, y_pred)
print("accuracy:", rnd_clf.score(X_selected_test, y_test), sep="\n")
print("Confusion Matrix:", cm, sep="\n")

accuracy:
0.54
Confusion Matrix:
[[ 0  0 22]
 [ 0  0  1]
 [ 0  0 27]]


In [126]:
# features = ['Sentiment']
X_selected = X_prelim[['Sentiment']]
X_selected_train, X_selected_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=2, n_jobs=-1, random_state=42)
rnd_clf.fit(X_selected_train, y_train)
y_pred = rnd_clf.predict(X_selected_test)
cm = confusion_matrix(y_test, y_pred)
print("accuracy:", rnd_clf.score(X_selected_test, y_test), sep="\n")
print("Confusion Matrix:", cm, sep="\n")

accuracy:
0.54
Confusion Matrix:
[[ 0  0 22]
 [ 0  0  1]
 [ 0  0 27]]


### Conclusion

1. Sentiment is an important features.
2. The prediction power of Sentiment to the 1-day ahead price movement is not significant (almost random).
3. RandomForestClassifier improves Sentiment prediction accuracy by around 2% (from 52% without modeling to 54%)
4. We will apply the most important three features `['Close', 'Adj Close', 'Sentiment']` for subsequent modeling.
5. The predictive power of all features and subset of features are the same.

# Using Numeric features

## Create numeric features

In [127]:
# form X_, y_
y_ = y
stock_data_ = stock_data[1:]
stock_data_shifted_ = stock_data_.shift(periods=1)[1:]
X_prelim_ = stock_data_shifted_.join(data_sentiment_shifted)
print(len(X_prelim_), len(y_))
X_prelim_.tail()

249 249


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-09-22,308.290009,313.799988,300.630005,300.799988,300.799988,62555700.0,0.0
2022-09-23,299.859985,301.290009,285.820007,288.589996,288.589996,70545400.0,0.0
2022-09-26,283.089996,284.5,272.820007,275.329987,275.329987,63748400.0,0.0
2022-09-27,271.829987,284.089996,270.309998,276.01001,276.01001,58076900.0,0.0
2022-09-28,283.839996,288.670013,277.51001,282.940002,282.940002,61925200.0,0.0


## Feature Engineering

### Step 1: creating training datasets

In [128]:
# split dataset for training and test
from sklearn.model_selection import train_test_split
feature_selection = ['Open', 'High','Low', 'Close', 'Adj Close', 'Volume', 'Sentiment']
X_selected_ = X_prelim_[feature_selection]
X_selected_train_, X_selected_test_, y_train_, y_test_ = train_test_split(X_selected_, y, test_size=0.2, random_state=42)

### Step 2: feature importance analysis, using RandomForestClassifier

In [129]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# model training
rnd_clf = RandomForestClassifier(n_estimators=200, max_leaf_nodes=2, n_jobs=-1, random_state=42)
rnd_clf.fit(X_selected_train_, y_train_)
y_pred_ = rnd_clf.predict(X_selected_test_)
cm = confusion_matrix(y_test_, y_pred_)
print("accuracy:", rnd_clf.score(X_selected_test_, y_test_), sep="\n")
print("Confusion Matrix:", cm, sep="\n")

# feature importance analysis
print("\n", '-------------------feature importance-------------------')
for score, name in sorted(zip(rnd_clf.feature_importances_, X_selected_train_.columns)):
    print(round(score, 2), name)


accuracy:
0.6
Confusion Matrix:
[[10  0 12]
 [ 0  0  1]
 [ 7  0 20]]

 -------------------feature importance-------------------
0.0 Sentiment
0.11 Volume
0.13 Adj Close
0.17 Low
0.18 High
0.19 Close
0.21 Open


In [130]:
# split dataset for training and test
from sklearn.model_selection import train_test_split
feature_selection = ['Open', 'Low', 'Close']
X_selected_ = X_prelim_[feature_selection]
X_selected_train_, X_selected_test_, y_train_, y_test_ = train_test_split(X_selected_, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# model training
rnd_clf = RandomForestClassifier(n_estimators=200, max_leaf_nodes=2, n_jobs=-1, random_state=42)
rnd_clf.fit(X_selected_train_, y_train_)
y_pred_ = rnd_clf.predict(X_selected_test_)
cm = confusion_matrix(y_test_, y_pred_)
print("accuracy:", rnd_clf.score(X_selected_test_, y_test_), sep="\n")
print("Confusion Matrix:", cm, sep="\n")


accuracy:
0.56
Confusion Matrix:
[[10  0 12]
 [ 0  0  1]
 [ 9  0 18]]


### Conclusion

1. Sentiment is not an important feature in the neumeric features approach. This conclusion may change if Sentient converts to numeric features (see Future Improvement).
2. Using numeric features significantly improves performance by 6% comparing to using categorical features. This is because numeric features provide more granular details for model computation.
3. Using full features get the highest performance. Since the categorical approach is indifferent to feature selection, we will use full feature for subseqent modeling.

# Train with Other Models

## Creating training datasets

In [131]:
# Dataset creation
# create categorical dataset
X_cat_train, X_cat_test, y_train, y_test = train_test_split(X_prelim, y, test_size=0.2, random_state=42)
# create numeric dataset
X_num_train, X_num_test, y_train, y_test = train_test_split(X_prelim_, y, test_size=0.2, random_state=42)

## Decision Tree

In [132]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=1, random_state=42)

print('--------categorical features--------')
clf.fit(X_cat_train, y_train)
cm = confusion_matrix(y_test, clf.predict(X_cat_test))
accuracy = clf.score(X_cat_test, y_test)
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

print('\n--------numeric features--------')
clf.fit(X_num_train, y_train)
cm = confusion_matrix(y_test, clf.predict(X_num_test))
accuracy = clf.score(X_num_test, y_test)
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

--------categorical features--------
accuracy:
0.54
Confusion Matrix:
[[ 0  0 22]
 [ 0  0  1]
 [ 0  0 27]]

--------numeric features--------
accuracy:
0.62
Confusion Matrix:
[[13  0  9]
 [ 0  0  1]
 [ 9  0 18]]


## Adaboost

In [133]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3), n_estimators=20,
    learning_rate=0.3, random_state=42)

print('--------categorical features--------')
clf.fit(X_cat_train, y_train)
cm = confusion_matrix(y_test, clf.predict(X_cat_test))
accuracy = clf.score(X_cat_test, y_test)
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

print('\n--------numeric features--------')
clf.fit(X_num_train, y_train)
cm = confusion_matrix(y_test, clf.predict(X_num_test))
accuracy = clf.score(X_num_test, y_test)
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

--------categorical features--------
accuracy:
0.54
Confusion Matrix:
[[15  0  7]
 [ 1  0  0]
 [15  0 12]]

--------numeric features--------
accuracy:
0.5
Confusion Matrix:
[[11  0 11]
 [ 0  0  1]
 [13  0 14]]


## KNN

In [134]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
k_max = 50

print('--------categorical features--------')
lst_accuracy = []
for k in range(2, k_max+1):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_cat_train, y_train)
    accuracy = knn.score(X_cat_test, y_test)
    lst_accuracy.append(accuracy)

accuracy = max(lst_accuracy)
k = lst_accuracy.index(accuracy) + 2

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_cat_train, y_train)
cm = confusion_matrix(y_test, knn.predict(X_cat_test))
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")


print('\n--------numeric features--------')
lst_accuracy = []
for k in range(2, k_max+1):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_num_train, y_train)
    accuracy = knn.score(X_num_test, y_test)
    lst_accuracy.append(accuracy)

accuracy = max(lst_accuracy)
k = lst_accuracy.index(accuracy) + 2

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_num_train, y_train)
cm = confusion_matrix(y_test, knn.predict(X_num_test))
print("accuracy:", accuracy, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

--------categorical features--------
accuracy:
0.54
Confusion Matrix:
[[ 9  0 13]
 [ 1  0  0]
 [ 9  0 18]]

--------numeric features--------
accuracy:
0.64
Confusion Matrix:
[[15  0  7]
 [ 0  0  1]
 [10  0 17]]


## Conclusion

Accuracy Summary

Features    |RandomForest|Decision Tree| AdaBoost    | KNN   
------------|------------|-------------|-------------|----------
Categorical | 0.54       | 0.54        | 0.50        | 0.54
Numeric     | 0.60       | 0.62        | 0.70        | 0.64

1. Overall, numeric features perform significantly better than categorical features in all models trained.
2. The best performing model is AdaBoost using numeric features, reaching an accuracy of 70%

# Future improvement

1. Sentiment can also be converted into numberic features, future experiments are needed to test if that would help improve performance.
2. Moving average of feature may improve time-series forecast.
3. PCA can be another mehtod for feature selection, thus worth future experiments are needed to test if that would help improve performance.
4. It worths to do feature selection in other modelings, just in case of exceptional senarios.
4. As total number of samples is 249, too small to train a deep neural network. However, transfer learning could be a solution to break this limitation.