# Sentiment Analysis to predict stock price direction using Classifier models

Sentiment Analysis is the use of NLP, text analysis and
computational linguistics to determine subjective information. Instead of building our own lexicon to do this, we will use VADER (Valence Aware Dictionary and sEntiment Reasoner), a pre-trained sentiment analysis model included in the NLTK package. We will also use TextBlob, a simple API built upon NLTK, for common NLP tasks.

## 2. Install/import libraries

In [1]:
!pip install yfinance



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
import seaborn as sns
import math
import datetime
import re
import yfinance as yf
import nltk
import warnings
warnings.filterwarnings('ignore')

from datetime import date, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
from textblob import TextBlob

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


## 3. Import data

In [3]:
# Show DataFrame of article sentiments data

article_sentiments = pd.read_pickle('azn_article_sentiments_20210105.pkl')
article_sentiments

FileNotFoundError: [Errno 2] No such file or directory: 'azn_article_sentiments_20210105.pkl'

In [None]:
# Create copy of DataFrame

article_sentiments_azn = article_sentiments.copy()
article_sentiments_azn.head(5)

## 4. Clean news data

In [None]:
# Append ---newarticle--- to split for NLP

article_sentiments_azn['body_text'] = article_sentiments_azn['body_text'].astype(str) + '---newarticle---'

In [None]:
azn_bodytext = article_sentiments_azn['body_text']
azn_bodytext

In [None]:
pd.set_option("display.max_colwidth", -1)  # to display full text
azn_bodytext

In [None]:
# Save article_sentiments_azn to txt

with open('azn_bodytext_20210105.txt', 'w') as f:
    f.write(
        azn_bodytext.to_string(header = False, index = False)
    )

### Remove spaces in text file

In [None]:
# first get all lines from file
with open('azn_bodytext_20210105.txt', 'r') as f:
    lines = f.readlines()

# remove spaces
lines = [line.replace(' ', '') for line in lines]

# finally, write lines in the file
with open('azn_bodytext_20210105.txt', 'w') as f:
    f.writelines(lines)

### Remove end line breaks from text file

In [None]:
# first get all lines from file
a_file = open("azn_bodytext_20210105.txt", "r")

# create variable for string without line breaks
string_without_line_breaks = ""

# iterate over strings
for line in a_file:
  stripped_line = line.rstrip() # rstrip() method removes any trailing characters - space is the default trailing character to remove
  string_without_line_breaks += stripped_line
a_file.close()

In [None]:
# finally, write lines in the file
with open('azn_bodytext_20210105.txt', 'w') as f:
    f.writelines(string_without_line_breaks)

In [None]:
# Read article sentiments data into DataFrame

azn_news_df = pd.read_pickle('azn_article_sentiments_20210105.pkl')
azn_news_df

In [None]:
# Check data types

azn_news_df.dtypes

In [None]:
# Create copy of DataFrame

azn_news_df_new = azn_news_df.copy()

### Drop rows without publish date

In [None]:
# Drop rows with None in publish_date column

azn_news_df_new = azn_news_df_new.replace(to_replace='None', value=np.nan).dropna()
azn_news_df_new

### Drop rows with duplicate titles

In [None]:
# Dropping all duplicate titles keeping only the first instance

azn_news_df_new.drop_duplicates(subset ="title",
                     keep = 'first', inplace = True)

azn_news_df_new

### Set datetime index

In [None]:
# Set string representation of date to datetime index

azn_news_df_new['Date'] = pd.to_datetime(azn_news_df_new.publish_date)
azn_news_df_new.set_index('Date', inplace=True)
azn_news_df_new

In [None]:
# Show dateime index of new DataFrame

azn_news_df_new.index

### Sort in ascending chronological order

In [None]:
# Sort in chronological order

azn_news_df_new = azn_news_df_new.sort_index()
azn_news_df_new

In [None]:
# Save sorted DataFrame

azn_news_df_new.to_pickle("azn_news_df_new_20210106.pkl")

In [None]:
azn_news_df_new.to_csv("azn_news_df_new_20210106.csv", sep=',', encoding='utf-8', header=True)

In [None]:
azn_news_df_new.dtypes

In [None]:
# Create copy of DataFrame

azn_news_df_combined = azn_news_df_new.copy()

### Combine articles published on same date

Combine all news articles published on same date to get a single score. An alternative method could be to take the mean score of all articles published on the same date.

In [None]:
# Combining all the news published on a single date in a separate column

azn_news_df_combined['news_combined'] = azn_news_df_combined.groupby(['publish_date'])['body_text'].transform(lambda x: ' '.join(x))

azn_news_df_combined

### Drop rows with duplicate dates

In [None]:
# Dropping duplicate dates keeping only the first instance

azn_news_df_combined.drop_duplicates(subset ="publish_date",
                     keep = 'first', inplace = True)

In [None]:
# Show DataFrame to check that the number of rows has decreased

azn_news_df_combined

In [None]:
# Save combined DataFrame without duplicates

azn_news_df_combined.to_csv("azn_news_df_combined_20210106.csv", sep=',', encoding='utf-8', header=True)

In [None]:
azn_news_df_combined = pd.read_csv("azn_news_df_combined_20210106.csv")
azn_news_df_combined.head()

### Set datetime index

In [None]:
# Set string representation of date to datetime index

azn_news_df_combined['Date'] = pd.to_datetime(azn_news_df_combined.publish_date)
azn_news_df_combined.set_index('Date', inplace=True)
azn_news_df_combined

## 5. Import historical stock data

Import stock data for same period as news data from Yahoo! Finance using yfinance API.

In [None]:
azn_stock_df = yf.download("AZN.L", start="2014-05-02", end="2021-01-05")
azn_stock_df

### Visualise Adjusted Close price and Volume

In [None]:
# Plot Adjusted Close price and Volume

top = plt.subplot2grid((4,4), (0, 0), rowspan=3, colspan=4)
top.plot(azn_stock_df.index, azn_stock_df['Adj Close'], label = 'Adjusted Close price')
plt.title('AZN.L Adj Close Price')
plt.legend(loc=2)
bottom = plt.subplot2grid((4,4), (3,0), rowspan=1, colspan=4)
bottom.bar(azn_stock_df.index, azn_stock_df["Volume"])
plt.title('AZN.L Daily Trading Volume')
plt.gcf().set_size_inches(12,8)
plt.subplots_adjust(hspace=0.75)

## 6. Merge Stock and Sentiment Dataframes on Date

In [None]:
# Merge data sets on date
merge = azn_stock_df.merge(azn_news_df_combined, how='inner', left_index=True, right_index=True)


In [None]:
# Show merged data set
merge

In [None]:
# Save merged DataFrame

merge.to_csv("azn_news_stock_merge_20210107.csv", sep=',', encoding='utf-8', header=True)

In [None]:
# Show first row in combined news column

merge['news_combined'].iloc[0]

In [None]:
# Iterate over rows in combined news column

for index, row in merge.iterrows():
    print (row["news_combined"])

### Clean data in combined news column

Strip newline escape sequence (\n), unwanted punctuation and backslashes.  

In [None]:
# Create empty list to append cleaned data from combined news column

clean_news = []

for i in range(0, len(merge["news_combined"])):
    clean_news.append(re.sub("\n", ' ', merge["news_combined"][i]))  # replace n\ with ' '
    clean_news[i] = re.sub(r'[^\w\d\s\']+', '', clean_news[i]) # remove unwanted punctuation and \'


In [None]:
# Show first row in cleaned combined news column

clean_news[0]

In [None]:
# Add cleaned news column to merged data set

merge['news_cleaned'] = clean_news


merge['news_cleaned'][0]

In [None]:
# Show head of merged DataFrame

merge.head()

In [None]:
# Save merged DataFrame

merge.to_csv("azn__merge_cleaned_20210107.csv", sep=',', encoding='utf-8', header=True)

## 7. Sentiment Analysis

TextBlob will be used to get subjectivity and polarity scores for the cleaned and merged news data. Polarity is a float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement.

In [None]:
# Create function to get subjectivity
def getSubjectivity(text):
  return TextBlob(text).sentiment.subjectivity

# Create function to get polarity
def getPolarity(text):
  return TextBlob(text).sentiment.polarity

In [None]:
# Create new columns
merge['subjectivity'] = merge['news_cleaned'].apply(getSubjectivity)
merge['polarity'] = merge['news_cleaned'].apply(getPolarity)

In [None]:
# Show new columns
merge.head(3)

In [None]:
# Show shape of DataFrame
merge.shape

In [None]:
# Save DataFrame with subjectivity and polarity scores
merge.to_csv("azn__merge_cleaned_subj_pol_20210107.csv", sep=',', encoding='utf-8', header=True)

## 9. Feature engineering

We will calculate whether the next day Adjusted Close price increased/held or decreased and label these as 1 and 0 respectively to build and train machine learning classifier models to predict price direction based on sentiment.

In [None]:
# Create copy of stock data

azn_stock_df_label = azn_stock_df.copy()
azn_stock_df_label

In [None]:
# "1" when AZN Adj Close value rose or stayed as the same;
# "0" when AZN Adj Close value decreased.

azn_stock_df_label['Adj Close Next'] = azn_stock_df_label['Adj Close'].shift(-1)
azn_stock_df_label['Label'] = azn_stock_df_label.apply(lambda x: 1 if (x['Adj Close Next']>= x['Adj Close']) else 0, axis =1)

azn_stock_df_label[['Adj Close', 'Adj Close Next', 'Label']].head(5)

In [None]:
# Save DataFrame

azn_stock_df_label.to_pickle("azn_stock_df_labels_20210107.pkl")

In [None]:
azn_stock_df_label.to_csv("azn_stock_df_label_20210107.csv", sep=',', encoding='utf-8', header=True)

In [None]:
# Show Adj Close Next and Label with Date

azn_stock_df_label_adj_nxt = azn_stock_df_label[['Adj Close Next', 'Label']]
azn_stock_df_label_adj_nxt

In [None]:
# Drop NaN row

azn_stock_df_label_adj_nxt = azn_stock_df_label_adj_nxt.dropna()
azn_stock_df_label_adj_nxt

In [None]:
# Merge DataFrames on date
merge2 = azn_stock_df.merge(azn_stock_df_label_adj_nxt, how='inner', left_index=True, right_index=True)

In [None]:
# Drop NaN row and show merged DataFrame
merge2 = merge2.dropna()
merge2

In [None]:
# Save DataFrame
merge2.to_csv("azn_prices_labels_20210107.csv", sep=',', encoding='utf-8', header=True)

In [None]:
merge2.to_pickle("azn_prices_labels_20210107.pkl")

In [None]:
# Merge next day Adjusted Close price and Label with combined stock data and sentiment DataFrame

merge3 = azn_stock_df_label_adj_nxt.merge(merge, how='inner', left_index=True, right_index=True)
merge3

In [None]:
# Save merged DataFrame

merge3.to_csv("azn_prices_labels_news_20210107.csv", sep=',', encoding='utf-8', header=True)

In [None]:
merge3.to_pickle("azn_prices_labels_news_20210107.pkl")

In [None]:
merge3 = pd.read_pickle("azn_prices_labels_news_20210107.pkl")

In [None]:
# Collapse data set to keep relevant stock price and sentiment score columns only

keep_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'subjectivity', 'polarity', 'compound', 'neg',	'neu',	'pos', 'Label']
df =  merge3[keep_columns]
df

## 10. Modelling

Split the data in feature matrix (X) and target vector (y).


In [None]:
# Create feature data set
X = df
X = np.array(X.drop(['Label'], 1))

# Create target data set
y = np.array(df['Label'])

We will split the data into train and test sets to verify predictions. Time series data cannot be split randomly as this would introduce look-ahead bias so the first 80% will be the training set and the last 20% the test set.

In [None]:
# Split data into 80% training and 20% testing data sets

split = int(0.8*len(df))

In [None]:
X_train = X[0:split]
y_train = y[0:split]

X_test = X[split:]
y_test = y[split:]

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Create and train the model
model = LinearDiscriminantAnalysis().fit(X_train, y_train)

In [None]:
# Show model's predictions
predictions = model.predict(X_test)
predictions

In [None]:
# Show actual values
y_test

In [None]:
# Show model metrics
print(classification_report(y_test, predictions))

### Feature scaling

We will standardise the data using scikit-learn's preprocessing.scale() algorithm so that it is all on one scale.

In [None]:
# Standardise X's
X_train = scale(X_train)
X_test = scale(X_test)

### Create function for confusion matrix to visualise performance

In [None]:
# Function for confusion matrix

def plot_confusion_matrix(y_true, y_pred, labels=["Decrease", "Increase"],
                          normalize=False, title=None, cmap=plt.cm.coolwarm):

    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(12,6))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=labels, yticklabels=labels,
           title=title,
           ylabel='ACTUAL',
           xlabel='PREDICTED')
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 1.5
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="snow" if cm[i, j] > thresh else "orange",
                    size=26)
    ax.grid(False)
    fig.tight_layout()
    return ax

### Create dictionary of classifiers to train and predict on

In [None]:
# test models
models = {  'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
            'SVM Classification': SVC(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", max_iter=100),
            'KNeighborsClassifier':KNeighborsClassifier(n_neighbors=10),
            'GaussianProcessClassifier': GaussianProcessClassifier(),
            'RandomForestClassifier': RandomForestClassifier(n_estimators=100)
            }

for model_name in models.keys():

    model = models[model_name]
    print('\n'+'--------------',model_name,'---------------'+'\n')
    model.fit(X_train,y_train)
    # Plot confusion matrix
    plot_confusion_matrix(model.predict(X_test),y_test, title="Confusion Matrix")
    np.set_printoptions(precision=1)
    plt.show()
    # Final Classification Report
    print(classification_report(model.predict(X_test),y_test, target_names=['Decrease', 'Increase']))

### Conclusion

The precision score is the 'exactness', or ability of the model to return only relevant instances. When a model makes a prediction, how often it is correct?

It appears that the model which correctly predicted the increase in price most often was the Random Forest Classifier at 66%, and the K-Nearest Neighbours Classifier was best at predicting the decrease in price 63% of the time.

None of the scores were particularly outstanding and further improvements might include updating the lexicon with words and sentiments from other more specialised sources such as the [Loughran-McDonald Financial Sentiment Word Lists](https://sraf.nd.edu/textual-analysis/resources/#LM%20Sentiment%20Word%20Lists). This would likely result in more accurate sentiment analysis as it was specifically built for financial text whereas VADER is more attuned to sentiments expressed in social media.
