In [21]:
#Problem: Predicts if stock price of dow jones will increase or decrease 
#using top news headlines

In [22]:
pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [23]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [24]:
#import the necessary libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [25]:
#load and store the datasets
df_reddit = pd.read_csv(r"C:\Users\kianh\finsights\finsights_ml\finsights_dow_dataset\RedditNews.csv")
df_price = pd.read_csv(r"C:\Users\kianh\finsights\finsights_ml\finsights_dow_dataset\upload_DJIA_table.csv")
df_cn = pd.read_csv(r"C:\Users\kianh\finsights\finsights_ml\finsights_dow_dataset\Combined_News_DJIA.csv")

In [26]:
#get the merged data set
df_comb = pd.read_csv(r"C:\Users\kianh\finsights\finsights_ml\finsights_dow_dataset\Combined_News_DJIA.csv")
df_merge = df_cn.merge(df_price, how='inner', on='Date')

In [27]:
#combine the top news headlines
headlines = []
for row in range(0, len(df_merge.index)):
    headlines.append(' '.join(str(x) for x in df_merge.iloc[row, 2:27]) )

In [28]:
#clean the data(remove b' and b")
clean_headlines = []
for i in range(0, len(headlines)):
    clean_headlines.append(re.sub("b[(')]", '', headlines[i]))
    clean_headlines[i] = re.sub('b[(")]', '',clean_headlines[i])
    clean_headlines[i] = re.sub("\'", '', clean_headlines[i])

In [29]:
#add the clean headlines to the merge data set
df_merge['Combined_News'] = clean_headlines
df_merge['Combined_News'][0]

'Georgia downs two Russian warplanes as countries move to brink of war" BREAKING: Musharraf to be impeached. Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube) Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire Afghan children raped with impunity, U.N. official says - this is sick, a three year old was raped and they do nothing" 150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets. Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SOs side" The enemy combatent trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO] Did the U.S. Prep Georgia for War with Russia? Rice Gives Green Light for Israel to Attack Iran: S

In [30]:
#Polarity = from -1 to 1 to see how positive or negative the statement is
#subjectibity = 0 t 1 on how objective the news article is
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [31]:
#Create 2 new columns
df_merge['Subjectivity'] = df_merge['Combined_News'].apply(getSubjectivity)
df_merge['Polarity'] = df_merge['Combined_News'].apply(getPolarity)

In [32]:
#Get the sentiment score
#Create a function to get the sentiment scores
def getSIA(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment

In [33]:
#get the sentiment scores for each day
compound = []
neg = []
pos = []
neutral = []
SIA = 0

for i in range(0, len(df_merge['Combined_News'])):
    SIA = getSIA(df_merge['Combined_News'][i])
    compound.append(SIA['compound'])
    neg.append(SIA['neg'])
    neutral.append(SIA['neu'])
    pos.append(SIA['pos'])

In [34]:
#Store the sentiment scores in the merge data frame
df_merge['Compound'] = compound
df_merge['Negative'] = neg
df_merge['Neutral'] = neutral
df_merge['Positive'] = pos

In [35]:
#create a list of columns to keep
keep_cols = ['Open', 'High', 'Low', 'Volume', 'Subjectivity', 'Polarity', 'Compound', 'Negative', 'Neutral', 'Positive', 'Label']
#keep_cols is not in df_comb, so I need to fix this
df_merge = df_merge[keep_cols]

In [36]:
#----------RUN THIS--------------------
#create the feature data set
X = df_merge
X = np.array(X.drop(['Label'], 1))
#create the target data set
y = np.array(df_merge['Label'])

  X = np.array(X.drop(['Label'], 1))


In [37]:
#split the data into 80% training and 20% testing data sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [38]:
#create and train the model
model = LinearDiscriminantAnalysis().fit(x_train, y_train)

In [39]:
#show the models predictions
predictions = model.predict(x_test)

In [40]:
#show the performance of the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.79      0.83       193
           1       0.82      0.88      0.85       205

    accuracy                           0.84       398
   macro avg       0.84      0.84      0.84       398
weighted avg       0.84      0.84      0.84       398



In [41]:
#serialize into a file
import pickle

In [48]:
with open('dow_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [49]:
with open('dow_model.pkl', 'rb') as f:
     mp = pickle.load(f)

array([1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,