In [34]:
# deal with data
import pandas as pd
# deal with Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Load the dataset from the CSV file into a DataFrame.
# The path is relative to the notebook's working directory.
data_newsAndPriceLabel = pd.read_csv(
    filepath_or_buffer="data/Combined_News_DJIA.csv",
)

In [36]:
# Combine every day's 25 top headlines into one space-separated document per row.
# Positional columns 2..26 are taken to be the headline columns -- TODO confirm
# against the CSV layout.
# Missing headlines (NaN) are skipped: str(NaN) would otherwise inject the literal
# token "nan" into the text and, downstream, into the n-gram vocabulary.
data_newsAndPriceLabel["Combined Doc"] = data_newsAndPriceLabel.iloc[:, 2:27].apply(
    lambda row: " ".join(str(headline) for headline in row if pd.notna(headline)),
    axis=1,
)

In [37]:
# Separate the dataset chronologically at 2014-08-07:
#   trainset -> rows dated before the cut-off (2008-08-08~2014-08-06)
#   testset  -> rows dated on/after the cut-off (2014-08-07~2016-07-01)
# NOTE(review): "Date" is compared against a string literal, so this presumably
# relies on the column holding ISO "YYYY-MM-DD" text -- confirm against the CSV.
trainset = data_newsAndPriceLabel.loc[data_newsAndPriceLabel["Date"].lt("2014-08-07")]
testset = data_newsAndPriceLabel.loc[data_newsAndPriceLabel["Date"].ge("2014-08-07")]

In [38]:
# Turn the combined documents into sparse matrices of bigram counts
# (ngram_range=(2, 2) keeps two-word sequences only).
# The vocabulary is learned from the training documents alone; the test
# documents are then encoded with that same fitted vocabulary.
obj_CountVectorizer = CountVectorizer(ngram_range=(2, 2))
trainset_tokenCounts = obj_CountVectorizer.fit_transform(
    raw_documents=trainset["Combined Doc"],
)
testset_tokenCounts = obj_CountVectorizer.transform(
    raw_documents=testset["Combined Doc"],
)

In [39]:
# Train a model: Logistic Regression.
# Fitted on the training token counts against the "Label" column;
# fit() returns the estimator itself, so construction and fitting are chained.
from sklearn.linear_model import LogisticRegression

obj_LogisticRegression = LogisticRegression().fit(
    X=trainset_tokenCounts,
    y=trainset["Label"],
)

In [40]:
# Predict the test-set labels (stock increase/decrease) with the fitted
# logistic regression model.
predictions = obj_LogisticRegression.predict(X=testset_tokenCounts)

In [41]:
# Show the confusion matrix of the model on the test set:
# rows are the actual labels, columns are the predicted labels.
pd.crosstab(
    index=testset["Label"],
    columns=predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,88,142
1,58,192


In [42]:
# Draw the ROC (Receiver Operating Characteristic) curve and calculate the
# AUC (Area Under the Curve) for the logistic regression model on the test set.
from sklearn import metrics
import matplotlib.pyplot as plt

probs = obj_LogisticRegression.predict_proba(testset_tokenCounts)
# Second probability column; presumably the score for label 1 -- confirm via
# obj_LogisticRegression.classes_.
preds = probs[:, 1]
# Single unpacking of (false positive rates, true positive rates, thresholds).
fpr, tpr, threshold = metrics.roc_curve(testset["Label"], preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal reference line
ax.legend(loc='lower right')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [43]:
# Bernoulli Naive Bayes
# Fitted on the same training token counts and labels as the previous model;
# fit() returns the estimator itself, so construction and fitting are chained.
from sklearn.naive_bayes import BernoulliNB

obj_BernoulliNB = BernoulliNB().fit(
    X=trainset_tokenCounts,
    y=trainset["Label"],
)

In [44]:
# Predict the test-set labels with the fitted Bernoulli Naive Bayes model.
# Note: this rebinds `predictions`, replacing the previous model's output.
predictions = obj_BernoulliNB.predict(X=testset_tokenCounts)

In [45]:
# Confusion matrix on the test set: rows are actual labels, columns are the
# labels currently held in `predictions`.
pd.crosstab(
    index=testset["Label"],
    columns=predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,1
Actual,Unnamed: 1_level_1
0,230
1,250


In [46]:
# ROC curve and AUC for the Bernoulli Naive Bayes model on the test set.
# Relies on `metrics` and `plt` having been imported by an earlier cell.
probs = obj_BernoulliNB.predict_proba(testset_tokenCounts)
# Second probability column; presumably the score for label 1 -- confirm via
# obj_BernoulliNB.classes_.
preds = probs[:, 1]
# Single unpacking of (false positive rates, true positive rates, thresholds).
fpr, tpr, threshold = metrics.roc_curve(testset["Label"], preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal reference line
ax.legend(loc='lower right')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [47]:
# Multinomial Naive Bayes
# Fitted on the same training token counts and labels as the previous models;
# fit() returns the estimator itself, so construction and fitting are chained.
from sklearn.naive_bayes import MultinomialNB

obj_MultinomialNB = MultinomialNB().fit(
    X=trainset_tokenCounts,
    y=trainset["Label"],
)

In [48]:
# Predict the test-set labels with the fitted Multinomial Naive Bayes model.
# Note: this rebinds `predictions`, replacing the previous model's output.
predictions = obj_MultinomialNB.predict(X=testset_tokenCounts)

In [49]:
# Confusion matrix on the test set: rows are actual labels, columns are the
# labels currently held in `predictions`.
pd.crosstab(
    index=testset["Label"],
    columns=predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,205
1,27,223


In [50]:
# ROC curve and AUC for the Multinomial Naive Bayes model on the test set.
# Relies on `metrics` and `plt` having been imported by an earlier cell.
probs = obj_MultinomialNB.predict_proba(testset_tokenCounts)
# Second probability column; presumably the score for label 1 -- confirm via
# obj_MultinomialNB.classes_.
preds = probs[:, 1]
# Single unpacking of (false positive rates, true positive rates, thresholds).
fpr, tpr, threshold = metrics.roc_curve(testset["Label"], preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal reference line
ax.legend(loc='lower right')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()