# Read the data

In [1]:
# deal with data
import pandas as pd
# natural language processing
from sklearn.feature_extraction.text import CountVectorizer

# load the news-and-price-label table from its CSV file
news_csv_path = "data/Combined_News_DJIA.csv"
data_newsAndPriceLabel = pd.read_csv(news_csv_path)

# combine every day's 25 top headlines (columns 2..26) into one document.
# Missing cells are skipped: str() would otherwise turn a NaN into the literal
# word "nan", which the vectorizer would then count as a real token.
data_newsAndPriceLabel["Combined Doc"] = data_newsAndPriceLabel.iloc[:,2:27].apply(
    lambda row: " ".join(str(x) for x in row if pd.notna(x)),
    axis=1,
)

# Separate data into train set and test set

In [2]:
# separate the dataset chronologically:
#   trainset covers 2008-08-08 ~ 2014-08-06, testset covers 2014-08-07 ~ 2016-07-01
split_date = "2014-08-07"
dates = data_newsAndPriceLabel["Date"]
trainset = data_newsAndPriceLabel.loc[dates < split_date]
testset = data_newsAndPriceLabel.loc[dates >= split_date]

# Using NLP (Natural Language Processing) to process documents
## Every word is a feature by itself, and stop words are eliminated

In [3]:
# turn each combined document into a row of token counts (English stop words dropped);
# the vocabulary is learned from the training documents only and reused for the test documents
train_documents = trainset["Combined Doc"]
test_documents = testset["Combined Doc"]

obj_CountVectorizer = CountVectorizer(stop_words="english")
trainset_tokenCounts = obj_CountVectorizer.fit_transform(train_documents)
testset_tokenCounts = obj_CountVectorizer.transform(test_documents)

# Logistic Regression Model

In [4]:
# fit a logistic regression classifier on the training token counts
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained
obj_LogisticRegression = LogisticRegression().fit(trainset_tokenCounts, trainset["Label"])

In [5]:
# predicted label for every test-set row, using the logistic regression model
predictions = obj_LogisticRegression.predict(X=testset_tokenCounts)

## Use a cross-tabulation to take a look at the predictions

In [6]:
# confusion matrix of the logistic regression model: rows are the actual labels,
# columns are the predicted labels.
# The predictions are taken from the model inside this cell rather than from the
# shared `predictions` variable, which other model sections overwrite; the table
# therefore stays correct whatever order the cells were run in.
pd.crosstab(
    testset["Label"],
    obj_LogisticRegression.predict(testset_tokenCounts),
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,83,147
1,114,136


In [7]:
# draw ROC(Receiver Operating Characteristic) curve and calculate AUC(Area under Curve)
from sklearn import metrics
import matplotlib.pyplot as plt

def draw_ROC_curve(obj_model, testset_tokenCounts, true_labels=None):
    """Plot the ROC curve of a fitted classifier and report its AUC in the legend.

    Parameters
    ----------
    obj_model
        Fitted classifier exposing ``predict_proba``.
    testset_tokenCounts
        Feature matrix handed to ``obj_model.predict_proba``.
    true_labels
        Optional ground-truth labels, one per row of ``testset_tokenCounts``.
        When omitted, the notebook-level ``testset["Label"]`` column is used,
        so existing two-argument calls behave exactly as before.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if true_labels is None:
        # fall back to the global test set so older call sites keep working
        true_labels = testset["Label"]

    probs = obj_model.predict_proba(testset_tokenCounts)
    # second probability column is used as the score of the positive class
    # (for 0/1 labels this is the probability of label 1)
    positive_scores = probs[:, 1]

    # the thresholds returned by roc_curve are not needed for the plot
    fpr, tpr, _ = metrics.roc_curve(true_labels, positive_scores)
    roc_auc = metrics.auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # dashed diagonal = ROC of a random classifier, for visual reference
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## Draw ROC (Receiver Operating Characteristic) curve

In [8]:
# ROC curve of the logistic regression model on the test-set token counts
draw_ROC_curve(obj_model=obj_LogisticRegression, testset_tokenCounts=testset_tokenCounts)

# Bernoulli Naive Bayes Model

In [9]:
# fit a Bernoulli Naive Bayes classifier on the training token counts
from sklearn.naive_bayes import BernoulliNB

# fit() returns the estimator itself, so construction and training can be chained
obj_BernoulliNB = BernoulliNB().fit(trainset_tokenCounts, trainset["Label"])

In [10]:
# predicted label for every test-set row, using the Bernoulli Naive Bayes model
predictions = obj_BernoulliNB.predict(X=testset_tokenCounts)

In [11]:
# confusion matrix of the Bernoulli Naive Bayes model: rows are the actual labels,
# columns are the predicted labels.
# The predictions are taken from the model inside this cell rather than from the
# shared `predictions` variable, which other model sections overwrite; the table
# therefore stays correct whatever order the cells were run in.
pd.crosstab(
    testset["Label"],
    obj_BernoulliNB.predict(testset_tokenCounts),
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43,187
1,55,195


In [12]:
# ROC curve of the Bernoulli Naive Bayes model on the test-set token counts
draw_ROC_curve(obj_model=obj_BernoulliNB, testset_tokenCounts=testset_tokenCounts)

# Multinomial Naive Bayes Model

In [13]:
# fit a Multinomial Naive Bayes classifier on the training token counts
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained
obj_MultinomialNB = MultinomialNB().fit(trainset_tokenCounts, trainset["Label"])

In [14]:
# predicted label for every test-set row, using the Multinomial Naive Bayes model
predictions = obj_MultinomialNB.predict(X=testset_tokenCounts)

In [15]:
# confusion matrix of the Multinomial Naive Bayes model: rows are the actual labels,
# columns are the predicted labels.
# The predictions are taken from the model inside this cell rather than from the
# shared `predictions` variable, which other model sections overwrite; the table
# therefore stays correct whatever order the cells were run in.
pd.crosstab(
    testset["Label"],
    obj_MultinomialNB.predict(testset_tokenCounts),
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,51,179
1,61,189


In [16]:
# ROC curve of the Multinomial Naive Bayes model on the test-set token counts
draw_ROC_curve(obj_model=obj_MultinomialNB, testset_tokenCounts=testset_tokenCounts)