# Take an overview of the data

In [10]:
# deal with data
import pandas as pd
# deal with Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
#read data
data_newsAndPriceLabel = pd.read_csv("data/Combined_News_DJIA.csv")

In [4]:
# combine every day's 25 top headlines (columns 2..26) into one document;
# missing headlines (NaN) are skipped so they do not inject a literal "nan" token
# into the text that is later vectorized
data_newsAndPriceLabel["Combined Doc"] = data_newsAndPriceLabel.iloc[:, 2:27].apply(
    lambda row: " ".join(str(x) for x in row if pd.notna(x)),
    axis=1,
)

# Separate data into train set and test set

In [5]:
# separate the dataset chronologically:
#   trainset -> rows dated before 2014-08-07 (2008-08-08~2014-08-06)
#   testset  -> rows dated 2014-08-07 or later (2014-08-07~2016-07-01)
trainset = data_newsAndPriceLabel.loc[lambda frame: frame["Date"] < "2014-08-07"]
testset = data_newsAndPriceLabel.loc[lambda frame: frame["Date"] >= "2014-08-07"]

# Using NLP (Natural Language Processing) to process documents
## every word is a feature itself, stop words are eliminated, and TF-IDF is introduced

In [11]:
# turn the combined documents into TF-IDF matrices (English stop words removed);
# the vocabulary and IDF weights are learned from the training documents only,
# then the same fitted vectorizer is applied to both splits
obj_TfidfVectorizer = TfidfVectorizer(stop_words="english")
obj_TfidfVectorizer.fit(trainset["Combined Doc"])
trainset_tokenTfidf = obj_TfidfVectorizer.transform(trainset["Combined Doc"])
testset_tokenTfidf = obj_TfidfVectorizer.transform(testset["Combined Doc"])

# Logistic Regression Model

In [14]:
# train a model
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# fit() hands back the estimator itself, so construction and training are chained
obj_LogisticRegression = LogisticRegression().fit(
    trainset_tokenTfidf, trainset["Label"]
)

In [15]:
# use the model to predict the increase/decrease of stock
predictions = obj_LogisticRegression.predict(testset_tokenTfidf)

## Use cross table to take a look at the prediction

In [22]:
# show the confusion matrix (actual vs. predicted labels) of the model
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,213
1,24,226


In [23]:
# draw ROC(Receiver Operating Characteristic) curve and calculate AUC(Area under Curve)
from sklearn import metrics
import matplotlib.pyplot as plt

def draw_ROC_curve(obj_model, testset_tokenTfidf, true_labels=None):
    """Plot the ROC curve of a fitted classifier and show its AUC in the legend.

    Parameters
    ----------
    obj_model
        Fitted classifier exposing ``predict_proba``.
    testset_tokenTfidf
        Feature matrix handed to ``obj_model.predict_proba``.
    true_labels : optional
        Ground-truth labels aligned with the rows of ``testset_tokenTfidf``.
        When omitted, the notebook-level ``testset["Label"]`` is used, so
        existing two-argument calls keep working unchanged.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if true_labels is None:
        # Backward-compatible fallback to the global test split.
        true_labels = testset["Label"]

    # Column 1 of predict_proba is used as the score for roc_curve
    # (presumably the probability of label 1 -- confirm against obj_model.classes_).
    scores = obj_model.predict_proba(testset_tokenTfidf)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(true_labels, scores)
    roc_auc = metrics.auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## Draw ROC curve(Receiver operating characteristic)

In [25]:
draw_ROC_curve(obj_LogisticRegression, testset_tokenTfidf)

# Bernoulli Naive Bayes Model

In [27]:
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
# fit() hands back the estimator itself, so construction and training are chained
obj_BernoulliNB = BernoulliNB().fit(trainset_tokenTfidf, trainset["Label"])

In [28]:
predictions = obj_BernoulliNB.predict(testset_tokenTfidf)

In [29]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43,187
1,55,195


In [30]:
draw_ROC_curve(obj_BernoulliNB, testset_tokenTfidf)

# Multinomial Naive Bayes Model

In [31]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# fit() hands back the estimator itself, so construction and training are chained
obj_MultinomialNB = MultinomialNB().fit(trainset_tokenTfidf, trainset["Label"])

In [32]:
predictions = obj_MultinomialNB.predict(testset_tokenTfidf)

In [33]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,1
Actual,Unnamed: 1_level_1
0,230
1,250


In [34]:
draw_ROC_curve(obj_MultinomialNB, testset_tokenTfidf)

# KNeighborsClassifier

In [35]:
from sklearn.neighbors import KNeighborsClassifier
# fit() hands back the estimator itself, so construction and training are chained
obj_KNeighborsClassifier = KNeighborsClassifier().fit(
    trainset_tokenTfidf, trainset["Label"]
)

In [36]:
predictions = obj_KNeighborsClassifier.predict(testset_tokenTfidf)

In [37]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,137
1,96,154


In [39]:
draw_ROC_curve(obj_KNeighborsClassifier, testset_tokenTfidf)

# DecisionTreeClassifier

In [40]:
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state makes the fitted tree (and therefore the crosstab and
# ROC curve below) reproducible from run to run; without it the tree can
# differ on every execution.
obj_DecisionTreeClassifier = DecisionTreeClassifier(random_state=42)
obj_DecisionTreeClassifier = obj_DecisionTreeClassifier.fit(trainset_tokenTfidf, trainset["Label"])

In [41]:
predictions = obj_DecisionTreeClassifier.predict(testset_tokenTfidf)

In [42]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,101,129
1,101,149


In [43]:
draw_ROC_curve(obj_DecisionTreeClassifier, testset_tokenTfidf)

# RandomForestClassifier

In [44]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state pins the bootstrap samples and feature sub-sampling so
# the forest (and the reported crosstab / ROC curve) is reproducible.
obj_RandomForestClassifier = RandomForestClassifier(random_state=42)
obj_RandomForestClassifier = obj_RandomForestClassifier.fit(trainset_tokenTfidf, trainset["Label"])

In [45]:
predictions = obj_RandomForestClassifier.predict(testset_tokenTfidf)

In [46]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,132,98
1,145,105


In [47]:
draw_ROC_curve(obj_RandomForestClassifier, testset_tokenTfidf)

# AdaBoostClassifier

In [48]:
from sklearn.ensemble import AdaBoostClassifier
# A fixed random_state seeds the boosted base estimators so repeated runs
# produce the same ensemble and the same evaluation numbers.
obj_AdaBoostClassifier = AdaBoostClassifier(random_state=42)
obj_AdaBoostClassifier = obj_AdaBoostClassifier.fit(trainset_tokenTfidf, trainset["Label"])

In [49]:
predictions = obj_AdaBoostClassifier.predict(testset_tokenTfidf)

In [50]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,85,145
1,105,145


In [51]:
draw_ROC_curve(obj_AdaBoostClassifier, testset_tokenTfidf)

# SVC

In [62]:
from sklearn.svm import SVC
# probability=True enables predict_proba (needed by draw_ROC_curve); the
# probability calibration shuffles the data internally, so random_state is
# fixed to keep the resulting probabilities / ROC curve reproducible.
obj_SVC = SVC(probability=True, random_state=42)
obj_SVC = obj_SVC.fit(trainset_tokenTfidf, trainset["Label"])

In [63]:
predictions = obj_SVC.predict(testset_tokenTfidf)

In [64]:
pd.crosstab(testset["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,1
Actual,Unnamed: 1_level_1
0,230
1,250


In [65]:
draw_ROC_curve(obj_SVC, testset_tokenTfidf)