In [1]:
from __future__ import division
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score, roc_curve, auc
from nltk.stem.porter import PorterStemmer
from os import path
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from sklearn.cross_validation import train_test_split
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
#matplotlib.rcParams["figure.figsize"] = "8, 8"



# Data preprocessing

In [2]:
# Load the combined DJIA / headline table and split it on the 'Date' column:
# rows whose date string sorts before '2015-01-01' form the training set,
# rows whose date string sorts after '2014-12-31' form the test set.
data = pd.read_csv('./stocknews/Combined_News_DJIA.csv')
train = data.loc[data['Date'] < '2015-01-01']
test = data.loc[data['Date'] > '2014-12-31']

In [3]:

# Turn the headline columns into n-gram count matrices for train and test.

def _headlines_per_day(frame):
    """Return a list with one digit-free headline string per row of ``frame``.

    For every row, the cells at column positions 2..26 are joined with single
    spaces (column 0 is the date and column 1 is the label '0' or '1'), and
    every digit character is then removed from the joined text.

    Each cell is converted with ``str`` before joining, so a missing (NaN)
    cell contributes the literal text 'nan' to that row's string.
    """
    days = []
    for row in range(len(frame.index)):
        joined = ' '.join(str(cell) for cell in frame.iloc[row, 2:27])
        days.append(''.join(ch for ch in joined if not ch.isdigit()))
    return days


# Training set: a list holding one string per row (day) with all its headlines
trainlist = _headlines_per_day(train)

# choose range for n-gram
minrange = 2
maxrange = 2
# A single pre-formatted string prints identically on Python 2 and Python 3.
print('The minimum range for the n-gram is %s and the maximum range is %s'
      % (minrange, maxrange))

# The vocabulary is learned from the training headlines only ...
vectorizer = CountVectorizer(ngram_range=(minrange, maxrange))
trainvec = vectorizer.fit_transform(trainlist)
# (shape is wrapped in a 1-tuple so '%' formats it as one value)
print('The shape of the train term-document matrix is %s' % (trainvec.shape,))

# Test set: same text preparation as for the training rows
testheadlines = _headlines_per_day(test)

# ... and re-used unchanged for the test headlines, so both matrices share
# the same columns.
testvec = vectorizer.transform(testheadlines)

print('The shape of the test term-document matrix is %s' % (testvec.shape,))

The minimum range for the n-gram is 2 and the maximum range is 2
The shape of the train term-document matrix is (1611, 357779)
The shape of the test term-document matrix is (378, 357779)


# Logistic Regression

In [None]:
# Fit a logistic regression on the training n-gram counts and evaluate it on
# the test matrix: accuracy from the hard predictions, ROC/AUC from scores.
model_LR = LogisticRegression()
model_LR = model_LR.fit(trainvec, train["Label"])

predictions_LR = model_LR.predict(testvec)

accuracy_LR = accuracy_score(test["Label"], predictions_LR)
# Formatted as one string so the output is the same on Python 2 and 3.
print('The accuracy of logistic regression is  %s' % accuracy_LR)

# roc_curve needs a continuous score to sweep thresholds over. Feeding it the
# thresholded 0/1 predictions yields a single operating point and a misleading
# AUC, so use the predicted probability of the second class (classes_[1],
# i.e. Label == 1 for 0/1 labels).
scores_LR = model_LR.predict_proba(testvec)[:, 1]
fpr, tpr, tresholds = roc_curve(test["Label"], scores_LR)
tmp = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# dashed diagonal = reference line of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

The accuracy of logistic regression is  0.57671957672


# kNN

In [None]:
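# Best k found per n-gram range in earlier runs (accuracy in percent):
#   ngram_range=(2, 3): k=97  -> accuracy 55.5
#   ngram_range=(2, 2): k=210 -> accuracy 55.0
#   ngram_range=(1, 1): k=87  -> accuracy 54.5
#   ngram_range=(1, 3): k=7   -> accuracy 54.49
#
# With stop-word filtering: ngram_range=(1, 1), k=28 -> accuracy 56.9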

In [None]:
# Sweep the number of neighbours k = 1..499 for kNN, recording the accuracy of
# every k on the test matrix.
# NOTE(review): k is chosen by its score on the test split itself, so the
# "highest accuracy" reported below is an optimistic estimate.
accuracy_knn_list = []
k_counter = []
for neighbors in range(1, 500):
    model_knn = KNeighborsClassifier(neighbors)
    model_knn = model_knn.fit(trainvec, train["Label"])

    predictions_knn = model_knn.predict(testvec)
    accuracy_knn = accuracy_score(test["Label"], predictions_knn)
    # Plain list appends: no per-iteration array re-allocation, and the
    # k values stay integers.
    accuracy_knn_list.append(accuracy_knn)
    k_counter.append(neighbors)

# np.argmax returns the first maximum, i.e. the smallest k that reaches the
# best accuracy. Deriving best_k here means it is always defined.
best_index = int(np.argmax(accuracy_knn_list))
best_k = k_counter[best_index]
accuracy_knn_fin = accuracy_knn_list[best_index]

print('The plot of accuracy vs iterations gives:')
plt.plot(k_counter, accuracy_knn_list)
plt.ylabel('Accuracy')
plt.xlabel('K-value')
plt.show()

# Formatted as one string so the output is the same on Python 2 and 3.
print('The highest accuracy of kNN with k = %s is %s' % (best_k, accuracy_knn_fin))


# Refit kNN with the k found by the search above and plot its ROC curve.
# best_k replaces a hard-coded neighbour count so this part cannot go stale
# when the n-gram range (and therefore the best k) changes.
model_knn2 = KNeighborsClassifier(best_k)
model_knn2 = model_knn2.fit(trainvec, train["Label"])

# The model was fit on trainvec, so it has to be scored on the matrix built by
# the same vectorizer: testvec.
predictions_knn2 = model_knn2.predict(testvec)

accuracy_knn2 = accuracy_score(test["Label"], predictions_knn2)
print(accuracy_knn2)

# roc_curve needs a continuous score; hard 0/1 predictions would give a single
# operating point. Column 1 of predict_proba is the score of classes_[1]
# (Label == 1 for 0/1 labels).
scores_knn2 = model_knn2.predict_proba(testvec)[:, 1]
fpr, tpr, tresholds = roc_curve(test["Label"], scores_knn2)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# dashed diagonal = reference line of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

The plot of accuracy vs iterations gives:


In [None]:
#Wordcloud
#Separate the training rows based on their labels
non_decrease = train[train['Label']==1]
decrease = train[train['Label']==0]


def _joined_headlines(frame):
    """Return one string per row of ``frame``.

    Each string is the space-joined text of the cells at column positions
    2..26 of that row; cells are converted with ``str`` first.
    """
    return [' '.join(str(cell) for cell in frame.iloc[row, 2:27])
            for row in range(len(frame.index))]


#make a list for both classes, then glue each list into one long text
nondecreaselist = _joined_headlines(non_decrease)
nondecreasestring = '. '.join(nondecreaselist)

decreaselist = _joined_headlines(decrease)
decreasestring = '. '.join(decreaselist)

#make the first wordcloud (Label == 1 rows)
wordcloud1 = WordCloud(background_color='black',
                      width=3000,
                      height=2500
                     ).generate(nondecreasestring)

plt.figure(1,figsize=(8,8))
plt.imshow(wordcloud1)
plt.axis('off')
plt.show()

In [None]:
# Second word cloud: same canvas size as the first one, white background,
# generated from the text held in decreasestring.
wordcloud2 = WordCloud(
    width=3000,
    height=2500,
    background_color='white',
).generate(decreasestring)

# Draw the cloud as an image without axis ticks or frame.
plt.figure(num=1, figsize=(8, 8))
plt.imshow(wordcloud2)
plt.axis('off')
plt.show()

In [None]:
# Pair every n-gram in the fitted vocabulary with its logistic-regression
# coefficient. The vectorizer and model used here are the ones fitted earlier
# in this notebook, so the feature names and coefficients line up one-to-one.
wordsLR = vectorizer.get_feature_names()
# coef_ is 2-D; its first row holds the per-feature weights used here.
coeffLR = model_LR.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Words' : wordsLR,
                        'Coefficient' : coeffLR})
# Largest coefficients first; ties are broken alphabetically by n-gram.
coeffdf = coeffdf.sort_values(['Coefficient', 'Words'], ascending=[False, True])
coeffdf.head(10)

In [None]:
# coeffdf is sorted by descending coefficient, so the last ten rows are the
# n-grams with the smallest coefficients.
coeffdf.tail(10)