In [1]:
import nltk 
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
# Used to convert words into the base forms (dogs and dog becomes the same word)
wordnet_lemmatizer = WordNetLemmatizer() 

In [3]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [4]:
df = pd.read_csv("Reddit_Data.csv")
df = df.dropna() 
df = df.head(15000)
# df

In [5]:
# Loops through each comment and tokenize it and remove stop words. Also, create word index map to compute word frequencies
def tokenize_comments(df):
    tokens_list = [] 
    word_index_map = {}
    i=0
    for index, row in df.iterrows():
        tokens = nltk.tokenize.word_tokenize(row["clean_comment"])# Tokenize the comments
        tokens = [t for t in tokens if len(t) > 2]
        tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # Convert words into their base form
        tokens = [t for t in tokens if t not in stopwords] # Only adds words to the tokens if they are not stopwords and the length of the string is > 2
        for token in tokens: # Loop through and get add each token/word to word index map
            if token not in word_index_map:
                word_index_map[token] = i
                i += 1
        tokens_list.append(tokens)
    return tokens_list, word_index_map # return tokenize list and the word index map

In [6]:
# Structure and normalize each tokens list
# np.seterr(divide = 'ignore') # Ignore divide by zero warning
def normalize_tokens(tokens_list, word_index_map):
    normalized_tokens_list = []
    for token_list in tokens_list: # Loop each tokens list (comment that has been tokenized)
        x = np.zeros(len(word_index_map) + 1)
        for token in token_list: # Loop each token in the comment 
            i = word_index_map[token] # Get (first) occurence of word. Arbritray number but needed for organization
            x[i] += 1 # Increment
        x = np.divide(x,x.sum()) # Divide the frequency vector by total sum allowing for us to investigate it words total usage in the comment
        normalized_tokens_list.append(x)
    return normalized_tokens_list

In [7]:
def attach_labels(tokens_list, labels):
    for i in range(len(tokens_list)):
        tokens_list[i] = np.append(tokens_list[i], np.array(labels)[i])
    return tokens_list

In [8]:
# Tokenize comments
tokens_list, word_index_map = tokenize_comments(df)

In [9]:
# Normalize token
normalized_tokens = normalize_tokens(tokens_list, word_index_map)

  # Remove the CWD from sys.path while we load stuff.


In [10]:
# # Attach the sentiment labels back to its respective comment
new_tokens_list = attach_labels(normalized_tokens, df["category"])
# new_tokens_list

In [11]:
# Convert the list of np arrays into np mats and then into dataframe
df = pd.DataFrame(np.mat(normalized_tokens))

In [12]:
# Drop na values
df = df.dropna(axis=0)

In [13]:
df_target = df[df.columns[-1]] # Extract target variable 
df = df.drop(df.columns[-1], axis = 1) # Drop target column from data

In [14]:
# Split the data
from sklearn.model_selection import train_test_split
df_train, df_test, df_train_target, df_test_target = train_test_split(df, df_target, test_size=0.2, random_state=33)

In [15]:
from sklearn.svm import SVC
sentiment_analyzer = LogisticRegression()
sentiment_analyzer.fit(df_train, df_train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
pd.DataFrame(sentiment_analyzer.predict(df_test)).iloc[18]

0    1.0
Name: 18, dtype: float64

In [17]:
print("Classification rate", sentiment_analyzer.score(df_test, df_test_target))

Classification rate 0.5908479138627187


In [18]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29769,29770,29771,29772,29773,29774,29775,29776,29777,29778
1418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
data = pd.read_csv("Combined_News_DJIA.csv")
print(data.shape)

(1989, 27)


In [20]:
data = data.dropna()
print(data.shape)
data = data.reset_index(drop=True) # Reindex the dataframe after dropping column

(1986, 27)


In [21]:
# Loop through each column and replace the unneccessary tag 
for column in data.columns[2:]: 
    data[column] = data[column].str.replace('b.', ' ', regex=True)
# data

In [22]:
def tokenize_string(input_str):
    tokens = nltk.tokenize.word_tokenize(input_str) # Tokenize string
    tokens = [t for t in tokens if len(t) > 2] # Remove words less than 2 chars
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # Convert words into their base form
    tokens = [t for t in tokens if t not in stopwords] # Only adds words to the tokens if they are not stopwords and the length of the string is > 2
#     for token in tokens: # Loop through and add each token/word to word index map if not already added
#             if token not in word_index_map:
#                 word_index_map[token] = i
#                 i += 1
    return tokens

In [23]:
def normalize_tokenized_string(tokens):
    x = np.zeros(len(word_index_map) + 1)
    for token in tokens: # Loop each token in the comment 
        if token in word_index_map: # We can't analyze words that we haven't used for training
            i = word_index_map[token] # Get (first) occurence of word. Arbritray number but needed for organization
            x[i] += 1
    if x.sum() != 0: # No words are in this headline have sentiment analyzer been trained on
        x = np.divide(x,x.sum())
    return x

In [24]:
# Reconstruct the reddit dataframe to be represented by it's entries sentiment label
def reconstruct(data):
    mat = np.zeros(shape=(data.shape[0], data.shape[1]-2)) # Create matrix to hold the data
    position = 0 # Pointer for our data
    for index, row in data.iterrows():
        predictions = [] # The current row's prediction list (i.e. That day's top 25 headlines)
        for i in range(2, len(data.columns)): # Loop through each entry in the current row
            current_comment = data[data.columns[i]][index] # Get the current comment in the row
            tokens = tokenize_string(current_comment) # Tokenize the comment
            normalized_tokens = normalize_tokenized_string(tokens) # Normalize the tokens by frequency
            prediction = sentiment_analyzer.predict([normalized_tokens]) # Predict the sentiment label for this entry 
            predictions.append(prediction[0]) # Add the label to the vector
        mat[position, :] = predictions # Add the vector (the row's sentiment label representation) into the matrix
        position += 1 # Increment row position
    return mat

In [25]:
# Reconstruct the original reddit dataset to be represented by its
# entries sentiment label and then convert the matrix it into a dataframe 
df2 = pd.DataFrame(reconstruct(data)) 
df2 # Display new dataframe 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
3,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1982,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1983,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1984,1.0,-1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


<h4>Now, let's take the reconstructed dataframe and create a classification model that predicts if the DJIA close value increases or decreases from the opening value.<h4>

In [26]:
# First, let's finish reconstructing the dataset by adding the DJIA 

In [27]:
# df2["Label"] = data["Label"]
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
3,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1982,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1983,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1984,1.0,-1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


In [40]:
# Split the data
df2_train, df2_test, df2_train_target, df2_test_target = train_test_split(df2, data["Label"], test_size=0.3, random_state=33)

In [66]:
# Let's create logistic regression classifer to the behavior of the DJIA closing value
from sklearn.linear_model import LogisticRegression
lrclf = LogisticRegression()
lrclf.fit(df2_train, df2_train_target)
lrpreds_test = lrclf.predict(df2_test)
# Run 10-fold validation
cv_scores = cross_val_score(lrclf, df2_test, df2_test_target, cv=10)
print("Accuracy for logistic regression classifier")
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
print("Score on Training: ", lrclf.score(df2_train, df2_train_target))
print("Score on Test: ", lrclf.score(df2_test, df2_test_target))

Accuracy for logistic regression classifier
Overall Accuracy on X-Val: 0.50 (+/- 0.12)
Score on Training:  0.5597122302158274
Score on Test:  0.4949664429530201


In [73]:
# Create decision tree classifier and train the classifier
from sklearn import tree
treeclf = tree.DecisionTreeClassifier(criterion='gini')
treeclf = treeclf.fit(df2_train, df2_train_target)
# Compute predictions
treepreds_test = treeclf.predict(df2_test)
# Run 10-fold validation
cv_scores = cross_val_score(treeclf, df2_train, df2_train_target, cv=10)
print("Accuracy for decision tree classifier")
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
print("Score on Training: ", treeclf.score(df2_train, df2_train_target))
print("Score on Test: ", treeclf.score(df2_test, df2_test_target))
## Possibly overfitting the training set ##

Accuracy for decision tree classifier
Overall Accuracy on X-Val: 0.53 (+/- 0.05)
Score on Training:  0.8906474820143885
Score on Test:  0.511744966442953


In [72]:
#  Create linear discriminant analysis classifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
ldclf = LinearDiscriminantAnalysis()
# Compute predictions
ldclf = ldclf.fit(df2_train, df2_train_target)
ldpreds_test = ldclf.predict(df2_test)
# Run 10-fold validation
cv_scores = cross_val_score(ldclf, df2_test, df2_test_target, cv=10)
print("Accuracy for linear discriminant analysis classifier")
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
print("Score on Training: ", ldclf.score(df2_train, df2_train_target))
print("Score on Test: ", ldclf.score(df2_test, df2_test_target))

Accuracy for linear discriminant analysis classifier
Overall Accuracy on X-Val: 0.50 (+/- 0.11)
Score on Training:  0.560431654676259
Score on Test:  0.49161073825503354


In [None]:
# Create SVM classifier (Linear)
from sklearn.svm import SVC # Support vector classifier
svm_linear = SVC(kernel='linear', C=1E10)
svm_linear.fit(df2_train, df2_train_target)
cv_scores = cross_val_score(svm_linear, df2_test, df2_test_target, cv=10)
print("Accuracy for support vector machine classifier")
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
print("Score on Training: ", svm_linear.score(df2_train, df2_train_target))
print("Score on Test: ", svm_linear.score(df2_test, df2_test_target))

In [None]:
# Create a kernelized SVM by using RBF (radial basis function)
svm_k = SVC(kernel='rbf', C=1E6)
svm_k.fit(X, y)
cv_scores = cross_val_score(svm_k, df2_test, df2_test_target, cv=10)
print("Accuracy for support vector machine classifier")
print("Overall Accuracy on X-Val: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
print("Score on Training: ", svm_k.score(df2_train, df2_train_target))
print("Score on Test: ", svm_k.score(df2_test, df2_test_target))