In [34]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path
import datetime as dt
from sklearn import preprocessing
%matplotlib inline

In [35]:
from numpy.random import seed
seed(1)

from tensorflow import random
random.set_seed(2)

In [84]:
# Import the dataset
file_path = Path("../Stephan/crypto.csv")
eth_reddit_df = pd.read_csv(file_path, parse_dates=True, infer_datetime_format=True)
eth_reddit_df = eth_reddit_df.sort_index()
eth_reddit_df

Unnamed: 0,comment,sentiment
0,This is what peak human performance looks like,1
1,Mining is really bad for the environment,1
2,Welcome back boys. $2500 is here,1
3,*Crypto,1
4,easy to say,1
5,Sold all my BTC and alts to go all in on ETH!,1
6,"Probably because they want to buy it cheaper, ...",1
7,FINALLY my broke ass has 1 ETH,1
8,What did you think it was gonna do?,1
9,I don't want lambos or luxury things. I just w...,1


In [86]:
#eth_reddit_df = eth_reddit_df[eth_reddit_df.Market != 'Neutral']
#eth_reddit_df.tail(100)

In [85]:
from sklearn.preprocessing import LabelEncoder
#Create the LabelEncoder
le = LabelEncoder()

In [87]:
le.fit(eth_reddit_df['sentiment'])
eth_reddit_df['sentiment'] = le.transform(eth_reddit_df['sentiment'])

In [88]:
eth_reddit_df.tail(100)

Unnamed: 0,comment,sentiment
1838,CoinDesk: Ethereum Foundation Strikes Deal wit...,1
1839,Lawyer here. Zero knowledge. Felt tingling sen...,1
1840,MyEtherWallet Ether Cards Are Now Available,1
1841,Jaxx mobile hacked.. 973 eth gone. AMA,1
1842,Solidity version 0.4.16 released,1
1843,Be careful not to get scammed with sob stories...,1
1844,"$44 Million in Ethereum Moved With $0.13 Fee, ...",1
1845,A proposal to issue crypto tokens would make t...,1
1846,Porn star Janice Griffith Tweets Ethereum adop...,1
1847,"The 0x team donated ~$12,000 to Etherscan and ...",1


In [89]:
eth_reddit_df.index = pd.to_datetime(eth_reddit_df.index)

In [91]:
X = eth_reddit_df["comment"].values
y = eth_reddit_df["sentiment"].values

In [92]:
eth_reddit_df['sentiment'].value_counts()

1    1176
0     762
Name: sentiment, dtype: int64

In [93]:
# Import the Tokenizer method from Keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [94]:
# Create an instance of the Tokenizer and fit it with the X text data
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(X)

In [95]:
y.shape

(1938,)

In [96]:
# Print the first five elements of the encoded vocabulary
for token in list(tokenizer.word_index)[:5]:
    print(f"word: '{token}', token: {tokenizer.word_index[token]}")

word: 'the', token: 1
word: 'ethereum', token: 2
word: 'to', token: 3
word: 'a', token: 4
word: 'is', token: 5


In [97]:
# Transform the text data to numerical sequences
X_seq = tokenizer.texts_to_sequences(X)

In [98]:
# Contrast a sample numerical sequence with its text version
print("**Text comment**")
print({X[0]})

**Text comment**
{'This is what peak human performance looks like'}


In [99]:
print("**Numerical sequence representation**")
print(X_seq[0])

**Numerical sequence representation**
[13, 5, 47, 1969, 734, 1304, 304, 58]


In [100]:
# Import the pad_sequences method from Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [101]:
from sklearn.model_selection import train_test_split

In [102]:
# Set the pad size
max_words = 140

# Pad the sequences using the pad_sequences() method
X_pad = pad_sequences(X_seq, maxlen=max_words, padding="post")

In [103]:
# Creating training, validation, and testing sets using the encoded data

X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(X_pad, y)

#X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = train_test_split(X_train_rnn, y_train_rnn)

In [104]:
y_train_rnn

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [105]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [106]:
# Model set-up
vocabulary_size = len(tokenizer.word_counts.keys()) + 1
embedding_size = 64

In [107]:
# Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

# Layer 2
model.add(LSTM(units=280))

# Output layer
model.add(Dense(units=1, activation="sigmoid"))

In [108]:
# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)

In [109]:
# Show model summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 140, 64)           282624    
_________________________________________________________________
lstm_2 (LSTM)                (None, 280)               386400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 281       
Total params: 669,305
Trainable params: 669,305
Non-trainable params: 0
_________________________________________________________________


In [110]:
X_train_rnn

array([[ 128, 1119,    3, ...,    0,    0,    0],
       [   4,   32, 1253, ...,    0,    0,    0],
       [ 627, 2141,    0, ...,    0,    0,    0],
       ...,
       [   2,  478, 1101, ...,    0,    0,    0],
       [2763,    3,  325, ...,    0,    0,    0],
       [  70,   26,   43, ...,    0,    0,    0]])

In [111]:
# Training the model
batch_size = 25
epochs = 10
model.fit(
    X_train_rnn,
    y_train_rnn,
    #validation_data=(X_val_rnn, y_val_rnn),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x227aa911988>

In [120]:
# Predict classes using the testing data
y_rnn_pred = model.predict_classes(X_test_rnn, batch_size=25)



In [121]:
# Accuracy
from sklearn.metrics import accuracy_score

#print("Vader Accuracy: %.2f" % (accuracy_score(y_test, y_vader_pred)))
#print("RNN LSTM Accuracy %.2f" % (accuracy_score(y_test_rnn, y_rnn_pred)))

In [122]:
# Import the confusion_matrix method from sklearn
from sklearn.metrics import confusion_matrix

In [123]:
# Confusion matrtix metrics from the RNN LSTM model
tn_rnn, fp_rnn, fn_rnn, tp_rnn = confusion_matrix(y_test_rnn, y_rnn_pred).ravel()

# Dataframe to display confusion matrix from the RNN LSTM model
cm_rnn_df = pd.DataFrame(
    {
        "Positive(1)": [f"TP={tp_rnn}", f"FP={fp_rnn}"],
        "Negative(0)": [f"FN={fn_rnn}", f"TN={tn_rnn}"],
    },
    index=["Positive(1)", "Negative(0)"],
)
cm_rnn_df.index.name = "Actual"
cm_rnn_df.columns.name = "Predicted"
print("Confusion Matrix from the RNN LSTM Model")
display(cm_rnn_df)

Confusion Matrix from the RNN LSTM Model


Predicted,Positive(1),Negative(0)
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive(1),TP=302,FN=0
Negative(0),FP=183,TN=0


In [124]:
# Import the classification_report method from sklearn
from sklearn.metrics import classification_report

In [125]:
# Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
print(classification_report(y_rnn_pred, y_test_rnn))

Classification Report for the RNN LSTM Model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.62      0.77       485

    accuracy                           0.62       485
   macro avg       0.50      0.31      0.38       485
weighted avg       1.00      0.62      0.77       485



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
