# IMPORTING DATA

In [1]:
from keras.datasets import imdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import os

imdb_dir = 'aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname),encoding="utf-8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # We will cut reviews after 100 words
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set
# But first, shuffle the data, since we started from data
# where sample are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [4]:
vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [5]:
X_test

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5, 4, 360, 7, 4, 177, 2, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 2, 100, 28, 1668, 14, 31, 23, 27, 2, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 2, 38, 32, 25, 2, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 2, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 2, 2, 4, 2, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 2, 19, 861, 1074, 5, 1987, 2, 45, 55, 221, 15, 670, 2, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 2, 2, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 2, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 2, 185, 132, 1988, 2, 1799, 488, 2693, 47, 6, 392, 173, 4, 2, 4378, 270, 2352, 4, 1500, 7, 4, 65, 55, 73, 11, 346, 14, 20, 9, 6, 9

In [6]:
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

---review with words---
['the', 'and', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'and', 'and', 'br', 'villain', 'and', 'and', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'and', 'concept', 'issue', 'and', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'and', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', 'and', 'and', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'and', 'things', 'is', 'far', 'this', 'make', 'mistakes', 'and', 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'and', 'movies', 'get', 'are', 'and', 'br', 'yes', 'female', 'just', 'its', 'because', 'many', 'br', 'of', 'overly', 'to', 'descent', 'people', 'time', 

In [7]:
print('Maximum review length: {}'.format(
len(max((X_train + X_test), key=len))))

Maximum review length: 2697


In [8]:
print('Minimum review length: {}'.format(
len(min((X_test + X_test), key=len))))

Minimum review length: 14


In [9]:
from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

In [10]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])

In [12]:
batch_size = 128
num_epochs = 10
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]
model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

Train on 24872 samples, validate on 128 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23a0dd07978>

In [13]:
model.history

<keras.callbacks.History at 0x23a0dd07978>

In [14]:
#Validation

In [15]:
predictions=model.predict(X_valid)
predictions



array([[9.82431948e-01],
       [1.75283942e-02],
       [2.54951455e-02],
       [7.55443633e-01],
       [3.44287008e-02],
       [2.03910261e-01],
       [9.51553464e-01],
       [5.73558779e-03],
       [9.96940255e-01],
       [6.01486027e-01],
       [9.99639392e-01],
       [4.40854952e-03],
       [4.80176397e-02],
       [7.38174468e-03],
       [1.21456020e-01],
       [9.27327294e-03],
       [9.98290122e-01],
       [9.92236212e-02],
       [1.44680515e-02],
       [9.87494648e-01],
       [1.05852261e-03],
       [3.20467725e-02],
       [8.16139579e-01],
       [6.84993691e-04],
       [9.55370963e-01],
       [8.66443872e-01],
       [6.17693245e-01],
       [2.12038338e-01],
       [8.59766185e-01],
       [4.98019606e-02],
       [2.92503722e-02],
       [9.39148426e-01],
       [8.73610556e-01],
       [9.89827275e-01],
       [4.11682855e-03],
       [5.62765896e-02],
       [2.07726404e-01],
       [9.48845863e-01],
       [4.22805687e-03],
       [1.82788316e-02],


In [16]:
score = model.evaluate(X_valid, y_valid, batch_size=64)
print(score)

[0.24222449213266373, 0.8984375]


In [17]:
model.metrics_names

['loss', 'acc']

In [18]:
y=np.int8(y_valid)
p=np.int8(predictions.round())

In [19]:
from sklearn.metrics import  confusion_matrix
cm=confusion_matrix(y,p)
cm

array([[67,  7],
       [ 6, 48]], dtype=int64)

In [20]:
#Train
predictions_train=model.predict(X_train2)



In [21]:
y_train_p=np.int8(y_train2)
p_train2=np.int8(predictions_train.round())


cm_train=confusion_matrix(y_train_p, p_train2)
cm_train

array([[11803,   623],
       [  396, 12050]], dtype=int64)

In [22]:
#Test predictions

In [23]:
Test_predictions=model.predict(X_test)

In [24]:
scores=model.evaluate(X_test, y_test)




In [25]:
print('Test accuracy',scores[1])

Test accuracy 0.86564


In [26]:
test_y=np.int8(y_test)
test_pred=np.int8(Test_predictions.round())
cm_test=confusion_matrix(test_y, test_pred)
cm_test

array([[10634,  1866],
       [ 1493, 11007]], dtype=int64)

In [27]:
#  Testing on financial data

In [30]:
import pandas as pd
import json
from keras.datasets import imdb

In [31]:
with open('NETFLIX_EARRNINGS_CALL_TRANSCRIPT_Q3_2018.json') as json_data:
    data = json.load(json_data)
    print(data)

{'text': {'0': 'Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings Conference Call October 16, 2018  6:00 PM ET', '1': 'Executives', '2': 'Spencer Wang - VP, Finance and Investor Relations', '3': 'Reed Hastings - Co-Founder and Chief Executive Officer', '4': 'David Wells - Chief Financial Officer', '5': 'Greg Peters - Chief Product Officer', '6': 'Ted Sarandos - Chief Content Officer', '7': 'Analysts', '8': 'Eric Sheridan - UBS', '9': 'Spencer Wang', '10': "Good afternoon and welcome to the Netflix Q3 2018 Earnings Interview. I'm Spencer Wang, VP of IR and Corporate Development. Joining me today are CEO, Reed Hastings; CFO, David Wells; Chief Content Officer, Ted Sarandos; and Chief Product Officer, Greg Peters. Our interviewer this quarter is Eric Sheridan from UBS.", '11': 'As a reminder, we will be making forward-looking statements and actual results may vary.', '12': 'With that, over to you now Eric, for the first question.', '13': 'Question-and-Answer Session', '14': 'Q - Eric Sheridan'

In [32]:
testing_dataset =pd.DataFrame(data)
testing_dataset

Unnamed: 0,text,Sentiments
0,"Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings C...",Neutral
1,Executives,Neutral
10,Good afternoon and welcome to the Netflix Q3 2...,Positive
100,"So a couple questions; number one, what have y...",Positive
101,Reed Hastings,Neutral
102,"Ted, do you want to take that?",Neutral
103,Ted Sarandos,Neutral
104,"Yes, I would say that, one thing that we've le...",Positive
105,"So what we're learning more and more is that, ...",Positive
106,So that gives us – and we do it over many titl...,Positive


In [33]:
tokenized_headlines = []
for each in testing_dataset['text']:
    split = each.split()
    tokenized_headlines.append(split)
tokenized_headlines

[['Netflix,',
  'Inc.',
  '(NASDAQ:NFLX)',
  'Q3',
  '2018',
  'Earnings',
  'Conference',
  'Call',
  'October',
  '16,',
  '2018',
  '6:00',
  'PM',
  'ET'],
 ['Executives'],
 ['Good',
  'afternoon',
  'and',
  'welcome',
  'to',
  'the',
  'Netflix',
  'Q3',
  '2018',
  'Earnings',
  'Interview.',
  "I'm",
  'Spencer',
  'Wang,',
  'VP',
  'of',
  'IR',
  'and',
  'Corporate',
  'Development.',
  'Joining',
  'me',
  'today',
  'are',
  'CEO,',
  'Reed',
  'Hastings;',
  'CFO,',
  'David',
  'Wells;',
  'Chief',
  'Content',
  'Officer,',
  'Ted',
  'Sarandos;',
  'and',
  'Chief',
  'Product',
  'Officer,',
  'Greg',
  'Peters.',
  'Our',
  'interviewer',
  'this',
  'quarter',
  'is',
  'Eric',
  'Sheridan',
  'from',
  'UBS.'],
 ['So',
  'a',
  'couple',
  'questions;',
  'number',
  'one,',
  'what',
  'have',
  'you',
  'learned',
  'in',
  'some',
  'of',
  'the',
  'changes',
  'you',
  'made',
  'in',
  'marketing',
  'this',
  'year?',
  'And',
  'how',
  'do',
  'you',
  '

In [34]:
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
clean_tokenized = []
loweredtokens=[]
for item in tokenized_headlines:
    tokens = []
    for token in item:
        token = token.lower()
        for punc in punctuation:
            token = token.replace(punc, "")
        tokens.append(token)
    clean_tokenized.append(tokens)

In [35]:
unique_tokens = []
single_tokens = []
for each in clean_tokenized:
    for every in each:
        if every not in unique_tokens:
            unique_tokens.append(every)
counts = pd.DataFrame(0, index=np.arange(len(clean_tokenized)),columns=unique_tokens)

In [36]:
df=testing_dataset 
df=df.replace(to_replace='Neutral', value='Positive')
df

Unnamed: 0,text,Sentiments
0,"Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings C...",Positive
1,Executives,Positive
10,Good afternoon and welcome to the Netflix Q3 2...,Positive
100,"So a couple questions; number one, what have y...",Positive
101,Reed Hastings,Positive
102,"Ted, do you want to take that?",Positive
103,Ted Sarandos,Positive
104,"Yes, I would say that, one thing that we've le...",Positive
105,"So what we're learning more and more is that, ...",Positive
106,So that gives us – and we do it over many titl...,Positive


In [37]:
senti=df['Sentiments']
senti
count_P=0
COUNT_N=0

for each in senti:
    
    
    if each=='Positive':
        count_P+=1
    else:
        COUNT_N+=1
        
        
print('The number of positive sentiments',count_P)
print('Thenumber of negative sentiments',COUNT_N)

The number of positive sentiments 163
Thenumber of negative sentiments 13


In [38]:
Y=df['Sentiments']

In [39]:
Y=df
Y['Encode'] = Y['Sentiments']
Y['Encode'].replace('Positive', 0, inplace=True)
Y['Encode'].replace('Negative', 1, inplace=True)

Y.tail()

Unnamed: 0,text,Sentiments,Encode
95,So it’s pretty – it's a small test right now t...,Positive,0
96,Reed Hastings,Positive,0
97,"So Eric, per Ted’s comment, we're not really f...",Positive,0
98,Eric Sheridan,Positive,0
99,Maybe continuing to move through some of the c...,Negative,1


In [40]:
word_index = imdb.get_word_index()
#word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 88584 unique tokens.


In [41]:
imdb_word=unique_tokens

In [42]:
netflix_data=[]
for word in imdb_word:
    if word in word_index:
        netflix_data.append(word)
    else:
        print(0)
        
print(netflix_data)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
['netflix', 'inc', 'earnings', 'conference', 'call', 'october', '16', '600', 'pm', 'et', 'executives', 'good', 'afternoon', 'and', 'welcome', 'to', 'the', 'interview', 'im', 'spencer', 'wang', 'vp', 'of', 'ir', 'corporate', 'development', 'joining', 'me', 'today', 'are', 'ceo', 'reed', 'hastings', 'david', 'wells', 'chief', 'content', 'officer', 'ted', 'product', 'greg', 'peters', 'our', 'interviewer', 'this', 'quarter', 'is', 'eric', 'sheridan', 'from', 'so', 'a', 'couple', 'questions', 'number', 'one', 'what', 'have', 'you', 'learned', 'in', 'some', 'changes', 'made', 'marketing', 'year', 'how', 'do', 'think', 'that', 'might', 'inform', 'go', 'market', 'as', 'company', 'going', 'forward', 'both', 'for', 'subscriber', 'growth', 'then', 'support', 'on', 'side', 'want', 'take', 'yes', 'i', 

In [43]:
Netflix_data=testing_dataset.text

In [49]:
from keras.preprocessing.text import Tokenizer
max_words = 10000
maxlen = 500

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(Netflix_data)
sequences = tokenizer.texts_to_sequences(Netflix_data)

In [50]:
data_netflix = pad_sequences(sequences, maxlen=maxlen)

In [53]:
check_predict_net=model.predict(data_netflix)
check_predict_net

array([[0.00973432],
       [0.14486413],
       [0.01823819],
       [0.9569238 ],
       [0.15783553],
       [0.40312666],
       [0.19973692],
       [0.1917745 ],
       [0.91107666],
       [0.9938437 ],
       [0.17370431],
       [0.9956554 ],
       [0.12977089],
       [0.1098602 ],
       [0.99624026],
       [0.08291131],
       [0.9941094 ],
       [0.19973692],
       [0.7120146 ],
       [0.12977089],
       [0.9782793 ],
       [0.08291131],
       [0.96177715],
       [0.12977089],
       [0.53409123],
       [0.9871668 ],
       [0.08291131],
       [0.9898461 ],
       [0.15783553],
       [0.993382  ],
       [0.12977089],
       [0.9957088 ],
       [0.15783553],
       [0.14139731],
       [0.08291131],
       [0.05076338],
       [0.9683368 ],
       [0.9894107 ],
       [0.27654776],
       [0.84984386],
       [0.12977089],
       [0.9839094 ],
       [0.9764221 ],
       [0.19973692],
       [0.84434116],
       [0.43803272],
       [0.13179354],
       [0.990

In [58]:
import numpy as np
test_y=np.int8(Y['Encode'])
test_pred=np.int8(check_predict_net.round())

In [59]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(test_y, test_pred)
cnf_matrix

array([[101,  62],
       [  3,  10]], dtype=int64)