In [45]:
from keras.layers import Embedding
import numpy as np
import json


# The Embedding layer takes at least two arguments:
# the number of possible tokens, here 1000 (1 + maximum word index),
# and the dimensionality of the embeddings, here 64.


In [25]:
import os

imdb_dir = 'aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
test_dir=os.path.join(imdb_dir, 'test')

labels = []
texts = []

for label_type in [
    'neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname),encoding="utf-8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
                
                
labels_test=[]
texts_test=[]


for label_type in [
    'neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname),encoding="utf-8")
            texts_test.append(f.read())
            f.close()
            if label_type == 'neg':
                labels_test.append(0)
            else:
                labels_test.append(1)

In [110]:
test_dat = pd.DataFrame()
test_dat['test'] = texts_test
test_dat['labels'] = labels_test
test_dat.head()

Unnamed: 0,test,labels
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # We will cut reviews after 100 words
training_samples = 10000 # We will be training on 200 samples
validation_samples = 200  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set
# But first, shuffle the data, since we started from data
# where sample are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [4]:
x_train.shape

(10000, 100)

In [5]:
x_val.shape

(200, 100)

In [16]:
embeddings_index = {}
f = open('glove.6B.100d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [7]:
#lines =("glove.6B.100d.txt",encoding='utf8')

#import gensim
#model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False)
#print(model)

In [17]:
maxwords=10000
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

In [21]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add((LSTM(50, return_sequences=True, dropout=0.5)))
model.add(Flatten())
model.add(Dense(32, activation='relu'))



model.add(Dense(1, activation='sigmoid'))
#model.summary()

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100, 50)           30200     
_________________________________________________________________
flatten_5 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)                160032    
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 33        
Total params: 1,190,265
Trainable params: 190,265
Non-trainable params: 1,000,000
_________________________________________________________________
Train on 10000 samples, validate on 200 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
E

In [10]:

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 50)           30200     
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                160032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,190,265
Trainable params: 190,265
Non-trainable params: 1,000,000
_________________________________________________________________


In [23]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add((LSTM(50, return_sequences=True, dropout=0.5)))
model.add(Flatten())
model.add(Dense(32, activation='relu'))



model.add(Dense(1, activation='sigmoid'))
#model.summary()

model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=80,
                    batch_size=512,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100, 50)           30200     
_________________________________________________________________
flatten_7 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 32)                160032    
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 33        
Total params: 1,190,265
Trainable params: 190,265
Non-trainable params: 1,000,000
_________________________________________________________________
Train on 10000 samples, validate on 200 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
E

Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [32]:
score=model.evaluate(data_t,labels_test)



In [33]:
print('Test accuracy:',score[1])

Test accuracy 0.55064


In [36]:
scores_train=model.evaluate(x_train,y_train)
print('Train accuracy:',scores_train[1])

Train accuracy: 0.6846


In [None]:
# IMDB Test data

In [26]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts_test)
sequences = tokenizer.texts_to_sequences(texts_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data_t = pad_sequences(sequences, maxlen=maxlen)

labels_test = np.asarray(labels_test)
print('Shape of data tensor:', data_t.shape)
print('Shape of label tensor:', labels_test.shape)

Found 87393 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [29]:
test_pred=model.predict(data_t)

In [30]:
test_y=np.int8(labels_test)
test_cm=np.int8(test_pred.round())

In [31]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(test_y ,test_cm)
cnf_matrix

array([[7331, 5169],
       [6065, 6435]], dtype=int64)

In [37]:
#Confusion Matrix train data

In [38]:
train_pred=model.predict(x_train)

In [41]:
train_y=np.int8(y_train)
train_cm=np.int8(train_pred.round())

In [42]:
cnf_matrix_train = metrics.confusion_matrix(train_y ,train_cm)
cnf_matrix_train

array([[3750, 1225],
       [1929, 3096]], dtype=int64)

In [62]:
# Testing data on Financial dataset
import pandas as pd
from keras.datasets import imdb

In [47]:
with open('NETFLIX_EARRNINGS_CALL_TRANSCRIPT_Q3_2018.json') as json_data:
    data = json.load(json_data)
    print(data)

{'text': {'0': 'Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings Conference Call October 16, 2018  6:00 PM ET', '1': 'Executives', '2': 'Spencer Wang - VP, Finance and Investor Relations', '3': 'Reed Hastings - Co-Founder and Chief Executive Officer', '4': 'David Wells - Chief Financial Officer', '5': 'Greg Peters - Chief Product Officer', '6': 'Ted Sarandos - Chief Content Officer', '7': 'Analysts', '8': 'Eric Sheridan - UBS', '9': 'Spencer Wang', '10': "Good afternoon and welcome to the Netflix Q3 2018 Earnings Interview. I'm Spencer Wang, VP of IR and Corporate Development. Joining me today are CEO, Reed Hastings; CFO, David Wells; Chief Content Officer, Ted Sarandos; and Chief Product Officer, Greg Peters. Our interviewer this quarter is Eric Sheridan from UBS.", '11': 'As a reminder, we will be making forward-looking statements and actual results may vary.', '12': 'With that, over to you now Eric, for the first question.', '13': 'Question-and-Answer Session', '14': 'Q - Eric Sheridan'

In [50]:
testing_dataset =pd.DataFrame(data)
testing_dataset

Unnamed: 0,text,Sentiments
0,"Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings C...",Neutral
1,Executives,Neutral
10,Good afternoon and welcome to the Netflix Q3 2...,Positive
100,"So a couple questions; number one, what have y...",Positive
101,Reed Hastings,Neutral
102,"Ted, do you want to take that?",Neutral
103,Ted Sarandos,Neutral
104,"Yes, I would say that, one thing that we've le...",Positive
105,"So what we're learning more and more is that, ...",Positive
106,So that gives us – and we do it over many titl...,Positive


In [52]:
tokenized_headlines = []
for each in testing_dataset['text']:
    split = each.split()
    tokenized_headlines.append(split)
tokenized_headlines

[['Netflix,',
  'Inc.',
  '(NASDAQ:NFLX)',
  'Q3',
  '2018',
  'Earnings',
  'Conference',
  'Call',
  'October',
  '16,',
  '2018',
  '6:00',
  'PM',
  'ET'],
 ['Executives'],
 ['Good',
  'afternoon',
  'and',
  'welcome',
  'to',
  'the',
  'Netflix',
  'Q3',
  '2018',
  'Earnings',
  'Interview.',
  "I'm",
  'Spencer',
  'Wang,',
  'VP',
  'of',
  'IR',
  'and',
  'Corporate',
  'Development.',
  'Joining',
  'me',
  'today',
  'are',
  'CEO,',
  'Reed',
  'Hastings;',
  'CFO,',
  'David',
  'Wells;',
  'Chief',
  'Content',
  'Officer,',
  'Ted',
  'Sarandos;',
  'and',
  'Chief',
  'Product',
  'Officer,',
  'Greg',
  'Peters.',
  'Our',
  'interviewer',
  'this',
  'quarter',
  'is',
  'Eric',
  'Sheridan',
  'from',
  'UBS.'],
 ['So',
  'a',
  'couple',
  'questions;',
  'number',
  'one,',
  'what',
  'have',
  'you',
  'learned',
  'in',
  'some',
  'of',
  'the',
  'changes',
  'you',
  'made',
  'in',
  'marketing',
  'this',
  'year?',
  'And',
  'how',
  'do',
  'you',
  '

In [53]:
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
clean_tokenized = []
loweredtokens=[]
for item in tokenized_headlines:
    tokens = []
    for token in item:
        token = token.lower()
        for punc in punctuation:
            token = token.replace(punc, "")
        tokens.append(token)
    clean_tokenized.append(tokens)

In [55]:
unique_tokens = []
single_tokens = []
for each in clean_tokenized:
    for every in each:
        if every not in unique_tokens:
            unique_tokens.append(every)
counts = pd.DataFrame(0, index=np.arange(len(clean_tokenized)),columns=unique_tokens)

In [105]:
df=testing_dataset 
df=df.replace(to_replace='Neutral', value='Positive')
df

Unnamed: 0,text,Sentiments
0,"Netflix, Inc. (NASDAQ:NFLX) Q3 2018 Earnings C...",Positive
1,Executives,Positive
10,Good afternoon and welcome to the Netflix Q3 2...,Positive
100,"So a couple questions; number one, what have y...",Positive
101,Reed Hastings,Positive
102,"Ted, do you want to take that?",Positive
103,Ted Sarandos,Positive
104,"Yes, I would say that, one thing that we've le...",Positive
105,"So what we're learning more and more is that, ...",Positive
106,So that gives us – and we do it over many titl...,Positive


In [58]:
senti=df['Sentiments']
senti
count_P=0
COUNT_N=0

for each in senti:
    
    
    if each=='Positive':
        count_P+=1
    else:
        COUNT_N+=1
        
        
print('The number of positive sentiments',count_P)
print('Thenumber of negative sentiments',COUNT_N)

The number of positive sentiments 163
Thenumber of negative sentiments 13


In [78]:
Y=df['Sentiments']

In [79]:
for n,i in enumerate(Y):
    if i == 'Negative' :
        Y[n] = 0
    else:
        
        Y[n] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [80]:
Y

0      1
1      1
10     1
100    1
101    1
102    1
103    1
104    1
105    1
106    1
107    1
108    0
109    1
11     1
110    1
111    1
112    1
113    1
114    1
115    1
116    1
117    1
118    0
119    1
12     1
120    1
121    1
122    0
123    1
124    0
      ..
72     0
73     1
74     1
75     1
76     0
77     1
78     1
79     1
8      1
80     1
81     1
82     1
83     1
84     1
85     1
86     1
87     1
88     1
89     1
9      1
90     1
91     1
92     1
93     1
94     1
95     1
96     1
97     1
98     1
99     0
Name: Sentiments, Length: 176, dtype: object

In [63]:
word_index = imdb.get_word_index()
#word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 88584 unique tokens.


In [64]:
imdb_word=unique_tokens

In [65]:
netflix_data=[]
for word in imdb_word:
    if word in word_index:
        netflix_data.append(word)
    else:
        print(0)
        
print(netflix_data)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
['netflix', 'inc', 'earnings', 'conference', 'call', 'october', '16', '600', 'pm', 'et', 'executives', 'good', 'afternoon', 'and', 'welcome', 'to', 'the', 'interview', 'im', 'spencer', 'wang', 'vp', 'of', 'ir', 'corporate', 'development', 'joining', 'me', 'today', 'are', 'ceo', 'reed', 'hastings', 'david', 'wells', 'chief', 'content', 'officer', 'ted', 'product', 'greg', 'peters', 'our', 'interviewer', 'this', 'quarter', 'is', 'eric', 'sheridan', 'from', 'so', 'a', 'couple', 'questions', 'number', 'one', 'what', 'have', 'you', 'learned', 'in', 'some', 'changes', 'made', 'marketing', 'year', 'how', 'do', 'think', 'that', 'might', 'inform', 'go', 'market', 'as', 'company', 'going', 'forward', 'both', 'for', 'subscriber', 'growth', 'then', 'support', 'on', 'side', 'want', 'take', 'yes', 'i', 

In [66]:
Netflix_data=testing_dataset.text

In [71]:
from keras.preprocessing.text import Tokenizer
max_words = 10000
maxlen = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(Netflix_data)
sequences = tokenizer.texts_to_sequences(Netflix_data)

In [109]:
tokenizer.fit_on_texts('So a couple questions; number one, what have you learned in some of the changes you made in marketing this year? And how do you think that might inform how you go to market as a company going forward both for subscriber growth and then support on the content side?')
sequences1 = tokenizer.texts_to_sequences('So a couple questions; number one, what have you learned in some of the changes you made in marketing this year? And how do you think that might inform how you go to market as a company going forward both for subscriber growth and then support on the content side?')
data_test = pad_sequences(sequences1, maxlen=maxlen)
data_test
check_predict_net1=model.predict_classes(data_test)
len(check_predict_net1)

264

In [72]:
data_netflix = pad_sequences(sequences, maxlen=maxlen)

In [86]:
data_netflix

array([[   0,    0,    0, ...,  674,  675,  433],
       [   0,    0,    0, ...,    0,    0,  676],
       [   0,    0,    0, ...,   42,   36,  437],
       ...,
       [   0,    0,    0, ...,  129,  315,  662],
       [   0,    0,    0, ...,    0,   26,   42],
       [   0,    0,    0, ..., 1306,  164,  566]])

In [108]:
check_predict_net=model.predict_classes(data_netflix)
len(check_predict_net)

176

In [112]:
test_y=np.int8(Y)
test_pred=np.int8(check_predict_net.round())

In [113]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(test_y, test_pred)
cnf_matrix

array([[  2,  11],
       [101,  62]], dtype=int64)