In [30]:
import pandas as pd
import tensorflow as tf
import keras as keras
import numpy as np

In [31]:

# Load the combined, pre-cleaned dataset; later cells read its 'Sentence'
# and 'Sentiment' columns. The path is relative to the notebook's working directory.
df = pd.read_csv('Cleaned_data/combined_df.csv')



In [32]:
# Class-balance check: label 1 outnumbers label 0 by roughly 3,500 rows
# (9761 vs 6201 in the output below), i.e. there are more positive than
# negative articles. The dataset is therefore imbalanced, which is a problem
# for training and for interpreting plain accuracy.

df['Sentiment'].value_counts()

1    9761
0    6201
Name: Sentiment, dtype: int64

In [33]:
# Preview the raw 'Sentence' text before the cleaning step in the next cell.
df["Sentence"]

0        Mid-cap funds can deliver more, stay put: Experts
1                   Mid caps now turn into market darlings
2                Hudco raises Rs 279 cr via tax-free bonds
3           EXL beats profit estimates, cuts sales outlook
4        Would stick to banking: Girish Pai, Centrum Br...
                               ...                        
15957    Industry body CII said #discoms are likely to ...
15958    #Gold prices slip below Rs 46,000 as #investor...
15959    Workers at Bajaj Auto have agreed to a 10% wag...
15960    #Sharemarket LIVE: Sensex off day’s high, up 6...
15961    #Sensex, #Nifty climb off day's highs, still u...
Name: Sentence, Length: 15962, dtype: object

In [34]:
# Normalise the 'Sentence' text before tokenisation:
#   1. strip punctuation / symbol characters,
#   2. drop isolated " s " tokens,
#   3. collapse runs of spaces to a single space,
#   4. lower-case everything.

non_alphanum = [',','.','/','"',':',';','!','@','#','$','%',"'","*","(",")","&","--"]
for char in non_alphanum:
  # regex=False: every entry is meant as literal text. Several of them
  # ('.', '$', '*', '(', ')') are regex metacharacters, so relying on the
  # version-dependent default of `regex` is unsafe.
  df['Sentence'] = df['Sentence'].str.replace(char, "", regex=False)

# Remove stand-alone "s" tokens (presumably left over from possessives once
# the apostrophe is gone -- TODO confirm against the raw data).
df['Sentence'] = df['Sentence'].str.replace(" s ", " ", regex=False)

# Collapse any run of two or more spaces in a single pass. Replacing "  " and
# then "   " separately leaves double spaces behind: three spaces become two
# after the first pass and the second pass no longer matches.
df['Sentence'] = df['Sentence'].str.replace(r" {2,}", " ", regex=True)

df['Sentence'] = df['Sentence'].str.lower()

  df['Sentence'] = df['Sentence'].str.replace(char,"")


In [35]:
# Splitting into training (90%) and testing (10%) sets.
# The split is stratified on the label so both sets keep the same class
# ratio, and seeded so that re-running the notebook yields the same split.

from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixes only the train/test split, not the model training

# Both columns are reshaped to (n, 1) column arrays; the next cell unwraps them.
X = df['Sentence'].to_numpy().reshape(-1, 1)
y = df['Sentiment'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SPLIT_SEED
)

In [36]:
# Unwrap the (n, 1) column arrays produced by the split into flat Python
# lists: one sentence or one label per entry.

training_sentences = [row[0] for row in X_train]
training_labels = [row[0] for row in y_train]
testing_sentences = [row[0] for row in X_test]
testing_labels = [row[0] for row in y_test]



In [37]:
# Keras text-preprocessing helpers: Tokenizer builds the word -> index
# vocabulary, pad_sequences brings every sequence to a common length.

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Create the (not yet fitted) tokenizer. Words outside the vocabulary are
# mapped to the '####' out-of-vocabulary token.

tokenizer = Tokenizer(
    num_words=10000,
    oov_token='####',
)

In [38]:
# Hyper-parameters shared by the padding step and the model definition below.
vocab_size = 10000  # Embedding input dimension; same value as the Tokenizer's num_words
embedding_dim = 16  # Embedding output dimension (size of each word vector)
max_length = 100  # every sequence is padded / truncated to this many tokens
trunc_type='post'  # passed to pad_sequences as `truncating`
padding_type='post'  # passed to pad_sequences as `padding`

In [39]:
import json

# Build the vocabulary from the training sentences only, then keep a handle
# on the resulting word -> index dictionary.
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Persist the fitted tokenizer next to the notebook.
# NOTE(review): the result of to_json() is passed through the json encoder
# again before being written -- if to_json() already returns a JSON string,
# the file holds a quoted JSON string and a loader must json.load() it first.
# Confirm against the code that reads tokenizer.json before changing this.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

# Convert sentences to integer sequences and pad / truncate them with the
# same settings for both splits.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_kwargs)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

In [40]:
# word_index is a dictionary that indexes each word found in the training
# sentences. Printing it whole floods the notebook output, so report its size
# and display only a small sample of entries.

print(f"{len(word_index)} distinct tokens in word_index")
dict(list(word_index.items())[:20])



In [41]:
import numpy as np

# Padded feature matrices, wrapped as NumPy arrays.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)

# Label lists converted to NumPy arrays so they can be passed to model.fit.
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

In [42]:
# Network definition: embedding -> average over the sequence axis ->
# small ReLU hidden layer -> single sigmoid output for the binary label.

model = tf.keras.Sequential([
    # (None, max_length) token ids -> (None, max_length, embedding_dim)
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # average the word vectors of a sentence -> (None, embedding_dim)
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_2 (Dense)             (None, 10)                170       
                                                                 


 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 160181 (625.71 KB)
Trainable params: 160181 (625.71 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
# Train the network. The held-out test split is passed as validation data,
# so the val_* numbers in the log are measured on the test set.

num_epochs = 5
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/5
449/449 - 1s - loss: 0.6668 - accuracy: 0.6115 - val_loss: 0.6597 - val_accuracy: 0.6118 - 527ms/epoch - 1ms/step
Epoch 2/5
449/449 - 0s - loss: 0.6332 - accuracy: 0.6223 - val_loss: 0.5859 - val_accuracy: 0.7138 - 308ms/epoch - 685us/step
Epoch 3/5
449/449 - 0s - loss: 0.5003 - accuracy: 0.7821 - val_loss: 0.4502 - val_accuracy: 0.8372 - 305ms/epoch - 680us/step
Epoch 4/5
449/449 - 0s - loss: 0.3683 - accuracy: 0.8640 - val_loss: 0.3998 - val_accuracy: 0.8209 - 306ms/epoch - 682us/step
Epoch 5/5
449/449 - 0s - loss: 0.2983 - accuracy: 0.8917 - val_loss: 0.3500 - val_accuracy: 0.8566 - 297ms/epoch - 662us/step


In [44]:
# Persist the trained model to 'first_model.h5' in the working directory.
# NOTE(review): the output fragment under this cell looks like a truncated
# warning raised from the saving API -- presumably about the HDF5 format;
# confirm before switching formats, since loaders may expect this file name.
model.save('first_model.h5')  # Save the model as an HDF5 file


  saving_api.save_model(
