In [1]:
import pandas as pd
import tensorflow as tf
import keras as keras
import numpy as np

In [2]:
# To run this notebook locally, comment out the Colab-specific code in this cell and
# uncomment the local read_csv line below.
# NOTE(review): the local path below ends in a directory with no file name — fill in the CSV name.

# df = pd.read_csv('Cleaned_data/')

# Connect to Google Drive (Colab only) so files under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Colab-only cell: comment it out when running locally.
# Reads the combined dataset from the mounted Drive and forces the text column to str.
# NOTE(review): with astype(str) any missing sentence presumably becomes the literal
# string 'nan' — confirm the CSV has no empty 'Sentence' values.
df = pd.read_csv('/content/drive/MyDrive/combined_df.csv').astype({'Sentence': str})

In [4]:
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,Sentence,Sentiment
0,operating result including nonrecurring items ...,1
1,schw adding to long,1
2,dow suffers biggest fall since january as trad...,0
3,jspl surges as delhi hc stops auction of two c...,1
4,opec price plan may be the first step towards ...,1


In [5]:
# The classes are imbalanced: there are roughly 1,000 more positive (1) than negative (0)
# sentences, which is a problem because it can bias the classifier towards the larger class.

df['Sentiment'].value_counts()

1    9758
0    8744
Name: Sentiment, dtype: int64

In [20]:
# Separate the rows by label so the two classes can be balanced
neg_df = df.loc[df['Sentiment'].eq(0)]
pos_df = df.loc[df['Sentiment'].eq(1)]

In [24]:
# Number of negative rows; used as the target size for the positive class
n = len(neg_df)

In [25]:
# Downsample the positive rows to n (seeded, so the same rows are picked every run).
# Relies on the positive class having at least n rows; sampling without replacement
# would raise otherwise.
pos_df = pos_df.sample(n=n, random_state=1)

In [27]:
# Recombine the two equally sized classes and shuffle the rows.
# random_state pins the shuffle so a Restart & Run All yields the same row order.
df = pd.concat([pos_df, neg_df], axis=0).sample(frac=1, random_state=1)

In [39]:
# Check that both classes now have the same number of rows
df['Sentiment'].value_counts()

1    8744
0    8744
Name: Sentiment, dtype: int64

In [28]:
# Split into training (90%) and held-out testing (10%) sets

from sklearn.model_selection import train_test_split

# Both arrays are reshaped into single-column 2-D arrays of shape (n, 1)
X = df['Sentence'].to_numpy().reshape(-1, 1)
y = df['Sentiment'].to_numpy().reshape(-1, 1)

# stratify=y keeps the class ratio the same in both splits; random_state fixes the
# split so the reported metrics can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.1, stratify=y, random_state=1
)

In [29]:
# Flatten the (n, 1) split arrays into plain Python lists, one element per row:
# sentences for the tokenizer, labels for the model targets.

training_sentences = list(X_train[:, 0])
training_labels = list(y_train[:, 0])
testing_sentences = list(X_test[:, 0])
testing_labels = list(y_test[:, 0])


In [30]:
# importing tokenizer and pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# initialising tokenizer
# num_words caps the vocabulary used when encoding texts (same value as vocab_size, which
# sizes the embedding layer); oov_token is the placeholder for words not seen during fitting.

tokenizer = Tokenizer(num_words=20000, oov_token='####')

In [31]:
# Settings shared by the padding step and the model definition
vocab_size = 20000  # input dimension of the embedding layer (matches the Tokenizer's num_words)
embedding_dim = 16  # length of each embedding vector
max_length = 60  # every sequence is padded/truncated to this many tokens
trunc_type='post'  # sequences longer than max_length lose tokens from the end
padding_type='post'  # shorter sequences are padded at the end

In [32]:

# Learn the vocabulary from the training sentences only
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding/truncation settings applied identically to both splits
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Encode each split as integer sequences, then pad/truncate them to max_length
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [33]:
# Ensure every model input is a NumPy array (the labels were collected as Python lists).
# numpy is already imported as np in the first cell, so it is not re-imported here;
# np.asarray converts lists and passes existing arrays through without copying them.
training_padded = np.asarray(training_padded)
training_labels = np.asarray(training_labels)
testing_padded = np.asarray(testing_padded)
testing_labels = np.asarray(testing_labels)

In [34]:
from keras.regularizers import l1, l2, l1_l2

In [35]:
# Build the network: embedding -> average over the sequence -> small dense head.
# Only public tf.keras APIs are used; imports from the private keras.src package are
# avoided because that path is not a stable interface across Keras versions.

model = tf.keras.Sequential([
  # Maps each token id (< vocab_size) to an embedding_dim-sized vector
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
  # Averages the token vectors into one fixed-size vector per sentence
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(15, activation='relu'),
  # Single sigmoid unit: output is a score in [0, 1] for the binary label
  tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 60, 16)            320000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_2 (Dense)             (None, 15)                255       
                                                                 
 dense_3 (Dense)             (None, 1)                 16        
                                                                 
Total params: 320271 (1.22 MB)
Trainable params: 320271 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:

# Train for a fixed number of epochs, reporting loss/accuracy once per epoch.
# NOTE(review): the test split is also passed as validation data, so the validation
# metrics are computed on the same rows used for the final evaluation.

num_epochs = 5
history = model.fit(
  training_padded,
  training_labels,
  epochs=num_epochs,
  validation_data=(testing_padded, testing_labels),
  verbose=2,
)

Epoch 1/5
492/492 - 4s - loss: 0.6710 - accuracy: 0.6472 - val_loss: 0.6017 - val_accuracy: 0.7867 - 4s/epoch - 8ms/step
Epoch 2/5
492/492 - 4s - loss: 0.4494 - accuracy: 0.8379 - val_loss: 0.3862 - val_accuracy: 0.8411 - 4s/epoch - 9ms/step
Epoch 3/5
492/492 - 3s - loss: 0.2851 - accuracy: 0.8957 - val_loss: 0.3425 - val_accuracy: 0.8485 - 3s/epoch - 6ms/step
Epoch 4/5
492/492 - 3s - loss: 0.2150 - accuracy: 0.9236 - val_loss: 0.3244 - val_accuracy: 0.8628 - 3s/epoch - 6ms/step
Epoch 5/5
492/492 - 3s - loss: 0.1716 - accuracy: 0.9411 - val_loss: 0.3256 - val_accuracy: 0.8651 - 3s/epoch - 6ms/step


In [43]:
# Sanity check on a hand-written headline that is not taken from the dataset.
# It goes through the same tokenise -> pad steps as the training data; the printed
# value is the model's sigmoid score.

sentence = ["irs claims microsoft owes up to 29B in back taxes"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
  sequences,
  maxlen=max_length,
  padding=padding_type,
  truncating=trunc_type,
)
prediction = model.predict(padded)
print(prediction)

[[0.36198083]]


In [82]:
from sklearn.metrics import confusion_matrix

def rounding(output):
  """Round the first element of every row in ``output`` to the nearest integer.

  Args:
    output: iterable of indexable rows (e.g. the (n, 1) array returned by
      ``model.predict``); only ``row[0]`` is read.

  Returns:
    A list with one rounded value per row, in the same order. Uses the built-in
    ``round``, so ties go to the nearest even integer (0.5 -> 0).
  """
  return [round(row[0]) for row in output]

# Score the held-out test sentences and turn the scores into 0/1 labels
y_prediction = model.predict(testing_padded)
predicted_labels = rounding(y_prediction)

# Confusion matrix (rows = true labels, columns = predicted labels);
# normalize='pred' divides each column by its total.
result = confusion_matrix(
  testing_labels,
  predicted_labels,
  normalize='pred',
)



In [84]:
# Rows are the true labels and columns the predicted labels; because the matrix was
# normalised with normalize='pred', each column sums to 1.
print(result)

[[0.86219081 0.14271457]
 [0.13780919 0.85728543]]


In [86]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split
report = classification_report(testing_labels, rounding(y_prediction))
print(report)


              precision    recall  f1-score   support

           0       0.86      0.84      0.85       875
           1       0.86      0.88      0.87       976

    accuracy                           0.86      1851
   macro avg       0.86      0.86      0.86      1851
weighted avg       0.86      0.86      0.86      1851

