In [1]:
# importing dependencies

import pandas as pd
import tensorflow as tf
import keras as keras
import numpy as np

In [2]:
# If you want to run this file locally, remove the uncommented code in this code block and
# uncomment the commented code.

# df = pd.read_csv('Cleaned_data/')

# Connecting to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# comment this code block if running locally. Otherwise upload combined.csv that can be found in Cleaned_data to your google drive.

df = pd.read_csv('/content/drive/MyDrive/combined.csv')

# uncomment this code if running locally

# df = pd.read_csv('cleaned_data/combined.csv')



In [5]:
# converting Sentence column to string type

df['Sentence']=df['Sentence'].astype(str)


In [4]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,what happens when the economic momentum ends,0
1,a sexist joke cost ken fisher 4 billion in ass...,1
2,mohawk mhk q3 earnings beat sales miss q4 view...,1
3,welltower and kisco senior living announce for...,2
4,10 stocks in trouble as americans cut back on ...,0


In [6]:
# There are significant class imbalances.
# Here we will use undersampling to correct this.

df['Sentiment'].value_counts()

1    23427
2    20922
0    13240
Name: Sentiment, dtype: int64

In [7]:
# Also, we only require headlines with positive and negative sentiment (refer to eda.ipynb for why we chose to do this)

neg_df = df[df['Sentiment']==0]
pos_df = df[df['Sentiment']==2]
pos_df['Sentiment'] = pos_df['Sentiment'].apply(lambda x: 1 if x==2 else 0)
n = neg_df.shape[0]
pos_df = pos_df.sample(n,random_state=1)
df = pd.concat([pos_df,neg_df],axis=0).sample(frac=1,random_state=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_df['Sentiment'] = pos_df['Sentiment'].apply(lambda x: 1 if x==2 else 0)


In [8]:
# we now have balanced classes

df['Sentiment'].value_counts()

0    13240
1    13240
Name: Sentiment, dtype: int64

In [9]:
# removing numbers and rows that are blank from each headline as they may confuse the NN.

df['Sentence'] = df['Sentence'].str.replace('\d+', '',regex=True)
df = df[df['Sentence']!=" "]
df = df[df['Sentence']!="  "]
df = df[df['Sentence']!="   "]

In [10]:
# splitting into training and testing

from sklearn.model_selection import train_test_split

X = df['Sentence'].to_numpy().reshape(-1, 1)
y = df['Sentiment'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1, stratify=y,random_state=1)

In [11]:
# putting each sentence from the training and testing dataframes into lists for the tokenizer

training_sentences=[]
testing_sentences=[]

for i in X_train:
  training_sentences.append(i[0])
for i in X_test:
  testing_sentences.append(i[0])


In [13]:
# importing tokenizer and pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# initialising tokenizer

tokenizer = Tokenizer(num_words=20000, oov_token='####')

In [14]:
vocab_size = 20000
embedding_dim = 16
max_length = 60
trunc_type='post'
padding_type='post'

In [15]:

# fitting tokenizer to training sentences

tokenizer.fit_on_texts(training_sentences)

# tokenizing the sentences (converting each word to a digit)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# padding each tokenzier. This

training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [21]:
# building nn

# intialising model
model =  tf.keras.Sequential()
# this layer embeds each word in a 16 dimensional space
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# the output of the embedding layer is a 60 by 16 matrix for each sentence.
# the GlobalAveragePooling1D layer then takes each matrix and and averages across each row
# to create a 16 dimensional vector for each sentence
model.add(tf.keras.layers.GlobalAveragePooling1D())
# This layer then takes the output of the GlobalAveragePooling1D and is just a hidden layer
model.add(tf.keras.layers.Dense(15, activation='relu'))
# since we are doing binary classification we have chosen a single neuron output layer with sigmoid activation function
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 60, 16)            320000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_4 (Dense)             (None, 15)                255       
                                                                 
 dense_5 (Dense)             (None, 1)                 16        
                                                                 
Total params: 320271 (1.22 MB)
Trainable params: 320271 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:

# fitting model

num_epochs = 4
history = model.fit(training_padded, y_train, epochs=num_epochs, validation_data=(testing_padded, y_test), verbose=2)

Epoch 1/4
745/745 - 5s - loss: 0.6147 - accuracy: 0.7002 - val_loss: 0.4466 - val_accuracy: 0.8353 - 5s/epoch - 7ms/step
Epoch 2/4
745/745 - 4s - loss: 0.3410 - accuracy: 0.8713 - val_loss: 0.3292 - val_accuracy: 0.8614 - 4s/epoch - 6ms/step
Epoch 3/4
745/745 - 5s - loss: 0.2431 - accuracy: 0.9091 - val_loss: 0.3079 - val_accuracy: 0.8761 - 5s/epoch - 7ms/step
Epoch 4/4
745/745 - 4s - loss: 0.1968 - accuracy: 0.9305 - val_loss: 0.3182 - val_accuracy: 0.8739 - 4s/epoch - 5ms/step


In [30]:
# creating classification report

y_prediction = model.predict(testing_padded)

from sklearn.metrics import classification_report

# function that rounds output of NN
def rounding(output):
  rounded=[]
  for score in output:
    rounded.append(round(score[0]))
  return rounded


print(classification_report(y_test, rounding(y_prediction)))


              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1324
           1       0.91      0.83      0.87      1324

    accuracy                           0.87      2648
   macro avg       0.88      0.87      0.87      2648
weighted avg       0.88      0.87      0.87      2648



In [28]:
# Saving model

model.save('positive_negative_model.h5')

  saving_api.save_model(


In [29]:
import json

# Saving tokenizer

tokenizer_json = tokenizer.to_json()
with open('new_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))


