In [231]:
import pandas as pd
import numpy as np

df = pd.read_csv('Cleaned_data/combined_df.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,"Mid-cap funds can deliver more, stay put: Experts",1
1,Mid caps now turn into market darlings,1
2,Hudco raises Rs 279 cr via tax-free bonds,1
3,"EXL beats profit estimates, cuts sales outlook",1
4,"Would stick to banking: Girish Pai, Centrum Br...",1


In [232]:
df['Sentiment'].value_counts()

1    9761
0    6201
Name: Sentiment, dtype: int64

### Downsample the Dependent variable

In [233]:
from sklearn.utils import resample

# Separate the majority (class 1) and minority (class 0) samples
df_majority = df[df['Sentiment'] == 1]
df_minority = df[df['Sentiment'] == 0]

# Downsample the majority class to match the count of the minority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,  
                                   n_samples=len(df_minority),  
                                   random_state=42)  

# Combine the downsampled majority class with the original minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_downsampled['Sentiment'].value_counts()


1    6201
0    6201
Name: Sentiment, dtype: int64

In [234]:
# removing non-alphanumeric characters since not needed

non_alphanum = [',','.','/','"',':',';','!','$', '%', '@','#',"'","*","(",")","&","--"]
for char in non_alphanum:
  df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace(char,"")

df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace(" s "," ")
df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace(" '","'")
df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace("  "," ")
df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace("   "," ")
df_downsampled['Sentence'] = df_downsampled['Sentence'].str.lower()

  df_downsampled['Sentence'] = df_downsampled['Sentence'].str.replace(char,"")


In [235]:
# split into training and testing

from sklearn.model_selection import train_test_split

X = df_downsampled['Sentence'].to_numpy().reshape(-1, 1)
y = df_downsampled['Sentiment'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [236]:
# putting each sentence and sentiment from the training and testing dataframes into lists

training_sentences=[]
testing_sentences=[]
training_labels=[]
testing_labels=[]

for i in X_train:
  training_sentences.append(i[0])
for i in y_train:
  training_labels.append(i[0])
for i in X_test:
  testing_sentences.append(i[0])
for i in y_test:
  testing_labels.append(i[0])

In [237]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [238]:
# initialising tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='####')

In [239]:
# Define model parameters
vocab_size = 10000  # Size of vocabulary
embedding_dim = 100  # Dimension of word embeddings
max_length = 300
sequence_length = 300  # Length of input sequences
num_classes = 2  # Number of sentiment classes (e.g., positive and negative)
trunc_type='post'
padding_type='post'

In [240]:
# fitting tokenizer to training sentences
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

# padding
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [241]:

training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [242]:
# # Create a feed forward nn model

# model =  tf.keras.Sequential()
# model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# model.add(tf.keras.layers.GlobalAveragePooling1D())
# model.add(tf.keras.layers.Dense(10, activation='relu'))
# model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [243]:
# # Create a RNN model using GRU
# model = Sequential()

# # Add an Embedding layer for word embeddings
# model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))

# # Add GRU layer
# model.add(GRU(64, return_sequences=True))  
# model.add(GRU(64))  

# # Add a Dense layer for classification
# model.add(Dense(1, activation='sigmoid'))

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [244]:
# Create a RNN model using LSTM
model = Sequential()

# Add an Embedding layer for word embeddings
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))

# Add LSTM layer 
model.add(LSTM(64, return_sequences=True))  
model.add(LSTM(64))  

# Add a Dense layer for classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 300, 100)          1000000   
                                                                 
 lstm_14 (LSTM)              (None, 300, 64)           42240     
                                                                 
 lstm_15 (LSTM)              (None, 64)                33024     
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1075329 (4.10 MB)
Trainable params: 1075329 (4.10 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [245]:
# fitting model

num_epochs = 10
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/10
291/291 - 45s - loss: 0.6934 - accuracy: 0.5012 - val_loss: 0.6932 - val_accuracy: 0.5002 - 45s/epoch - 155ms/step
Epoch 2/10
291/291 - 44s - loss: 0.6932 - accuracy: 0.5038 - val_loss: 0.6931 - val_accuracy: 0.5002 - 44s/epoch - 151ms/step
Epoch 3/10
291/291 - 44s - loss: 0.6933 - accuracy: 0.4969 - val_loss: 0.6932 - val_accuracy: 0.5002 - 44s/epoch - 150ms/step
Epoch 4/10
291/291 - 44s - loss: 0.6933 - accuracy: 0.5002 - val_loss: 0.6932 - val_accuracy: 0.5002 - 44s/epoch - 150ms/step
Epoch 5/10
291/291 - 44s - loss: 0.6933 - accuracy: 0.4836 - val_loss: 0.6931 - val_accuracy: 0.5002 - 44s/epoch - 150ms/step
Epoch 6/10
291/291 - 44s - loss: 0.6932 - accuracy: 0.4945 - val_loss: 0.6932 - val_accuracy: 0.4998 - 44s/epoch - 150ms/step
Epoch 7/10
291/291 - 44s - loss: 0.6932 - accuracy: 0.4949 - val_loss: 0.6931 - val_accuracy: 0.4998 - 44s/epoch - 150ms/step
Epoch 8/10
291/291 - 44s - loss: 0.6932 - accuracy: 0.4965 - val_loss: 0.6931 - val_accuracy: 0.5002 - 44s/epoch - 150