In [37]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Load the data
data = pd.read_csv('Sentiment.csv')



In [38]:
data.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [39]:
# Filter relevant columns
data = data[['candidate', 'sentiment', 'text']]


In [43]:
data['sentiment'].isna().sum()

0

In [44]:
# Preprocess the text data
import nltk
nltk.download('stopwords')
# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

data['text'] = data['text'].apply(preprocess_text)

# Check for missing values in 'sentiment'
missing_sentiment = data['sentiment'].isnull().sum()
print(f"Number of missing values in 'sentiment': {missing_sentiment}")

Number of missing values in 'sentiment': 0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gayatri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Check unique values and handle unexpected values
print("Unique values in 'sentiment' before mapping:", data['sentiment'].unique())

# Map sentiment labels to numerical values
data['sentiment'] = data['sentiment'].str.lower().str.strip().map({'positive': 1, 'neutral': 0, 'negative': -1})

# Check unique values after mapping
print("Unique values in 'sentiment' after mapping:", data['sentiment'].unique())

Unique values in 'sentiment' before mapping: ['Neutral' 'Positive' 'Negative']
Unique values in 'sentiment' after mapping: [ 0  1 -1]


In [46]:
# Tokenization and padding
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words, lower=True, oov_token='UNK')
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)

In [52]:
# Model development
embedding_dim = 100

model = Sequential()
model.add(Embedding(max_words, embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

In [53]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

None


In [55]:
y = to_categorical(data['sentiment'].values, num_classes=3)

X_train, X_val, y_train, y_val = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)


In [56]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=2)

Epoch 1/10
174/174 - 27s - 153ms/step - accuracy: 0.6325 - loss: 0.8525 - val_accuracy: 0.6757 - val_loss: 0.7625
Epoch 2/10
174/174 - 20s - 118ms/step - accuracy: 0.7059 - loss: 0.6871 - val_accuracy: 0.6933 - val_loss: 0.7266
Epoch 3/10
174/174 - 19s - 111ms/step - accuracy: 0.7546 - loss: 0.5862 - val_accuracy: 0.6951 - val_loss: 0.7557
Epoch 4/10
174/174 - 25s - 144ms/step - accuracy: 0.7815 - loss: 0.5251 - val_accuracy: 0.6854 - val_loss: 0.7795
Epoch 5/10
174/174 - 17s - 100ms/step - accuracy: 0.8028 - loss: 0.4754 - val_accuracy: 0.6735 - val_loss: 0.8474
Epoch 6/10
174/174 - 17s - 100ms/step - accuracy: 0.8186 - loss: 0.4420 - val_accuracy: 0.6634 - val_loss: 0.8685
Epoch 7/10
174/174 - 17s - 99ms/step - accuracy: 0.8308 - loss: 0.4127 - val_accuracy: 0.6735 - val_loss: 0.9367
Epoch 8/10
174/174 - 18s - 106ms/step - accuracy: 0.8406 - loss: 0.3900 - val_accuracy: 0.6714 - val_loss: 0.9561
Epoch 9/10
174/174 - 17s - 98ms/step - accuracy: 0.8502 - loss: 0.3676 - val_accuracy: 0.