In [7]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [5]:
data = pd.read_csv("/content/Copy of Sentiment.csv", encoding='latin-1')
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [10]:
# Preprocess text data
def preprocess_text(text):
    # Check if text is a string before applying lower()
    if isinstance(text, str):
        text = text.lower()  # lowercase convertion
        text = re.sub(r"[^a-z0-9 ]", "", text)  # eliminating punctuation and non-alphanumeric characters
        text = text.strip()  # Remove leading/trailing whitespace
        return text
    else:
        return "" # Return empty string for non-string values

data["text"] = data["text"].apply(preprocess_text)

In [11]:
X = data["text"]
y = data["sentiment"]

In [12]:
# tokenizer
max_features = 10000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

# Convert text to sequences of integers
X_sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to a fixed length
max_len = 200
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding="post")

In [13]:
from sklearn.preprocessing import OneHotEncoder

if len(set(y)) > 2:
    encoder = OneHotEncoder(sparse=False)
    y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))
else:
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)



In [14]:
# Define the Bi-LSTM model
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_len))  # Embedding layer
model.add(Bidirectional(LSTM(64, return_sequences=True)))  # Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32)))  # Another Bidirectional LSTM layer
model.add(Dense(len(set(y)), activation="softmax"))  # Output layer (modify activation for multi-class)

# Compile the model
model.compile(loss="categorical_crossentropy" if len(set(y)) > 2 else "binary_crossentropy",
              optimizer="adam", metrics=["accuracy"])

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))



Epoch 1/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 37ms/step - accuracy: 0.5556 - loss: 0.9029 - val_accuracy: 0.7138 - val_loss: 0.6871
Epoch 2/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 38ms/step - accuracy: 0.7705 - loss: 0.5716 - val_accuracy: 0.7213 - val_loss: 0.6730
Epoch 3/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 38ms/step - accuracy: 0.8309 - loss: 0.4537 - val_accuracy: 0.7057 - val_loss: 0.7247
Epoch 4/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 38ms/step - accuracy: 0.8794 - loss: 0.3536 - val_accuracy: 0.6927 - val_loss: 0.8163
Epoch 5/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 36ms/step - accuracy: 0.9071 - loss: 0.2702 - val_accuracy: 0.6982 - val_loss: 0.8959
Epoch 6/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 38ms/step - accuracy: 0.9248 - loss: 0.2235 - val_accuracy: 0.6887 - val_loss: 1.0030
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7a5310233fa0>

In [15]:
# model evaluation
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6692741513252258


In [17]:
def predict_sentiment(text):
    text = preprocess_text(text)
    text_sequence = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(text_sequence, maxlen=max_len, padding="post")
    prediction = model.predict(text_padded)
    if len(set(y)) > 2:
      predicted_class = encoder.inverse_transform(prediction)
    else:
      predicted_class = encoder.inverse_transform(prediction.round().astype(int))
    return predicted_class[0]

# Example usage
new_text = "This movie was fantastic!"
predicted_sentiment = predict_sentiment(new_text)
print("Predicted sentiment:", predicted_sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Predicted sentiment: ['positive']
