# Sentiment Analysis

In [1]:
# Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('/content/tweets.csv')

# Data Understanding

In [2]:
print("Number of text entries:", len(data))


Number of text entries: 7920


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [4]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Preprocessing

In [5]:
# Preprocess Tweets
def clean_tweet(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'@\w+|#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Lowercase
    return text

data['clean_tweet'] = data['tweet'].apply(clean_tweet)


In [6]:
print(data.columns)


Index(['id', 'label', 'tweet', 'clean_tweet'], dtype='object')


In [7]:
data.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beaut...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks to ...
2,3,0,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememories un...
3,4,0,I'm wired I know I'm George I was made that wa...,im wired i know im george i was made that way ...
4,5,1,What amazing service! Apple won't even talk to...,what amazing service apple wont even talk to m...


# Data Split

In [8]:
# Split Data

X_train, X_test, y_train, y_test = train_test_split(
    data['clean_tweet'], data['label'], test_size=0.2, random_state=42
)

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# Fit the tokenizer on  training data
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Now pad the sequences
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=300)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=300)


RNN

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_shape=(300,)))
model.add(SimpleRNN(128, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


  super().__init__(**kwargs)


In [11]:
model.summary()

In [12]:
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [13]:
# Train the model using the tokenized and padded training data
model.fit(X_train_pad, y_train, batch_size=256, epochs=10, validation_data=(X_test_pad, y_test))


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.7324 - loss: 0.5927 - val_accuracy: 0.7191 - val_loss: 0.5752
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.7487 - loss: 0.5281 - val_accuracy: 0.7601 - val_loss: 0.4880
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.7998 - loss: 0.4248 - val_accuracy: 0.8598 - val_loss: 0.3326
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.8746 - loss: 0.2884 - val_accuracy: 0.8687 - val_loss: 0.2963
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.8922 - loss: 0.2524 - val_accuracy: 0.8750 - val_loss: 0.2819
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.9080 - loss: 0.2181 - val_accuracy: 0.8788 - val_loss: 0.2761
Epoch 7/10
[1m25/25[0m [32m━━━

<keras.src.callbacks.history.History at 0x7d69bbc4c7d0>

In [14]:
model.evaluate(X_test_pad,y_test)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8675 - loss: 0.2818


[0.31058549880981445, 0.8604797720909119]

LSTM

In [15]:
from tensorflow.keras.layers import LSTM

In [16]:
model1 = Sequential()
model1.add(Embedding(input_dim = 10000,output_dim = 100,input_shape=(300,)))
model1.add(LSTM(50,return_sequences = True))
model1.add(LSTM(128,dropout = 0.2))
model1.add(Dense(1,activation='sigmoid'))
model1.summary()


In [17]:
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['accuracy'])

In [18]:
model1.fit(X_train_pad, y_train, batch_size=256, epochs=10, validation_data=(X_test_pad, y_test))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 73ms/step - accuracy: 0.7377 - loss: 0.5707 - val_accuracy: 0.8182 - val_loss: 0.4160
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.8497 - loss: 0.3502 - val_accuracy: 0.8718 - val_loss: 0.2885
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.8811 - loss: 0.2584 - val_accuracy: 0.8756 - val_loss: 0.2674
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.9041 - loss: 0.2226 - val_accuracy: 0.8864 - val_loss: 0.2576
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 0.9108 - loss: 0.2166 - val_accuracy: 0.8826 - val_loss: 0.2771
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 0.9152 - loss: 0.2044 - val_accuracy: 0.8883 - val_loss: 0.2639
Epoch 7/10
[1m25/25[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7d69ac2b3350>

In [19]:
model1.evaluate(X_test_pad,y_test)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8823 - loss: 0.2824


[0.302038311958313, 0.8756313323974609]

GRU

In [20]:
from tensorflow.keras.layers import GRU

In [21]:
model2 = Sequential()
model2.add(Embedding(input_dim = 10000,output_dim = 100,input_shape=(300,)))
model2.add(GRU(128,dropout=0.2))
model2.add(Dense(1,activation = 'sigmoid'))
model2.summary()

In [22]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['accuracy'])

In [23]:
model2.fit(X_train_pad, y_train, batch_size=256, epochs=10, validation_data=(X_test_pad, y_test))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 0.7192 - loss: 0.5923 - val_accuracy: 0.8289 - val_loss: 0.3604
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8622 - loss: 0.3063 - val_accuracy: 0.8737 - val_loss: 0.2878
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.8968 - loss: 0.2411 - val_accuracy: 0.8807 - val_loss: 0.2733
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.9015 - loss: 0.2206 - val_accuracy: 0.8838 - val_loss: 0.2692
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.9125 - loss: 0.2074 - val_accuracy: 0.8763 - val_loss: 0.2733
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.9145 - loss: 0.1989 - val_accuracy: 0.8813 - val_loss: 0.2825
Epoch 7/10
[1m25/25[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7d69a06a9d90>

In [24]:
model2.evaluate(X_test_pad,y_test)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8829 - loss: 0.2727


[0.30348125100135803, 0.8737373948097229]

Conv 1D

In [25]:
from tensorflow.keras.layers import Conv1D,Dropout,GlobalMaxPooling1D

In [26]:
model3 = Sequential()
model3.add(Embedding(input_dim = 10000,output_dim = 100,input_shape=(300,)))
model3.add(Conv1D(64,kernel_size = 3,activation='relu'))
model3.add(Dropout(0.2))

model3.add(Conv1D(64,kernel_size = 3,activation='relu'))
model3.add(Dropout(0.2))

model3.add(GlobalMaxPooling1D())

model3.add(Dense(128,activation='relu'))
model3.add(Dense(1,activation='relu'))


In [27]:
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['accuracy'])

In [28]:
model3.fit(X_train_pad, y_train, batch_size=256, epochs=10, validation_data=(X_test_pad, y_test))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 199ms/step - accuracy: 0.7493 - loss: 0.6111 - val_accuracy: 0.7273 - val_loss: 0.5122
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7827 - loss: 0.4160 - val_accuracy: 0.8270 - val_loss: 0.3753
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8713 - loss: 0.3331 - val_accuracy: 0.8327 - val_loss: 0.3590
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8943 - loss: 0.2615 - val_accuracy: 0.8131 - val_loss: 0.3699
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9067 - loss: 0.2325 - val_accuracy: 0.8687 - val_loss: 0.4150
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9258 - loss: 0.2176 - val_accuracy: 0.8681 - val_loss: 0.4287
Epoch 7/10
[1m25/25[0m [32m━━

<keras.src.callbacks.history.History at 0x7d69a05647d0>

In [29]:
model3.evaluate(X_test_pad,y_test)

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8792 - loss: 0.4868


[0.5911757349967957, 0.8617424368858337]

# Prediction

In [30]:
#prediction
def predict_sentiment(text):
    clean = clean_tweet(text)
    seq = tokenizer.texts_to_sequences([clean])
    pad = sequence.pad_sequences(seq, maxlen=300)
    pred_prob = model2.predict(pad)[0][0]
    return "Negative 😞" if pred_prob > 0.5 else "Positive 😊"

In [31]:
print(predict_sentiment("This latest iOS update completely ruined my iPhone—apps keep crashing and the battery drains so fast. Worst update ever!"))
print(predict_sentiment("Feeling so happy and grateful for everything we’ve achieved together!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
Negative 😞
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Positive 😊
