###IMPORTING NECESSARY LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from  tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

In [2]:
data = pd.read_csv('tweet_product_company.csv', encoding = 'latin-1')
pd.set_option('display.max_colwidth', None)
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


###DATA PREPROCESSING

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [4]:
data.isna().sum()

Unnamed: 0,0
tweet_text,1
emotion_in_tweet_is_directed_at,5802
is_there_an_emotion_directed_at_a_brand_or_product,0


As there is so much of null values in 2 nd attribute we are dropping column and also only one error in tweet_text column so ignoring it.

In [5]:
data["emotion_in_tweet_is_directed_at"].value_counts()

Unnamed: 0_level_0,count
emotion_in_tweet_is_directed_at,Unnamed: 1_level_1
iPad,946
Apple,661
iPad or iPhone App,470
Google,430
iPhone,297
Other Google product or service,293
Android App,81
Android,78
Other Apple product or service,35


As this column doesn't provide much of any information we are dropping it.

In [6]:
data = data.drop(["emotion_in_tweet_is_directed_at"], axis = 1)
data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


In [7]:
## Renaming column name for easy analysis.

data = data.rename(columns={'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'})
data.head(10)

Unnamed: 0,tweet_text,emotion
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd,No emotion toward brand or product
6,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Positive emotion
8,Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaVOB,Positive emotion
9,Counting down the days to #sxsw plus strong Canadian dollar means stock up on Apple gear,Positive emotion


While observing the data we could see that ther is special characters, numbers, URL, comments all are there which should be removed.
The data also have upper case letters which should be convberted to lower case.

In [8]:
def clean_data(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text


data['tweet_text'] = data['tweet_text'].astype(str).apply(clean_data)

In [9]:
data.head(20)

Unnamed: 0,tweet_text,emotion
0,wesley i have a g iphone after hrs tweeting at riseaustin it was dead i need to upgrade plugin stations at sxsw,Negative emotion
1,jessedee know about fludapp awesome ipadiphone app that youll likely appreciate for its design also theyre giving free ts at sxsw,Positive emotion
2,swonderlin can not wait for ipad also they should sale them down at sxsw,Positive emotion
3,sxsw i hope this years festival isnt as crashy as this years iphone app sxsw,Negative emotion
4,sxtxstate great stuff on fri sxsw marissa mayer google tim oreilly tech booksconferences amp matt mullenweg wordpress,Positive emotion
5,teachntech new ipad apps for speechtherapy and communication are showcased at the sxsw conference iear edchat asd,No emotion toward brand or product
6,,No emotion toward brand or product
7,sxsw is just starting ctia is around the corner and googleio is only a hop skip and a jump from there good time to be an android fan,Positive emotion
8,beautifully smart and simple idea rt madebymany thenextweb wrote about our hollergram ipad app for sxsw,Positive emotion
9,counting down the days to sxsw plus strong canadian dollar means stock up on apple gear,Positive emotion


###ENCODING THE DATASET

In [10]:
data["emotion"].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
No emotion toward brand or product,5389
Positive emotion,2978
Negative emotion,570
I can't tell,156


In [11]:
le = LabelEncoder()
data["emotion"] = le.fit_transform(data["emotion"])
num_class = len(le.classes_)
data.head()

Unnamed: 0,tweet_text,emotion
0,wesley i have a g iphone after hrs tweeting at riseaustin it was dead i need to upgrade plugin stations at sxsw,1
1,jessedee know about fludapp awesome ipadiphone app that youll likely appreciate for its design also theyre giving free ts at sxsw,3
2,swonderlin can not wait for ipad also they should sale them down at sxsw,3
3,sxsw i hope this years festival isnt as crashy as this years iphone app sxsw,1
4,sxtxstate great stuff on fri sxsw marissa mayer google tim oreilly tech booksconferences amp matt mullenweg wordpress,3


In [12]:
data["emotion"].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
2,5389
3,2978
1,570
0,156


###SPLITTING THE DATASET

In [13]:
x = data["tweet_text"]
y = data["emotion"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

###TOKENIZE AND PAD THE DATA

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(num_words = 10000)
tk.fit_on_texts(x_train)
x_train_seq = tk.texts_to_sequences(x_train)
x_test_seq = tk.texts_to_sequences(x_test)

In [15]:
## padding is done so that all sequences will be of same length when fed into the model

from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_pad = pad_sequences(x_train_seq, maxlen = 100, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen = 100, padding='post')

###BUILD THE MODEL

In [16]:
## WE are using the LSTM model

from tensorflow.keras.layers import Embedding
model = Sequential([
    Embedding(input_dim = 10000, output_dim=128, input_length= 100),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_class, activation='softmax')
])

In [17]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

###TRAIN THE MODEL

In [18]:
history = model.fit(
    x_train_pad, y_train,
    validation_split= 0.2,
    epochs= 5,
    batch_size= 32,
    verbose= 1
)

Epoch 1/5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.5470 - loss: 1.0388 - val_accuracy: 0.6082 - val_loss: 0.8866
Epoch 2/5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5733 - loss: 0.9606 - val_accuracy: 0.6082 - val_loss: 0.8929
Epoch 3/5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5909 - loss: 0.9408 - val_accuracy: 0.6082 - val_loss: 0.8939
Epoch 4/5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5800 - loss: 0.9400 - val_accuracy: 0.6082 - val_loss: 0.8899
Epoch 5/5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5979 - loss: 0.9276 - val_accuracy: 0.6082 - val_loss: 0.8878


In [19]:
model.evaluate(x_test_pad, y_test)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5993 - loss: 0.9056


[0.9193312525749207, 0.5926333069801331]

In [20]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(x_test_pad, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5993 - loss: 0.9056
Test Accuracy: 59.26%
Test Loss: 0.9193


By using LSTM model we get an accuracy of 59% which is moderately acceptable.

In [21]:
# Make predictions on the test set

y_pred = model.predict(x_test_pad)
y_pred_classes = y_pred.argmax(axis=1)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [22]:
# Displaying a sample prediction

for i in range(5):
    print(f"Tweet: {x_test.values[i]}")
    print(f"Predicted Sentiment: {le.inverse_transform([y_pred_classes[i]])[0]}")
    print(f"Actual Sentiment: {le.inverse_transform([y_test.values[i]])[0]}")
    print("-" * 50)

Tweet: congrats rt mention yes gowalla wins best andoid app at the team android choice awards thanks all sxsw
Predicted Sentiment: No emotion toward brand or product
Actual Sentiment: Positive emotion
--------------------------------------------------
Tweet: hey tweeps a web industry party calendar u can put on ur iphone or ical amp keep track of whats goin on at sxsw link
Predicted Sentiment: No emotion toward brand or product
Actual Sentiment: Positive emotion
--------------------------------------------------
Tweet: still a line at the sxsw ipad factory link
Predicted Sentiment: No emotion toward brand or product
Actual Sentiment: No emotion toward brand or product
--------------------------------------------------
Tweet: im watching quotdesigning ipad interfaces new navigation schemasquot at sxsw uxdes
Predicted Sentiment: No emotion toward brand or product
Actual Sentiment: No emotion toward brand or product
--------------------------------------------------
Tweet: rt mention were

In [23]:
# Checking the distribution of 4 classes in the data

unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class Distribution in Training Set:", class_distribution)

Class Distribution in Training Set: {0: 125, 1: 456, 2: 4311, 3: 2382}
