In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.metrics import classification_report

In [5]:
df = pd.read_csv("tweet_emotions2.csv", encoding = "latin-1")

In [6]:
df

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
74787,surprise,@MichelGW have you gift! Hope you like it! It'...
74788,joy,The world didnt give it to me..so the world MO...
74789,anger,A man robbed me today .
74790,fear,"Youu call it JEALOUSY, I call it of #Losing YO..."


In [8]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def clean_tokenized_lemmatized(tweet):
    tweet = tweet.lower()                                                       #converting the text into lower
    tweet = re.sub(r'@\w+', '', tweet)                                          #removing @mentions
    tweet = re.sub(r'#\w+', '', tweet)                                          #removing hashtags
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)   #removing url          
    tweet = re.sub(r'[^\w\s]','',tweet)                                         #removing punctuations
    tweet = re.sub(r'\d+', '', tweet)                                           #removing numbers
    tweet = re.sub(r'\s+', ' ', tweet).strip()                                  #removing extra whitespaces  
    
    lemmatizer = WordNetLemmatizer()
    stops = stopwords.words('english')
    
    tokens = nltk.word_tokenize(tweet)
    tokens = [t for t in tokens if not t in stops]
    fintokens = []
    for token in tokens:
        fintokens.append(lemmatizer.lemmatize(token))
    finaltext = " "
    return finaltext.join(fintokens)

for i in df.index:
    string = df['content'][i]
    preprostr = clean_tokenized_lemmatized(string)
    df.at[i, 'content'] = preprostr

In [3]:
#df.drop(['Unnamed: 0'], axis=1)

In [10]:
from sklearn.preprocessing import LabelEncoder
le_sentiment = LabelEncoder()
df['Label'] = le_sentiment.fit_transform(df['sentiment'])

In [11]:
df.Label.value_counts()

13    11887
9     11045
11    10892
16     8459
15     6249
5      5410
7      5209
0      4407
10     3842
6      1776
12     1526
8      1323
2       856
3       827
4       759
1       179
14      146
Name: Label, dtype: int64

In [13]:
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['content'])

In [14]:
sequences = tokenizer.texts_to_sequences(df['content'])
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [15]:
labels = np.array(df['Label'])

In [16]:
train_size = int(0.8 * len(padded_sequences))
train_sequences = padded_sequences[:train_size]
train_labels = labels[:train_size]
val_sequences = padded_sequences[train_size:train_size+1000]
val_labels = labels[train_size:train_size+1000]
test_sequences = padded_sequences[train_size+1000:]
test_labels = labels[train_size+1000:]

In [17]:
model = Sequential([
    Embedding(5000, 32, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(17, activation='softmax')
])

In [18]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_sequences, train_labels, validation_data=(val_sequences, val_labels), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
test_sequences = padded_sequences[train_size+1000:]
test_labels = labels[train_size+1000:]
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

y_pred = model.predict(test_sequences)
y_pred = np.argmax(y_pred, axis=1)

report = classification_report(test_labels, y_pred)
print(report)

Test Loss: 1.8659764528274536
Test Accuracy: 0.4961673617362976
              precision    recall  f1-score   support

           0       0.63      0.48      0.54      1685
           2       0.46      0.03      0.06       326
           5       0.75      0.56      0.65      2177
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.67      0.51      0.58      4412
          10       0.00      0.00      0.00         0
          11       0.35      0.76      0.48       907
          12       0.00      0.00      0.00         0
          13       0.57      0.47      0.52      2774
          14       0.91      0.68      0.78        57
          15       0.54      0.36      0.43      1621
          16       0.00      0.00      0.00         0

    accuracy                           0.50     13959
   macro avg       0.37      0.30      0.31     13959
weighted avg       0.62      0.50      0.54     13959



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
