In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv("tweet_emotions2.csv", encoding = "latin")

In [4]:
df

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
74787,surprise,@MichelGW have you gift! Hope you like it! It'...
74788,joy,The world didnt give it to me..so the world MO...
74789,anger,A man robbed me today .
74790,fear,"Youu call it JEALOUSY, I call it of #Losing YO..."


In [5]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def clean_tokenized_lemmatized(tweet):
    tweet = tweet.lower()                                                       #converting the text into lower
    tweet = re.sub(r'@\w+', '', tweet)                                          #removing @mentions
    tweet = re.sub(r'#\w+', '', tweet)                                          #removing hashtags
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)   #removing url          
    tweet = re.sub(r'[^\w\s]','',tweet)                                         #removing punctuations
    tweet = re.sub(r'\d+', '', tweet)                                           #removing numbers
    tweet = re.sub(r'\s+', ' ', tweet).strip()                                  #removing extra whitespaces  
    
    lemmatizer = WordNetLemmatizer()
    stops = stopwords.words('english')
    
    tokens = nltk.word_tokenize(tweet)
    tokens = [t for t in tokens if not t in stops]
    fintokens = []
    for token in tokens:
        fintokens.append(lemmatizer.lemmatize(token))
    finaltext = " "
    return finaltext.join(fintokens)

for i in df.index:
    string = df['content'][i]
    preprostr = clean_tokenized_lemmatized(string)
    df.at[i, 'content'] = preprostr

In [6]:
#df.drop(['Unnamed: 0'], axis=1)

In [8]:
from sklearn.preprocessing import LabelEncoder
le_sentiment = LabelEncoder()
df['label'] = le_sentiment.fit_transform(df['sentiment'])

In [19]:
df.label.value_counts()

13    11887
9     11045
11    10892
16     8459
15     6249
5      5410
7      5209
0      4407
10     3842
6      1776
12     1526
8      1323
2       856
3       827
4       759
1       179
14      146
Name: label, dtype: int64

In [11]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["content"], df["label"], test_size=0.2)

In [12]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)

In [13]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [14]:
maxlen = 50
train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post', truncating='post')

In [20]:
num_classes = 17
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes)

In [21]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=50, input_length=maxlen),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=num_classes, activation='softmax')
])

In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model.fit(train_padded, train_labels, epochs=10, batch_size=32, validation_data=(test_padded, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ebe974bfa0>

In [24]:
test_loss, test_accuracy = model.evaluate(test_padded, test_labels)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

test_predictions = model.predict(test_padded)
test_pred_labels = np.argmax(test_predictions, axis=1)
test_true_labels = np.argmax(test_labels, axis=1)
print(classification_report(test_true_labels, test_pred_labels))

Test loss: 1.9621154069900513
Test accuracy: 0.36486396193504333
              precision    recall  f1-score   support

           0       0.50      0.40      0.44       921
           1       0.00      0.00      0.00        42
           2       0.00      0.00      0.00       148
           3       0.00      0.00      0.00       175
           4       0.00      0.00      0.00       158
           5       0.81      0.50      0.62      1129
           6       0.00      0.00      0.00       344
           7       0.24      0.31      0.27      1035
           8       0.00      0.00      0.00       287
           9       0.40      0.62      0.49      2210
          10       0.00      0.00      0.00       788
          11       0.29      0.62      0.40      2150
          12       0.00      0.00      0.00       322
          13       0.40      0.42      0.41      2396
          14       0.00      0.00      0.00        30
          15       0.86      0.13      0.23      1205
          16    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
