In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the CSV file of labelled tweets into a DataFrame.
# The path is relative, so the file is looked up in the kernel's working directory.
df = pd.read_csv('tweet_dataset.csv')
# Bare trailing expression: renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,text,label_name
0,0,“Worry is a down payment on a problem you may ...,optimism
1,1,My roommate: it's okay that we can't spell bec...,anger
2,2,No but that's so cute. Atsu was probably shy a...,joy
3,3,Rooneys fucking untouchable isn't he? Been fuc...,anger
4,4,it's pretty depressing when u hit pan on ur fa...,sadness
...,...,...,...
5047,5047,@user @user If #trump #whitehouse aren't held ...,anger
5048,5048,@user Which #chutiya #producer #invested in #c...,anger
5049,5049,Russia story will infuriate Trump today. Media...,anger
5050,5050,Shit getting me irritated 😠,anger


In [5]:
# Drop the redundant 'Unnamed: 0' column (presumably a row index that was saved
# into the CSV -- its values mirror the DataFrame index in the preview above).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df

Unnamed: 0,text,label_name
0,“Worry is a down payment on a problem you may ...,optimism
1,My roommate: it's okay that we can't spell bec...,anger
2,No but that's so cute. Atsu was probably shy a...,joy
3,Rooneys fucking untouchable isn't he? Been fuc...,anger
4,it's pretty depressing when u hit pan on ur fa...,sadness
...,...,...
5047,@user @user If #trump #whitehouse aren't held ...,anger
5048,@user Which #chutiya #producer #invested in #c...,anger
5049,Russia story will infuriate Trump today. Media...,anger
5050,Shit getting me irritated 😠,anger


In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
import re

# 1. Data Preprocessing

In [8]:
# Cache for the stop-word set so the corpus file is read once, not once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it lazily on first use.

    Loading is deferred to call time (not definition time) so that this cell
    can still be executed before the ``stopwords`` corpus has been downloaded.
    """
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a raw tweet into a lowercase, stop-word-free string.

    Steps:
      1. Strip HTML-like tags (``<...>``).
      2. Replace every run of non-ASCII-letter characters (digits, punctuation,
         emoji, whitespace) with a single space and lowercase the result.
      3. Tokenise with ``word_tokenize`` and drop English stop words.

    The previous implementation also tried to collect emoticons, but its regex
    had lost its escapes and matched the empty string at every position, and it
    ran after all punctuation had already been removed. Its only effect was to
    append a long run of spaces, so that step (and the digit / ``\\W`` passes
    that were no-ops after step 2) has been removed. Output for string input is
    unchanged.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. ``None`` or the ``NaN`` pandas
        uses for missing cells) are treated as empty text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        # Missing cells would otherwise make re.sub raise TypeError.
        return ""

    # Remove HTML-like markup first so tag names do not leak into the tokens.
    text = re.sub(r'<[^>]*>', '', text)
    # Keep ASCII letters only; everything else collapses to one space.
    text = re.sub(r'[^A-Za-z]+', ' ', text).lower()

    stop_words = _english_stop_words()
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]

    # Return to sentence form.
    return " ".join(kept_tokens)

In [9]:
import nltk
# Fetch the Punkt tokenizer data; nltk's word_tokenize (used in preprocess_text)
# needs it to be present locally before the preprocessing cell runs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# Fetch the stop-word corpus read by stopwords.words('english') in preprocess_text.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# Clean every tweet element-wise; the raw text column is overwritten in place.
df['text'] = df['text'].map(preprocess_text)

In [12]:
df

Unnamed: 0,text,label_name
0,worry payment problem may never joyce meyer mo...,optimism
1,roommate okay spell autocorrect terrible first...,anger
2,cute atsu probably shy photos cherry helped uwu,joy
3,rooneys fucking untouchable fucking dreadful d...,anger
4,pretty depressing u hit pan ur favourite highl...,sadness
...,...,...
5047,user user trump whitehouse held accountable ac...,anger
5048,user chutiya producer invested crap deshdrohi,anger
5049,russia story infuriate trump today media other...,anger
5050,shit getting irritated,anger


In [13]:
# Encode the string labels as integer codes for the classifier.
# factorize() numbers categories in order of first appearance, so the mapping
# depends on row order (here: optimism=0, anger=1, joy=2, sadness=3, as the
# preview below shows). Anything that hard-codes class names must follow it.
df['label'] = df['label_name'].factorize()[0]
df.head()

Unnamed: 0,text,label_name,label
0,worry payment problem may never joyce meyer mo...,optimism,0
1,roommate okay spell autocorrect terrible first...,anger,1
2,cute atsu probably shy photos cherry helped uwu,joy,2
3,rooneys fucking untouchable fucking dreadful d...,anger,1
4,pretty depressing u hit pan ur favourite highl...,sadness,3


In [14]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


# 2. Model Building

In [15]:
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report

In [16]:
# Split the dataset into training (80%) and testing (20%) data.
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): the split is not stratified, so class proportions may differ
# slightly between the two parts -- consider stratify=df.label.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Tokenize the data with the tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# truncation=True caps over-long inputs; padding=True pads to the longest
# sequence within each call, so the train and test encodings are padded
# independently and may end up with different sequence lengths.
train_encodings = tokenizer(train_df.text.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_df.text.tolist(), truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [18]:
# Create input pipelines: each element is (dict of tokenizer outputs, integer label).
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_df.label))
# Shuffle buffer spans the whole training set, then group into batches of 32.
train_dataset = train_dataset.shuffle(len(train_df)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_df.label))
# The test set is deliberately not shuffled: the report cell compares predictions
# against test_df.label by position, so the row order must be preserved.
test_dataset = test_dataset.batch(32)

In [19]:
# Define the model
# Count classes on the full label column rather than on the training split, so
# the classification head covers every label even if a split happens to miss one.
num_labels = df['label'].nunique()
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)


Downloading (…)"tf_model.h5";:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

# 3. Model Training

In [20]:
# Configure training: optimizer, loss and the metric reported by fit/evaluate.
from tensorflow.keras import optimizers, losses, metrics
model.compile(
    optimizer=optimizers.Adam(learning_rate=5e-5),
    # from_logits=True: the model returns raw, unnormalised scores (the report
    # cell reads them from `.logits`), so the loss must apply softmax itself.
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    # Sparse variant because the targets are integer class ids, not one-hot vectors.
    metrics=metrics.SparseCategoricalAccuracy()
)

In [21]:
# Fine-tune for 5 passes over the shuffled training batches.
# NOTE(review): no validation data is passed, so over-fitting cannot be seen
# during training -- only from the separate evaluation cell.
model.fit(train_dataset, epochs=5)

Epoch 1/5


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f541c024b20>

In [22]:
# Evaluate the model on the held-out test batches.
# evaluate() yields the loss followed by the single metric set in compile().
loss, accuracy = model.evaluate(test_dataset)
print("Test loss:", loss)
print("Test accuracy:", accuracy)

Test loss: 0.9639765620231628
Test accuracy: 0.758654773235321


In [27]:
# Generate classification report
y_true = test_df.label.tolist()
# Highest-scoring class per row; test_dataset is unshuffled, so the order matches y_true.
y_pred = model.predict(test_dataset).logits.argmax(axis=1).tolist()
# Derive the code -> name mapping from the data instead of hard-coding it:
# factorize() numbers labels by first appearance, so a fixed list would silently
# mislabel the report whenever the CSV row order (or label set) changes.
# Codes below 0 are factorize()'s marker for missing labels and carry no name.
label_lookup = (
    df.loc[df['label'] >= 0, ['label', 'label_name']]
    .drop_duplicates()
    .sort_values('label')
)
report_labels = label_lookup['label'].tolist()
target_names = label_lookup['label_name'].tolist()
# Passing `labels` keeps names aligned even if a class is absent from the test split.
print(classification_report(y_true, y_pred, labels=report_labels, target_names=target_names))

              precision    recall  f1-score   support

    optimism       0.48      0.48      0.48        87
       anger       0.87      0.78      0.82       428
         joy       0.79      0.79      0.79       237
     sadness       0.67      0.80      0.73       259

    accuracy                           0.76      1011
   macro avg       0.70      0.71      0.71      1011
weighted avg       0.77      0.76      0.76      1011

