In [1]:
import pandas as pd

data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='Latin', header=None)
df = data.iloc[750000:850001]
columns = ['Label', 'User ID', 'Datetime', 'Query', 'UserName', 'Tweet Text']
df.columns = columns

In [2]:
df.head(10)

Unnamed: 0,Label,User ID,Datetime,Query,UserName,Tweet Text
750000,0,2285370474,Mon Jun 22 15:02:48 PDT 2009,NO_QUERY,idmoore,"@Opotopo small slip on Tryfan few weeks back, ..."
750001,0,2285370823,Mon Jun 22 15:02:49 PDT 2009,NO_QUERY,xbeautifulmessx,@Idristwilight You can post HAN when you want....
750002,0,2285371185,Mon Jun 22 15:02:51 PDT 2009,NO_QUERY,thefirstsight,@rose_7 Ohh poor jan please tell her that if ...
750003,0,2285371495,Mon Jun 22 15:02:52 PDT 2009,NO_QUERY,Sarah2713,Finally home from work...It was a looong day!!...
750004,0,2285371762,Mon Jun 22 15:02:54 PDT 2009,NO_QUERY,dierockerfrau,im very sad 4 chantelle and tom
750005,0,2285372377,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,alexbates,I chatted with someone on the online Apple sto...
750006,0,2285372393,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,captainsubtle,Back to office to empty aircon water tank emp...
750007,0,2285372511,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,LizLemonCologne,@ToxicMelvin Too late However it works now. A...
750008,0,2285372519,Mon Jun 22 15:02:57 PDT 2009,NO_QUERY,esben_thomsen,@exljbris it can't connect
750009,0,2285373042,Mon Jun 22 15:03:00 PDT 2009,NO_QUERY,jlcookaz,Missing my 20yr old baby-moved to WA.


## Cleaning

In [3]:
# Step 2: Remove duplicates and useless data
df = df.drop(['User ID', 'Datetime', 'Query', 'UserName'], axis=1)

df = df.drop_duplicates()
df = df.dropna()

df.head(10)

Unnamed: 0,Label,Tweet Text
750000,0,"@Opotopo small slip on Tryfan few weeks back, ..."
750001,0,@Idristwilight You can post HAN when you want....
750002,0,@rose_7 Ohh poor jan please tell her that if ...
750003,0,Finally home from work...It was a looong day!!...
750004,0,im very sad 4 chantelle and tom
750005,0,I chatted with someone on the online Apple sto...
750006,0,Back to office to empty aircon water tank emp...
750007,0,@ToxicMelvin Too late However it works now. A...
750008,0,@exljbris it can't connect
750009,0,Missing my 20yr old baby-moved to WA.


In [4]:
# Import libraries
import re
import nltk
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

# Define a function to preprocess tweets
def preprocess_tweet(tweet):
  # Remove URLs
  tweet = re.sub(r"http\S+", "", tweet)
  # Remove mentions
  tweet = re.sub(r"@\w+", "", tweet)
  # Remove hashtags
  tweet = re.sub(r"#\w+", "", tweet)
  # Remove CashTags
  tweet = re.sub(r'\$[^\s]+', '', tweet) 
  # Remove punctuation
  tweet = re.sub(r"[^\w\s]", "", tweet)
  # Convert to lowercase
  tweet = tweet.lower()
  # Remove stopwords
  tweet = tweet.split()
  tweet = [word for word in tweet if word not in stopWords]
  # Join words back
  tweet = " ".join(tweet)
  return tweet

# Apply the function to the tweet column
df["Tweet Text"] = df["Tweet Text"].apply(preprocess_tweet)

df.head(10)





Unnamed: 0,Label,Tweet Text
750000,0,small slip tryfan weeks back felt side pull di...
750001,0,post han want great still working tld though g...
750002,0,ohh poor jan please tell cans send us email
750003,0,finally home workit looong day monday
750004,0,im sad 4 chantelle tom
750005,0,chatted someone online apple store said would ...
750006,0,back office empty aircon water tank empty offi...
750007,0,late however works really happy
750008,0,cant connect
750009,0,missing 20yr old babymoved wa


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def lemmatization(tweet):
    # Tokenize tweet
    tokens = word_tokenize(tweet)
    # Lemmatize the tokens and then concatenate
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_tweet = ' '.join(tokens)
    return lemmatized_tweet

df["Tweet Text"] = df["Tweet Text"].apply(lemmatization)

df.head(10)

Unnamed: 0,Label,Tweet Text
750000,0,small slip tryfan week back felt side pull did...
750001,0,post han want great still working tld though g...
750002,0,ohh poor jan please tell can send u email
750003,0,finally home workit looong day monday
750004,0,im sad 4 chantelle tom
750005,0,chatted someone online apple store said would ...
750006,0,back office empty aircon water tank empty offi...
750007,0,late however work really happy
750008,0,cant connect
750009,0,missing 20yr old babymoved wa


As you can see, it works correctly and comparing the above table with its previous one, if you for example look at row number 8, the 's' character from 'works' is removed.

In [6]:
df.to_csv('data.csv', index=False)

# Training models

## TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Split the data into train and test sets
X_train, X_temp, y_train, y_temp = train_test_split(df["Tweet Text"], df["Label"].replace(4, 1), test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create a pipeline with TF-IDF, and logistic regression
tfidf_PL = Pipeline([
  ("tfidf", TfidfVectorizer()),
  ("logreg", LogisticRegression(max_iter=200))
])


param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'tfidf__max_features': [10000, 100000],
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100]
}


tfidf_grid_search = GridSearchCV(tfidf_PL, param_grid, cv=5, scoring='accuracy')
tfidf_grid_search.fit(X_train, y_train)

# Print best parameters for each model
print("Best parameters for TF-IDF model are:", tfidf_grid_search.best_params_)

# Predict the labels on the validation set
y_val_pred = tfidf_grid_search.predict(X_val)

# Calculate the accuracy score
val_accuracy = accuracy_score(y_val, y_val_pred)

# Print the result
print("Validation accuracy of TF-IDF model: ", val_accuracy)

Best parameters for TF-IDF model are: {'logreg__C': 1, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 2)}

Validation accuracy of TF-IDF model:  0.7776102904230731


In [11]:
# Get the best model from the grid search
best_tfidf_model = tfidf_grid_search.best_estimator_

# Predict the labels on the test set
y_test_pred = best_tfidf_model.predict(X_test)

# Calculate the accuracy score
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the result
print("Test accuracy of TF-IDF model: ", test_accuracy)

Test accuracy of TF-IDF model:  0.7819533762057878


In [12]:
# Import libraries
import joblib

# Save the best model to a file
joblib.dump(best_tfidf_model, "best_tfidf_model.pkl", compress=1)

['best_tfidf_model.pkl']

## Bag of Words

In [14]:
# Create a pipeline with BoW, and logistic regression
BoW_PL = Pipeline([
  ("bow", CountVectorizer()),
  ("logreg", LogisticRegression(max_iter=200))
])


param_grid = {
    'bow__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'bow__max_features': [10000, 100000],
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100]
}


BoW_grid_search = GridSearchCV(BoW_PL, param_grid, cv=5, scoring='accuracy')
BoW_grid_search.fit(X_train, y_train)

# Print best parameters for each model
print("Best parameters for BoW model are:", BoW_grid_search.best_params_)

# Predict the labels on the validation set
y_val_pred = BoW_grid_search.predict(X_val)

# Calculate the accuracy score
val_accuracy = accuracy_score(y_val, y_val_pred)

# Print the result
print("Validation accuracy of BoW model: ", val_accuracy)

Best parameters for BoW model are: {'bow__max_features': 100000, 'bow__ngram_range': (1, 2), 'logreg__C': 1}

Validation accuracy of BoW model:  0.7753994573409707


In [15]:
# Get the best model from the grid search
best_BoW_model = BoW_grid_search.best_estimator_

# Predict the labels on the test set
y_test_pred = best_BoW_model.predict(X_test)

# Calculate the accuracy score
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the result
print("Test accuracy of BoW model: ", test_accuracy)

Test accuracy of BoW model:  0.777532154340836


In [16]:
# Import libraries
import joblib

# Save the best model to a file
joblib.dump(best_BoW_model, "best_BoW_model.pkl", compress=1)

['best_BoW_model.pkl']

## Pre-Trained (HuggingFace)

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('/kaggle/input/tweet-sentiment/data.csv', encoding='Latin')

df.dropna()
# Replace any empty strings with NaN and then remove them
df = df.replace("", np.nan).dropna()

df.head()

Unnamed: 0,Label,Tweet Text
0,0,small slip tryfan week back felt side pull did...
1,0,post han want great still working tld though g...
2,0,ohh poor jan please tell can send u email
3,0,finally home workit looong day monday
4,0,im sad 4 chantelle tom


In [2]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, TFAutoModelForSequenceClassification

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df["Tweet Text"], df["Label"].replace(4, 1), test_size=0.2, random_state=42)

# Load the pre-trained BERT model and tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Tokenize the texts and convert them to tensors
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

# Fine-tune the model on the train set
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)

# Evaluate the model on the test set
model.evaluate(test_dataset.batch(16))


2024-02-01 22:30:27.099793: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 22:30:27.099894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-01 22:30:27.223984: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Epoch 1/3


I0000 00:00:1706826773.207638      83 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


[0.52691251039505, 0.7877182364463806]

As you can see, the sparse categorical accuracy at the end of training had reached 86% with just 3 epochs, which is so great. Also, the accuracy on test set is about 79% which is higher than the previous models.

If we had more time and resources to run it on more epochs, we could get so better results :)  

In [4]:
import tensorflow as tf

tf.saved_model.save(model, 'HuggingFace-PreTrained')

In [6]:
model.save_weights('HuggingFace-Weights')

## Huggin Face (DistilBert model and twitter-roberta-base-sentiment-latest model)

### DistilBert

In [3]:
# Import libraries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df["Tweet Text"], df["Label"].replace(4, 1), test_size=0.1, random_state=42)

# Load the pre-trained DistilBERT model and tokenizer
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the texts and convert them to tensors
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

# Fine-tune the model on the train set
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
model.fit(train_dataset.shuffle(1000).batch(16), epochs=5, batch_size=16)

# Evaluate the model on the test set
model.evaluate(test_dataset.batch(16))


2024-02-02 12:35:28.063181: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 12:35:28.063279: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 12:35:28.186409: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Epoch 1/5


I0000 00:00:1706877428.561178     107 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.8243535161018372, 0.7844383716583252]

### Tweeter Roberta

In [23]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
y_pred = []
for i in list(X_test):
    encoded_input = tokenizer(i, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    if scores[2] >= scores[0]:
        y_pred.append(1)
    else:
        y_pred.append(0)
    
accuracy = accuracy_score(y_test, y_pred)
print("The accuracy of twitter-roberta-base-sentiment-latest model from Huggung Face on test set is:", accuracy)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The accuracy of twitter-roberta-base-sentiment-latest model from Huggung Face on test set is: 0.7336764557473004


According to the above results, my inference is:

- The BoW and TF-IDF models are based on the frequency of words in the tweets, and do not capture the context or the meaning of the words. The Pre-trained BERT and Roberta models are based on pre-trained language models that can capture the context and the meaning of the words, and are fine-tuned on the tweet dataset.
- The BoW and TF-IDF models have similar performance, with TF-IDF slightly outperforming BoW. This may be because TF-IDF assigns more weight to the words that are more informative and less common, while BoW assigns equal weight to all words. The Pre-trained BERT and Roberta models have higher performance than the BoW and TF-IDF models, with Pre-trained BERT slightly outperforming Roberta. This may be because Pre-trained BERT and Roberta can learn from a large corpus of text, and can handle the complexity and the variability of the tweets better than the BoW and TF-IDF models.
- The BoW and TF-IDF models are simpler and faster to train and evaluate than the Pre-trained BERT and Roberta models. The Pre-trained BERT and Roberta models are more complex and require more computational resources and time to train and evaluate. The BoW and TF-IDF models may be more suitable for tasks where the data is small and simple, while the Pre-trained BERT and Roberta models may be more suitable for tasks where the data is large and complex.
- The BoW and TF-IDF models may make mistakes when the tweets contain sarcasm, irony, negation, or slang, as they may not be able to detect the tone or the intention of the tweets. The Pre-trained BERT and Roberta models may make mistakes when the tweets contain domain-specific terms, abbreviations, or hashtags, as they may not be familiar with the vocabulary or the style of the tweets.