In [1]:
!pip install tweepy transformers pandas scikit-learn tensorflow




In [2]:
!pip install opendatasets -q


In [3]:
import opendatasets as od


In [4]:
# download dataset
od.download("https://www.kaggle.com/datasets/kazanova/sentiment140")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: kaveeshashehani
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
Downloading sentiment140.zip to ./sentiment140


100%|██████████| 80.9M/80.9M [00:00<00:00, 174MB/s]





In [5]:
import pandas as pd

# Load the dataset
data_path = "./sentiment140/training.1600000.processed.noemoticon.csv"
data = pd.read_csv(data_path, encoding='latin-1', header=None)

# Assign column names
data.columns = ['sentiment', 'id', 'date', 'query', 'username', 'text']

# Map sentiments (0 = negative, 4 = positive)
data['sentiment'] = data['sentiment'].replace({0: 0, 4: 1})

# Keep only relevant columns
data = data[['text', 'sentiment']]

# Display the first few rows
data.head()


Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [6]:
import re

# Function to clean text
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.lower()

# Apply cleaning
data['cleaned_text'] = data['text'].apply(clean_text)

# Check cleaned data
data[['cleaned_text', 'sentiment']].head()


Unnamed: 0,cleaned_text,sentiment
0,a thats a bummer you shoulda got david car...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball managed to sa...,0
3,my whole body feels itchy and like its on fire,0
4,no its not behaving at all im mad why am i he...,0


In [7]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['sentiment'], test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Training samples: 1280000
Testing samples: 320000


In [23]:
from transformers import BertTokenizer
import tensorflow as tf

# Load the BERT tokenizer to preprocess text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to encode text data into a format BERT understands
def encode_data(texts, labels, tokenizer, max_length=128):
    # Tokenize the text: converts text into token IDs, pads/truncates to max_length, and returns tensors
    encodings = tokenizer(
        list(texts),                # Input text data
        truncation=True,            # Truncate longer texts to max_length
        padding=True,               # Pad shorter texts to max_length
        max_length=max_length,      # Maximum sequence length
        return_tensors='tf'         # Output format: TensorFlow tensors
    )
    # Combine tokenized inputs and labels into a TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    return dataset

# Prepare the training dataset (tokenized and batched for efficiency)
train_dataset = encode_data(X_train, y_train, tokenizer).batch(16)

# Prepare the testing dataset (tokenized and batched for evaluation)
test_dataset = encode_data(X_test, y_test, tokenizer).batch(16)



In [24]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Load the pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  # BERT model (lowercase English)
    num_labels=2          # Number of output classes (binary classification: 0 = Negative, 1 = Positive)
)

# Compile the model with settings for optimization and evaluation
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)  # Adam optimizer with a small learning rate for fine-tuning
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Loss function for multi-class classification
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]  # Evaluation metric to monitor accuracy during training

# Combine the components to prepare the model for training
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

print("Model successfully defined and compiled!")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model successfully defined and compiled!


In [10]:
# Randomly sample a subset of the data
sampled_data = data.sample(n=10000, random_state=42)

# Reset the index of the sampled data
sampled_data = sampled_data.reset_index(drop=True)

# Prepare X and y for the sampled data
X_sample = sampled_data['cleaned_text']
y_sample = sampled_data['sentiment']


In [11]:
from sklearn.model_selection import train_test_split

# Split the sampled data
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Training samples: 8000
Testing samples: 2000


In [12]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Tokenize the sampled data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(texts, labels, tokenizer, max_length=128):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    return dataset

train_dataset = encode_data(X_train, y_train, tokenizer).batch(16)
test_dataset = encode_data(X_test, y_test, tokenizer).batch(16)

# Define and compile the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train the model
model.fit(train_dataset, validation_data=test_dataset, epochs=3)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b2b6427caf0>

In [13]:
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")


Loss: 0.6485780477523804
Accuracy: 0.7835000157356262


In [14]:
# Example custom input
custom_text = ["I love this product!", "I hate this service."]

# Tokenize the input
custom_encodings = tokenizer(custom_text, truncation=True, padding=True, max_length=128, return_tensors='tf')

# Make predictions
predictions = model.predict(custom_encodings)
predicted_labels = tf.argmax(predictions.logits, axis=1).numpy()

print(f"Predicted Labels: {predicted_labels}")
# 0 = Negative, 1 = Positive


Predicted Labels: [1 0]


In [15]:
model.save("sentiment_model")
print("Model saved in the directory 'sentiment_model'.")


Model saved in the directory 'sentiment_model'.


In [16]:
model.save_pretrained("sentiment_model_hf")
tokenizer.save_pretrained("sentiment_model_hf")
print("Model and tokenizer saved in 'sentiment_model_hf'.")


Model and tokenizer saved in 'sentiment_model_hf'.


In [17]:
!pip install ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [22]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the model and tokenizer
model = TFBertForSequenceClassification.from_pretrained("sentiment_model_hf")
tokenizer = BertTokenizer.from_pretrained("sentiment_model_hf")

def predict_sentiment(text):
    encodings = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='tf')
    predictions = model.predict(encodings)
    probabilities = tf.nn.softmax(predictions.logits, axis=-1).numpy()
    predicted_label = tf.argmax(predictions.logits, axis=1).numpy()[0]
    confidence = probabilities[0][predicted_label]
    sentiment = "Negative" if predicted_label == 0 else "Positive" if predicted_label == 1 else "Neutral"
    return f"{sentiment} (Confidence: {confidence:.2f})"


# Input widget
text_input = widgets.Textarea(
    value="",
    placeholder="Type your text here...",
    layout=widgets.Layout(width="80%", height="150px", margin="10px", resize="both")
)

# Styled label for input
input_label = widgets.HTML("<h3 style='text-align: center; color: #4CAF50;'>Sentiment Analysis Tool</h3>")

# Output widget with styling
output = widgets.Output(layout=widgets.Layout(border="1px solid #ccc", padding="10px", margin="10px", width="80%"))
with output:
    clear_output(wait=True)
    display(widgets.HTML("<p style='color: #4CAF50;'>Processing...</p>"))

# Styled button to trigger prediction
button = widgets.Button(
    description="Analyze Sentiment",
    button_style="success",  # Options: 'primary', 'success', 'info', 'warning', 'danger'
    tooltip="Click to analyze sentiment",
    icon="check-circle"  # FontAwesome icon
)

# Footer for instructions
footer = widgets.HTML(
    "<p style='text-align: center; color: #777; font-size: 12px;'>"
    "Enter text and click the button to classify sentiment as Positive, Neutral, or Negative."
    "</p>"
)

# Function to handle button click
def on_button_click(b):
    with output:
        clear_output(wait=True)
        user_input = text_input.value.strip()
        if user_input:
            sentiment = predict_sentiment(user_input)
            display(widgets.HTML(f"<h4 style='color: #4CAF50;'>Sentiment: {sentiment}</h4>"))
        else:
            display(widgets.HTML("<p style='color: red;'>Please enter some text!</p>"))

button.on_click(on_button_click)

# Arrange widgets in a vertical box layout
ui = widgets.VBox([input_label, text_input, button, output, footer])

# Display the interface
display(ui)


Some layers from the model checkpoint at sentiment_model_hf were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at sentiment_model_hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


VBox(children=(HTML(value="<h3 style='text-align: center; color: #4CAF50;'>Sentiment Analysis Tool</h3>"), Tex…