In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Import libraries for feature extraction, data splitting, and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Import necessary libraries for BERT model and tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification
from datasets import Dataset
import tensorflow as tf

# Import models and preprocessing tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Load the spaCy library for advanced text preprocessing
import spacy




In [2]:
# Load the Twitter dataset
data = pd.read_csv("..\Data\TwitterData/twitter_data.csv")

In [3]:
# Display dataset information for understanding its structure
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75682 entries, 0 to 75681
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweetID        75682 non-null  int64 
 1   entity         75682 non-null  object
 2   sentiment      75682 non-null  object
 3   tweet_content  74996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# Display the first few rows of the dataset for preview
data.head()

Unnamed: 0,tweetID,entity,sentiment,tweet_content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
# Print the shape of the dataset (rows and columns)
data.shape

(75682, 4)

In [7]:
# Drop duplicate rows to ensure uniqueness
data.drop_duplicates(inplace=True)

# Drop rows with missing values to handle null data
data.dropna(inplace=True)

In [8]:
# Load the small English spaCy model for text preprocessing
nlp = spacy.load("en_core_web_sm") 

# Define a preprocessing function to clean and lemmatize text
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        # Exclude stopwords and punctuation
        if token.is_stop or token.is_punct:
            continue
        # Append lemmatized version of the token
        filtered_tokens.append(token.lemma_)
    
    # Join tokens into a single string
    return " ".join(filtered_tokens)

In [9]:
# Apply the preprocessing function to the 'tweet_content' column
data['Preprocessed Text'] = data['tweet_content'].apply(preprocess) 

# Initialize the LabelEncoder for encoding target labels (sentiments)
le = LabelEncoder()

# Encode the 'sentiment' column into numeric values
data['sentiment'] = le.fit_transform(data['sentiment'])

In [10]:
data['Preprocessed Text'] = data['Preprocessed Text'].astype(str)  # Convert all to string
data = data[data['Preprocessed Text'].str.strip() != '']  # Drop empty strings

In [11]:
# Sample the dataset to balance class representation and reduce size
data = data.groupby('sentiment', group_keys=False).apply(lambda x: x.sample(frac=5000/75682, random_state=42))

# Separate features (text) and target (sentiment) columns
X = data["Preprocessed Text"]
y = data["sentiment"]

In [12]:
# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary set into validation (50%) and test (50%) to get 10% each
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [13]:
# Print the lengths of the splits to ensure proper division
print(f"Length of X_train: {len(X_train)}, Length of y_train: {len(y_train)}")
print(f"Length of X_val: {len(X_val)}, Length of y_val: {len(y_val)}")
print(f"Length of X_test: {len(X_test)}, Length of y_test: {len(y_test)}")

Length of X_train: 3724, Length of y_train: 3724
Length of X_val: 466, Length of y_val: 466
Length of X_test: 466, Length of y_test: 466


In [14]:
# Load the BERT tokenizer for encoding text into numerical format
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [15]:
# Set the maximum token length for truncation and padding
max_len = 128

# Tokenize and encode the training, validation, and test data
X_train_encoded = tokenizer.batch_encode_plus(X.tolist(),
											padding=True, 
											truncation=True,
											max_length = max_len,
											return_tensors='tf')

X_val_encoded = tokenizer.batch_encode_plus(X_val.tolist(), 
											padding=True, 
											truncation=True,
											max_length = max_len,
											return_tensors='tf')

X_test_encoded = tokenizer.batch_encode_plus(X_test.tolist(), 
											padding=True, 
											truncation=True,
											max_length = max_len,
											return_tensors='tf')


In [16]:
# Load the pre-trained BERT model for sequence classification with 4 labels (sentiments)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Compile the model with Adam optimizer, sparse categorical cross-entropy loss, and accuracy metric
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


In [20]:
# Train the model on the training data and validate on the validation set
history = model.fit(
	[X_train_encoded['input_ids'], X_train_encoded['token_type_ids'], X_train_encoded['attention_mask']],
	y,
	validation_data=(
	[X_val_encoded['input_ids'], X_val_encoded['token_type_ids'], X_val_encoded['attention_mask']],y_val),
	batch_size=32,
	epochs=1
)




In [21]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(
    [X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']],
    y_test
)

# Extract training and validation accuracies from the history object
train_accuracy = history.history['accuracy'][-1]
val_accuracy = history.history['val_accuracy'][-1]

# Print evaluation results
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')
print(f'Training accuracy: {train_accuracy}, Validation accuracy: {val_accuracy}')



Test loss: 0.9238451719284058, Test accuracy: 0.6566523313522339
Training accuracy: 0.531572163105011, Validation accuracy: 0.6459227204322815


In [22]:
# Generate predictions on the test data
logits = model.predict([X_test_encoded['input_ids'], X_test_encoded['token_type_ids'], X_test_encoded['attention_mask']]).logits



In [23]:
# Apply softmax to logits to get probabilities for each class
y_pred_probs = tf.nn.softmax(logits, axis=-1).numpy()

# Convert probabilities to predicted labels
if y_pred_probs.shape[1] == 2:  # Binary classification
    y_pred = (y_pred_probs[:, 1] > 0.5).astype(int)
else:  # Multi-class classification
    y_pred = np.argmax(y_pred_probs, axis=1)

# Convert test labels to numpy array for compatibility
y_test = np.array(y_test)

# Print classification metrics
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Precision: 0.6529321843907155
Recall: 0.6566523605150214
F1 Score: 0.62117296794008

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.18      0.28       100
           1       0.71      0.87      0.78       143
           2       0.60      0.63      0.62       102
           3       0.64      0.83      0.72       121

    accuracy                           0.66       466
   macro avg       0.65      0.63      0.60       466
weighted avg       0.65      0.66      0.62       466



In [24]:
# Function for user-input prediction
def predict_sentiment(statement):
    processed_statement = preprocess(statement)  # Preprocess user input
    encoded_statement = tokenizer.encode_plus(
        processed_statement, max_length=max_len, padding='max_length', truncation=True, return_tensors='tf')
    logits = model.predict([encoded_statement['input_ids'], encoded_statement['token_type_ids'], encoded_statement['attention_mask']]).logits
    probabilities = tf.nn.softmax(logits, axis=-1).numpy()
    prediction = np.argmax(probabilities, axis=1)[0]  # Get the predicted label
    sentiment_label = le.inverse_transform([prediction])[0]  # Decode label to original sentiment
    print(f"Input Statement: {statement}")
    print(f"Predicted Sentiment: {sentiment_label} (Confidence: {probabilities[0][prediction]:.2f})")

# Example usage for user input
user_input = input("Enter a statement to predict its sentiment: ")
predict_sentiment(user_input)

Input Statement: hello
Predicted Sentiment: Positive (Confidence: 0.52)
