In [16]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
import tensorflow as tf

In [None]:
#Load restaurant reviews dataset into pandas DataFrame
data = pd.read_csv('/Restaurant_Reviews.csv', header=None, names=['Review', 'Liked'])
data = data.dropna(subset=['Review', 'Liked'])  #Remove rows where either the review text or label is missing

#Map common string labels to integers
label_mapping = {
    'positive': 1, 'Positive': 1, '1': 1, 1: 1,
    'negative': 0, 'Negative': 0, '0': 0, 0: 0
}
data['Liked'] = data['Liked'].map(label_mapping)

#Remove rows with unmapped (invalid) labels
data = data.dropna(subset=['Liked'])

data['Liked'] = data['Liked'].astype('int32')

#Ensure binary labels
data = data[data['Liked'].isin([0, 1])]

print("Class distribution after cleaning:")
print(data['Liked'].value_counts())

In [None]:
#Check structure of dataset
data.info()

In [19]:
#Separate features (X) and labels (y) for model training
reviews = data['Review'].values
labels = data['Liked'].values  #Binary (0=negative, 1=positive)

In [20]:
#Split dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews, labels, test_size=0.2
)

In [21]:
#Initialize the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [22]:
#Function to tokenize and encode text data for DistilBERT
def encode_texts(texts, max_length=100):
    encodings = tokenizer(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    return encodings

#Apply tokenization to training and validation sets
train_encodings = encode_texts(train_texts, max_length=100)
val_encodings = encode_texts(val_texts, max_length=100)

In [23]:
#Convert tokenized data and labels into TensorFlow Dataset objects
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(1000).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(16)

In [24]:
#Import model architecture and optimizer for fine-tuning DistilBERT
from transformers import TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam

In [None]:
#Load pre-trained DistilBERT model for binary classification
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

In [26]:
#Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.optimizer.learning_rate = 1e-5  #Set the learning rate

In [None]:
#Fine-tune the pretrained model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=4
)

In [28]:
#Develop function for sentiment analysis
def predict_sentiment(model, tokenizer, texts, max_length=100):
    if isinstance(texts, str):
        texts = [texts]
    
    #Tokenize input texts into tensors
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    #Run the model to get output scores (logits)
    logits = model.predict(dict(encodings)).logits

    #Determine probability calculation method based on output shape
    if logits.shape[1] == 1:
        #Sigmoid for single neuron output
        positive_probs = tf.nn.sigmoid(logits).numpy().flatten()
        negative_probs = 1 - positive_probs
        predictions = (positive_probs >= 0.5).astype(int)
    else:
        #Softmax for two neuron output
        probs = tf.nn.softmax(logits, axis=-1).numpy()
        positive_probs = probs[:, 1]
        negative_probs = probs[:, 0]
        predictions = np.argmax(probs, axis=-1)

    label_map = {0: 'Negative', 1: 'Positive'}
    
    results = [
        {
            'text': text,
            'label': label_map[pred],
            'probability': float(positive_probs[i] if pred == 1 else negative_probs[i]),
            'negative_prob': float(negative_probs[i]),
            'positive_prob': float(positive_probs[i])
        }
        for i, (text, pred) in enumerate(zip(texts, predictions))
    ]

    return results

In [29]:
#Test strings
test_strings = [
    "The food was good"
]

#Predict results
results = predict_sentiment(model, tokenizer, test_strings)

#Print results
for result in results:
    print(f"Text: {result['text']}")
    print(f"Predicted Label: {result['label']}")
    print(f"Probability: {result['probability']:.4f}")
    print(f"Negative Prob: {result['negative_prob']:.4f}, Positive Prob: {result['positive_prob']:.4f}")
    print()

Text: The food was good
Predicted Label: Positive
Probability: 0.5280
Negative Prob: 0.4720, Positive Prob: 0.5280

