# Project: Sentiment Analysis on Predoct Reviews

[Dataset name (Women’s Clothing E-Commerce Reviews)](https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews)

## Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertForSequenceClassification
)
import matplotlib.pyplot as plt
import kagglehub

  if not hasattr(np, "object"):


# Config/Hyper Params

In [None]:
# Model
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 3

# Training
BATCH_SIZE = 16
EPOCHS = 6
LEARNING_RATE = 2e-5

# Tokenization
MAX_LEN = 128

# Random seed
SEED = 42

## Load the Dataset

In [3]:
path = kagglehub.dataset_download("nicapotato/womens-ecommerce-clothing-reviews")

df = pd.read_csv(path + '/Womens Clothing E-Commerce Reviews.csv')

df.head()


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


## Preprocessing

In [5]:
df = df.drop(columns=["Unnamed: 0", "Clothing ID", "Age", "Positive Feedback Count", \
"Division Name", "Department Name", "Class Name"]) #removing all unecessary data
#df = df.drop(columns=["Recommended IND"]) #If conclusively not needed

In [6]:
#df = df.dropna(subset=["Review Text"])
df = df.dropna(subset=["Title", "Review Text"], how='all') #To filter out completely empty reviews
df = df[(df["Review Text"].str.strip() != "") & (df["Title"].str.strip() != "")] #In case both are whitespace

In [7]:
#df["text"] = df["Title"].fillna("") + ". " + df["Review Text"]
df["text"] = df["Title"].fillna("").str.strip() + ". " + df["Review Text"].fillna("").str.strip() #In case we decide to only throw out empty reviews

In [8]:
def rating_to_sentiment(r):
    if r <= 2:
        return 0  # negative
    elif r == 3:
        return 1  # neutral
    else:
        return 2  # positive

df["sentiment"] = df["Rating"].apply(rating_to_sentiment)
df["sentiment"].value_counts()


sentiment
2    17449
1     2823
0     2370
Name: count, dtype: int64

In [9]:
#print(df[df["text"].str.split().str.len() < 5]) #Checking what we're missing
df = df[df["text"].str.split().str.len() >= 5] #Remove short reviews

In [10]:
df[["text", "sentiment"]].head()

Unnamed: 0,text,sentiment
0,. Absolutely wonderful - silky and sexy and co...,2
1,. Love this dress! it's sooo pretty. i happe...,2
2,Some major design flaws. I had such high hopes...,1
3,"My favorite buy!. I love, love, love this jump...",2
4,Flattering shirt. This shirt is very flatterin...,2


In [11]:
df["sentiment"].value_counts(normalize=True)

sentiment
2    0.770580
1    0.124696
0    0.104724
Name: proportion, dtype: float64

In [12]:
df[["text", "sentiment"]].head()
df["sentiment"].value_counts(normalize=True)


sentiment
2    0.770580
1    0.124696
0    0.104724
Name: proportion, dtype: float64

## Train / Validation / Test Split

In [13]:
X = df["text"].values
y = df["sentiment"].values

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y
)

In [15]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp
)


In [16]:
def show_distribution(name, labels):
    unique, counts = np.unique(labels, return_counts=True)
    print(name, dict(zip(unique, counts)))

show_distribution("Train", y_train)
show_distribution("Validation", y_val)
show_distribution("Test", y_test)


Train {np.int64(0): np.int64(1896), np.int64(1): np.int64(2257), np.int64(2): np.int64(13951)}
Validation {np.int64(0): np.int64(237), np.int64(1): np.int64(282), np.int64(2): np.int64(1744)}
Test {np.int64(0): np.int64(237), np.int64(1): np.int64(283), np.int64(2): np.int64(1744)}


## tokenizer + model skeleton

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)


### Tokenizing

In [18]:
def tokenize(text):
    return tokenizer(
        list(text),
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="tf"
    )

train_encodings = tokenize(X_train)
val_encodings = tokenize(X_val)
test_encodings = tokenize(X_test)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


### TensorFlow Datasets

In [19]:
train_ds = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(BATCH_SIZE)

val_ds = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    y_val
)).batch(BATCH_SIZE)

test_ds = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(BATCH_SIZE)


### Load Model

In [20]:
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    use_safetensors=False
)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-

### Compile the Model (Fine-Tuning Setup)

In [21]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)




### Fine-Tune the Model

In [None]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS
)


Epoch 1/6


### Evaluate on the Test Set - One time

In [None]:
test_loss, test_acc = model.evaluate(test_ds)
print("Test accuracy:", test_acc)

Test accuracy: 0.8568904399871826


### Save the Model

In [None]:
model.save_pretrained("distilbert_sentiment_model")
tokenizer.save_pretrained("distilbert_sentiment_model")

('distilbert_sentiment_model/tokenizer_config.json',
 'distilbert_sentiment_model/special_tokens_map.json',
 'distilbert_sentiment_model/vocab.txt',
 'distilbert_sentiment_model/added_tokens.json',
 'distilbert_sentiment_model/tokenizer.json')

### Prediction



In [None]:
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    )

    logits = model(**inputs).logits
    probs = tf.nn.softmax(logits, axis=1).numpy()[0]
    pred = np.argmax(probs)

    return {
        "label": label_map[pred],
        "probabilities": {
            "negative": float(probs[0]),
            "neutral": float(probs[1]),
            "positive": float(probs[2])
        }
    }


### Testing It

In [None]:
predict_sentiment("This product was okay, but nothing special.")


{'label': 'neutral',
 'probabilities': {'negative': 0.27384400367736816,
  'neutral': 0.6741713881492615,
  'positive': 0.05198461562395096}}

In [None]:
predict_sentiment("Absolutely terrible quality. Waste of money.")

{'label': 'negative',
 'probabilities': {'negative': 0.928424596786499,
  'neutral': 0.06643159687519073,
  'positive': 0.005143859889358282}}

In [None]:
predict_sentiment("I love this dress, it fits perfectly.")

{'label': 'positive',
 'probabilities': {'negative': 0.00023257483553607017,
  'neutral': 0.00027741826488636434,
  'positive': 0.9994900226593018}}