---
**In this project, we will build a sentiment analysis model using the Yelp review dataset and DistilBERT.**

---

# **Imports and IDE Configs**

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U

Download the dataset (requires a Kaggle API token, `kaggle.json`, in the working directory)

In [None]:
!mkdir ~/.kaggle/
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# downnlaod yelp dataset
!kaggle datasets download -d yelp-dataset/yelp-dataset
!unzip yelp-dataset.zip
# delete everything except review file
!find . -type f -not -name 'yelp_academic_dataset_review.json' -delete

import packages

In [None]:
import json
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset #hugginface datasets
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertModel
from wordcloud import WordCloud

# **Load dataset and EDA**

In [None]:
chunk_size = 20000  # rows per chunk; adjust based on available memory
dataset_path = "yelp_academic_dataset_review.json"
# The review file is too large to load at once, so read it lazily in chunks and
# keep only the first one. The `with` block closes the underlying file handle
# once that chunk has been read.
with pd.read_json(dataset_path, lines=True, chunksize=chunk_size) as json_reader:
    df = next(json_reader)  # loading first chunk

In [None]:
# (rows, columns) of the loaded chunk
df.shape

In [None]:
# Preview the first rows of the loaded chunk
df.head()

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics for the chunk (pandas' default column selection)
df.describe()

In [None]:
# Count reviews per star rating, order by rating, and draw the counts as bars.
star_counts = df['stars'].value_counts().sort_index()
star_counts.plot.bar(title='Star Ratings Distribution')

In [None]:
# Concatenate all reviews into one corpus. Join with a space so the last word
# of one review does not fuse with the first word of the next.
temp_text = ' '.join(df.text)
# Generate word cloud
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(temp_text)

# Plot the word cloud without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

# **Data Processing**

Data Cleaning


In [None]:
# Text cleaning function (customize as needed)
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: strip HTML tags, replace URLs with ``[URL]``, replace
    e-mail addresses with ``[EMAIL]``, and collapse every run of whitespace
    (including newlines and carriage returns) into a single space.

    Args:
        text: The raw review text.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Remove HTML tags. Requiring a letter after '<' keeps non-markup text such
    # as "<3" or "5 > 4" intact.
    text = re.sub(r'</?[A-Za-z][^>]*>', '', text)
    # Replace URLs with a placeholder. The dot after "www" is escaped so that
    # words like "awwww" are not mistaken for links.
    text = re.sub(r'http\S+|www\.\S+', '[URL]', text)
    # Replace e-mail addresses with a placeholder
    text = re.sub(r'\S+@\S+', '[EMAIL]', text)
    # Collapse newlines, carriage returns and repeated spaces into one space
    text = re.sub(r'\s+', ' ', text).strip()
    # Optional: lowercase (depends on whether the DistilBERT model is cased)
    # text = text.lower()

    return text

In [None]:
# Filtering: drop identifier columns, exact duplicate rows and rows with
# missing values, then add the cleaned review text as 'clean_text'.
# Building a new frame (instead of inplace=True) with errors='ignore' lets this
# cell be re-run without a KeyError once the id columns are already gone.
df = (
    df.drop(columns=['review_id', 'user_id', 'business_id'], errors='ignore')
      .drop_duplicates()
      .dropna()
      .assign(clean_text=lambda frame: frame['text'].apply(clean_text))
)

Data Feature Engineering

In [None]:
# Ordinal encoding: class ids are expected to start at 0, while star ratings
# run from 1 to 5, so shift every rating down by one.
# NOTE: not idempotent; run this cell once per freshly loaded `df`.
df['stars'] = df['stars'].sub(1)  # feature engineering

Preprocessing

In [None]:
# Hold out 20% of the rows for validation. A fixed random_state makes the split
# reproducible across kernel restarts.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# The model needs int64 labels, and the stars column was originally float.
# `assign` builds new frames instead of writing into the frames returned by
# train_test_split (avoids SettingWithCopyWarning); 'int64' is spelled out
# because plain `int` maps to a platform-dependent width.
train_df = train_df.assign(stars=train_df['stars'].astype('int64'))
val_df = val_df.assign(stars=val_df['stars'].astype('int64'))

In [None]:
# Build Hugging Face datasets with the column names the tokenizer function and
# model expect ('text', 'label'). The cleaned review text ('clean_text') is
# used as the model input so that the cleaning step actually takes effect.
def _to_hf_dataset(frame):
    """Convert a split DataFrame into a Dataset with 'text' and 'label' columns."""
    columns = frame[['clean_text', 'stars']].rename(
        columns={'clean_text': 'text', 'stars': 'label'}
    )
    # preserve_index=False keeps the shuffled pandas index out of the dataset.
    return Dataset.from_pandas(columns, preserve_index=False)


train_data = _to_hf_dataset(train_df)
val_data = _to_hf_dataset(val_df)

# **Model Loading and Training**

In [None]:
# Load the pretrained tokenizer for the uncased DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, truncating and padding to max length."""
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

In [None]:
# Tokenize both splits in batches (train first, then validation)
train_data_tokenized, val_data_tokenized = (
    split.map(tokenize_function, batched=True) for split in (train_data, val_data)
)

In [None]:
# The Hugging Face model used here is a PyTorch model, so expose the model
# inputs and the label as torch tensors on both splits.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_data_tokenized, val_data_tokenized):
    tokenized_split.set_format("torch", columns=torch_columns)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # Mixed precision requires a CUDA device; enable it only when one is
    # available so the notebook still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,  # Example value
    logging_steps=100,  # Reduced logging frequency
)

# Labels are 0-based class ids, so the classifier head needs max(label) + 1
# outputs. Counting distinct labels instead would undersize the head whenever a
# star rating happens to be absent from the training split.
num_classes = int(max(train_data_tokenized.unique('label'))) + 1
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)

In [None]:
# The Hugging Face Trainer module drives training (and takes care of parallel
# training) for the model, arguments and datasets given here.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_data_tokenized,
    "eval_dataset": val_data_tokenized,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Sanity check: the classifier head must cover every label id in the data.
# Dataset.unique returns the distinct values of the column as a plain list.
label_values = train_data_tokenized.unique('label')
num_labels = len(label_values)
model_config_num_labels = model.config.num_labels

print(f"Dataset num_labels: {num_labels}")
print(f"Model config num_labels: {model_config_num_labels}")

# Fail here with a clear message rather than later during training.
assert min(label_values) >= 0 and max(label_values) < model_config_num_labels, (
    f"Label ids {sorted(label_values)} do not fit a classifier head with "
    f"{model_config_num_labels} outputs"
)

In [None]:
# Fine-tune the model using the settings in `training_args`
trainer.train()

In [None]:
# Save the fine-tuned model to the local ./models/ directory (nothing is
# uploaded to S3 here), together with the tokenizer so the directory can be
# reloaded on its own with from_pretrained().
model.save_pretrained("./models/")
tokenizer.save_pretrained("./models/")

# **Inference**

In [None]:
# Disabled sketch: run the model directly on a single string.
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

In [None]:
# Score a few hand-written reviews with the fine-tuned model.
test_text = ["food is awesome!","It was best","I hate it"]
test_encodings = tokenizer(test_text, padding=True, truncation=True, return_tensors='pt')
test_dataset = Dataset.from_dict(test_encodings)
test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

predictions = trainer.predict(test_dataset)

# predictions.predictions holds one row of logits per input text. Softmax must
# normalize across the class axis (last dim) so that each row sums to 1;
# dim=0 would normalize across the three examples instead.
logits = torch.tensor(predictions.predictions)
probabilities = F.softmax(logits, dim=-1)
print(probabilities)

In [None]:
# Raw logits for the first two test sentences. Both values go inside print();
# with the parenthesis closed early, only the first row was printed and the
# cell output became a (None, array) tuple.
print(predictions.predictions[0], predictions.predictions[1])

In [None]:
# Raw logits for the third test sentence
predictions.predictions[2]

In [None]:
# predictions.predictions is a NumPy array, which torch.argmax does not accept;
# use the array's own argmax to get the highest-scoring class id (0-based,
# i.e. star rating minus one) for the first test sentence.
predicted_label_idx = int(predictions.predictions[0].argmax())