In [1]:
# !pip install -r requirements.txt

In [2]:
import pandas as pd

#read the book data from json file
books_df = pd.read_json('Books_small_10000.json', lines=True)
print('Number of rows  books: ', len(books_df))
#print unique values of the column 'overall'
print('Unique values of the column overall: ', books_df['overall'].unique())
books_df = books_df[['reviewText', 'overall']]

books_df.head()

Number of rows  books:  10000
Unique values of the column overall:  [5 3 4 2 1]


Unnamed: 0,reviewText,overall
0,"I bought both boxed sets, books 1-5. Really a...",5
1,I enjoyed this short book. But it was way way ...,3
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4
3,I really enjoyed this adventure and look forwa...,4
4,It was a decent read.. typical story line. Not...,3


In [3]:
#read the Restaurant data from tsv file
restaurants_df = pd.read_csv('reviews.csv')
print('Number of rows in restaurants dataset: ', len(restaurants_df))
print('Unique values in Liked column: ', restaurants_df['Review'].unique())
#drop the rows with missing values NaN values at Review  column
restaurants_df = restaurants_df.dropna(subset=['Review'])

#drop the Recommends column

restaurants_df = restaurants_df.drop(['Recommends'], axis=1)

restaurants_df.head()

Number of rows in restaurants dataset:  16597
Unique values in Liked column:  [nan  5.  4.  3.  1.  2.]


Unnamed: 0,Review Text,Review
2,The man who is foodie like me for him arabian ...,5.0
4,This place is too much comfortable & food is d...,4.0
6,I check it out like a second home of mine...fe...,3.0
8,"you guys are awesome & I just love your ""offer...",5.0
10,Went there after referred by a friend. Tried t...,5.0


In [4]:
import sqlite3
 
#read the movies data from db file
connect = sqlite3.connect('IMDB_Movies_2021.db')

query = 'SELECT REVIEW,RATING FROM REVIEWS'
movies_df = pd.read_sql_query(query,connect)

#print the number of rows 
print('Number of rows in books dataset: ', len(books_df))
#print the unique values in the sentiment column
print('Unique values in sentiment column: ', movies_df['RATING'].unique())

#remove rows with nan values in RATING column
movies_df = movies_df.dropna(subset=['RATING'])

#rescale the ratings between 0 and 5 and rouding up to integer
movies_df['RATING'] = round(movies_df['RATING'] * (5/10))

print('Unique values in sentiment column: ', movies_df['RATING'].unique())

movies_df.head()

Number of rows in books dataset:  10000
Unique values in sentiment column:  [ 5.  8.  4.  6.  9.  7.  3.  1.  2. nan 10.]
Unique values in sentiment column:  [2. 4. 3. 0. 1. 5.]


Unnamed: 0,REVIEW,RATING
0,I don't get all the terrible reviews for this ...,2.0
1,I cannot believe anyone could give this film l...,4.0
2,Great White is not the worst way to spend 90 m...,2.0
3,Great White is as basic of a killer shark film...,2.0
4,"Terrible story, dialogue and CGI. The film has...",2.0


In [5]:
#create a pie chart to show the distribution of the data use plotly

import plotly.graph_objects as go

labels = ['Books', 'Restaurants', 'Movies']
values = [len(books_df), len(restaurants_df), len(movies_df)]


fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

#create and center the title
fig.update_layout(title_text='Distribution of the data', title_x=0.5)

#update the legend position to be in the middle below the title
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="center",
    x=0.5
))


fig.show()

In [6]:
# drop all the column in the books_df except the reviewText and overall columns
# reviewerID	asin	reviewerName	helpful	reviewText	overall	summary	unixReviewTime	reviewTime


def convert_rating_to_sentiment(rating):
    
    #conv to int
    rating = int(rating)

    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'


In [7]:

#convert the ratings to sentiment
books_df['overall'] = books_df['overall'].apply(convert_rating_to_sentiment)
restaurants_df['Review'] = restaurants_df['Review'].apply(convert_rating_to_sentiment)
movies_df['RATING'] = movies_df['RATING'].apply(convert_rating_to_sentiment)

In [8]:

#combine the dataframes by stacking them on top of each other with new header name review and sentiment

books_df = books_df.rename(columns={'reviewText': 'review', 'overall': 'sentiment'})
restaurants_df = restaurants_df.rename(columns={'Review Text': 'review', 'Review': 'sentiment'})
movies_df = movies_df.rename(columns={'REVIEW': 'review', 'RATING': 'sentiment'})

#add a new column to the dataframes to identify the source of the data
books_df['source'] = 'books'
restaurants_df['source'] = 'restaurants'
movies_df['source'] = 'movies'

#combine the dataframes

df_comb = pd.concat([books_df, restaurants_df, movies_df], ignore_index=True)
df_comb.head()

Unnamed: 0,review,sentiment,source
0,"I bought both boxed sets, books 1-5. Really a...",positive,books
1,I enjoyed this short book. But it was way way ...,neutral,books
2,I love Nicholas Sparks. I&#8217;ve read everyt...,positive,books
3,I really enjoyed this adventure and look forwa...,positive,books
4,It was a decent read.. typical story line. Not...,neutral,books


In [9]:
#save the combined dataframe to csv file
df_comb.to_csv('combined_train_sentiment.csv', index=False)

In [10]:
#visualize the df_comb dataframe using plotly and bar chart

import plotly.express as px

#create a bar chart to show the distribution of sentiment and source
fig = px.histogram(df_comb, x="sentiment", color="source", barmode="group", title='Distribution of sentiment and source', text_auto=True)

#create a smaller figure
fig.update_layout(height=400, width=600)


fig.show()

In [11]:
#import the test set from csv file
test_df = pd.read_csv('sentiment-topic-final-test.tsv', sep='\t')

#take only the text and sentiment columns
test_df = test_df[['text', 'sentiment']]

#rename the column text to review
test_df = test_df.rename(columns={'text': 'review'})


In [12]:
#look for nan values in the test_df
test_df.isna().sum()


review       0
sentiment    0
dtype: int64

In [13]:

#drop the source column from the df_comb
df_comb = df_comb.drop(['source'], axis=1)


In [14]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mblackcerberus[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
df_comb

Unnamed: 0,review,sentiment
0,"I bought both boxed sets, books 1-5. Really a...",positive
1,I enjoyed this short book. But it was way way ...,neutral
2,I love Nicholas Sparks. I&#8217;ve read everyt...,positive
3,I really enjoyed this adventure and look forwa...,positive
4,It was a decent read.. typical story line. Not...,neutral
...,...,...
30005,"It's master piece by Zack please part 2,3,4 al...",positive
30006,No words to describe. It's awesome. One of the...,positive
30007,Far better than previous one and better editin...,positive
30008,Why did the studio say no to this masterpiece?...,positive


In [16]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import random
import pytorch_lightning as pl
import torch
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


#MODELS
MODELS = {
    'AlBERT': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased', BertConfig),
    'BERT': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased', BertConfig),
    'DistilBERT': (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-uncased', DistilBertConfig),
    'RoBERT' : (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased', BertConfig)
}


# Set the random seed for reproducibility
random.seed(42)
torch.manual_seed(42)

# Map sentiment labels to integers
sentiment_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df_comb['sentiment'] = df_comb['sentiment'].map(sentiment_map)
# Map sentiment labels to integers in test_df
test_df['sentiment'] = test_df['sentiment'].map(sentiment_map)

#make sure review column is string
df_comb['review'] = df_comb['review'].astype(str)
test_df['review'] = test_df['review'].astype(str)

train_df, val_df = train_test_split(df_comb, test_size=0.2, random_state=42)


# Define the model
class SentimentClassifier(pl.LightningModule):
    def __init__(self, model_name_or_path, num_labels=3, learning_rate=2e-5):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask=attention_mask)[0]

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).sum().item() / labels.size(0)
        self.log('val_loss', loss)
        self.log('val_accuracy', accuracy)
        return accuracy

    def on_test_epoch_start(self):
        self.predicted_labels = []
        self.true_labels = []

    def on_test_epoch_end(self):
        print("Classification Report:")
        print(classification_report(self.true_labels, self.predicted_labels, target_names=['positive', 'neutral', 'negative']))

    def accumulate_predicted_labels(self, predicted, true_labels):
        self.predicted_labels.extend(predicted.cpu().numpy())
        self.true_labels.extend(true_labels.cpu().numpy())

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch
        outputs = self(input_ids, attention_mask)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).sum().item() / labels.size(0)
        self.log('test_accuracy', accuracy)
        self.accumulate_predicted_labels(predicted, labels)
        return accuracy

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

wandb_logger = WandbLogger(project='sentiment-analysis', log_model="all")
checkpoint_callback = ModelCheckpoint(
    monitor="val_accuracy",
    mode="max",
    save_top_k=1, # Save only the best model
    filename="best_model-{epoch}-{val_accuracy:.4f}",
    verbose=True,
)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=True)


results = {}
for model_name, (model_class, tokenizer_class, pretrained_weights, config_class) in MODELS.items():
    print(f'Running {model_name} model')
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights, num_labels=3)

    train_tokens = tokenizer.batch_encode_plus(
        train_df['review'].tolist(),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    val_tokens = tokenizer.batch_encode_plus(
        val_df['review'].tolist(),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    test_tokens = tokenizer.batch_encode_plus(
        test_df['review'].tolist(),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Create the data loaders
    batch_size = 32
    train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_df['sentiment'].tolist()))
    val_dataset = TensorDataset(val_tokens['input_ids'], val_tokens['attention_mask'], torch.tensor(val_df['sentiment'].tolist()))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_dataset = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], torch.tensor(test_df['sentiment'].tolist()))
    test_loader = DataLoader(test_dataset, batch_size=batch_size)


    # model
    model = SentimentClassifier(pretrained_weights, num_labels=3, learning_rate=2e-5)
    
    # training with no gpus
    trainer = pl.Trainer(logger=wandb_logger, callbacks=[checkpoint_callback, early_stopping],  max_epochs=1)
    trainer.fit(model, train_loader, val_loader)

    # save the model
    trainer.save_checkpoint(f"{model_name}.ckpt")

    # test the model
    trainer.test(dataloaders=test_loader)

Running AlBERT model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]