In [29]:
# https://www.kaggle.com/code/rishabh15virgo/nlp-with-pytorch-4-yelp-sentiment-analysis

In [30]:
# %pip install numpy==2.2.1 pandas==2.2.3 scikit-learn==1.6.0
# %pip install Flask

In [31]:
# %pip install torch
# %pip install nltk

In [32]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import pandas as pd
import numpy as np
import re
import nltk
import string
from collections import Counter
from argparse import Namespace
import collections
import os

In [33]:
args = Namespace(
    raw_train_dataset_csv="train.csv",
    raw_test_dataset_csv="test.csv",
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="reviews_with_splits_lite.csv",
    seed=1337
)

In [34]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
print(len(df_train))
print(len(df_test))

559999
37999


In [35]:
df_train.head()

Unnamed: 0,1,"Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars."
0,2,Been going to Dr. Goldberg for over 10 years. ...
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,2,All the food is great here. But the best thing...
4,1,Wing sauce is like water. Pretty much a lot of...


In [36]:
# Read raw data
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [37]:
# Select 10% of the total data to run the experiment faster
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
review_subset = []

for _, item_list in sorted(by_rating.items()):

    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])

review_subset = pd.DataFrame(review_subset)

## Creating training, validation, and testing splits

In [38]:
# Splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)
    
final_reviews = pd.DataFrame(final_list)

In [39]:
print(final_reviews.shape)
print(final_reviews.head())

(56000, 3)
   rating                                             review  split
0       1  Terrible place to work for I just heard a stor...  train
1       1  3 hours, 15 minutes-- total time for an extrem...  train
2       1  My less than stellar review is for service.   ...  train
3       1  I'm granting one star because there's no way t...  train
4       1  The food here is mediocre at best. I went afte...  train


## Minimal Data Cleaning

In [40]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

final_reviews.review = final_reviews.review.apply(preprocess_text)

In [41]:
# Mapping positive and negative reviews
mapping_dict = {1 : 'Negative', 2 : 'Positive'}
final_reviews['rating'] = final_reviews['rating'].map(mapping_dict)

In [42]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,Negative,terrible place to work for i just heard a stor...,train
1,Negative,"hours , minutes total time for an extremely s...",train
2,Negative,my less than stellar review is for service . w...,train
3,Negative,i m granting one star because there s no way t...,train
4,Negative,the food here is mediocre at best . i went aft...,train


In [43]:
final_reviews.to_csv("reviews_with_splits.csv", index=False)

In [None]:
class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):
        """
        Args : 
            review_df (pandas.DataFrame) : the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split == "val"]
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split == "test"]
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train' : (self.train_df, self.train_size),
                             'val' : (self.val_df, self.validation_size),
                             'test' : (self.test_df, self.test_size)}
        self.set_split('train')
    
    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Load dataset and make a new vectorizer from scratch
        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        return cls(review_df,ReviewVectorizer.from_dataframe(review_df))
    
    def get_vectorizer(self):
        """ returns the vectorizer"""
        return self._vectorizer
    
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        Args:
            index (int): the index to the data point
        Returns:
            a dict of the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data' : review_vector,
               'y_target' : rating_index}
    
    def get_num_batches(self,batch_size):
        """Given a batch size, return the number of batches in the dataset
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

## Vocabulary
- The first stage in going from text to vectorized minibatch is to map each token to a numerical version of itself. The standard methodology is to have a bijection—a mapping that can be reversed—between the tokens and integers. In Python, this is simply two dictionaries.
- We encapsulate this bijection into a Vocabulary class. The Vocabulary class not only manages this bijection—allowing the user to add new tokens and have the index autoincrement—but also handles a special token called UNK, which stands for “unknown.” By using the UNK token, we can handle tokens at test time that were never seen in training.

In [45]:
class Vocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""
    
    def __init__(self, token_to_idx = None, add_unk = True, unk_token = "<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
    
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}
    
    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.
        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    def lookup_token(self, token):
        """Retrieve the index associated with the token or the UNK index if token isn't present.
        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
            for the UNK functionality
        """
        
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer
The second stage of going from a text dataset to a vectorized minibatch is to iterate through the tokens of an input data point and convert each token to its integer form. The result of this iteration should be a vector. Because this vector will be combined with vectors from other data points, there is a constraint that the vectors produced by the Vectorizer should always have the same length

In [46]:
class ReviewVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab
    
    def vectorize(self, review):
        """Create a collapsed one-hot vector for the review
        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.review_vocab), dtype = np.float32)
        
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot
    
    @classmethod
    def from_dataframe(cls, review_df, cutoff = 25):
        """Instantiate the vectorizer from the dataset dataframe
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency­based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk = True)
        rating_vocab = Vocabulary(add_unk = False)
        
        # Add ratings
        for rating in sorted(set(set(review_df.rating))):
            rating_vocab.add_token(rating)
            
        #Add top words if count > cutoff
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
                    
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)
        
        return cls(review_vocab, rating_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        """Intantiate a ReviewVectorizer from a serializable dictionary
        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching
        Returns:
            contents (dict): the serializable dictionary
        """
        return {'review_vocab': self.review_vocab.to_serializable(),
        'rating_vocab': self.rating_vocab.to_serializable()}

## Dataloader¶
- The final stage of the text to vectorized­minibatch pipeline is to actually group the vectorized data points. Because grouping into minibatches is a vital part of training neural networks.
- PyTorch provides a built-in class called DataLoader for coordinating the process.
- The DataLoader class is instantiated by providing a PyTorch Dataset (such as the ReviewDataset defined for this example), a batch_size, and a handful of other keyword arguments. The resulting object is a Python iterator that groups and collates the data points provided in the Dataset.

In [47]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

## Single layer perceptron classifier
The ReviewClassifier inherits from PyTorch’s Module and creates a single Linear layer with a single output. Because this is a binary classification setting (negative or positive review), this is an appropriate setup. The sigmoid function is used as the final nonlinearity.

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    """ a simple perceptron based classifier"""
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features = num_features,
                            out_features = 1)
        
    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier
        Args:
            x_in (torch.Tensor): an input data tensor
            x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                                should be false if used with the cross­entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,).
        """
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

## Training routine

### Hyperparameter

In [49]:
args = Namespace(
    # Data and path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='reviews_with_splits.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=10,
    seed=1337,
    cuda = True
    # Runtime options omitted for space
)

### General utilities

In [50]:
def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

### Instantiating the dataset, model, loss, optimizer and training state

In [51]:
# %pip install torch torchvision torchaudio

In [52]:
import torch.optim as optim
import torch.nn as nn

def make_train_state(args):
    return {'epoch_index' : 0,
            'train_loss' : [],
            'train_acc' : [],
            'val_loss' : [],
            'val_acc' : [],
            'test_loss' : -1,
            'test_acc' : -1}
train_state = make_train_state(args)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

# handle dirs
handle_dirs(args.save_dir)


# dataset and vectorizer
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

# model
classifier = ReviewClassifier(num_features = len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

# Loass and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr = args.learning_rate)

Using CUDA: False


## The training loop

In [53]:
for epoch_index in range(args.num_epochs):
    train_state["epoch_indx"] = epoch_index
    
    # Iterate over training dataset
    
    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split("train")
    batch_generator = generate_batches(dataset, batch_size = args.batch_size,
                                      device = args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        # Training routine is 5 steps
        
        # Step 1 : zero the gradients
        optimizer.zero_grad()
        
        # Step 2 : compute the output
        y_pred = classifier(x_in = batch_dict['x_data'].float())
        
        # Step 3 : compute the loss
        loss = loss_func(y_pred, batch_dict["y_target"].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss)/(batch_index + 1)
        
        # Step 4 : use loss to produce gradients
        loss.backward()
        
        # Step 5 : use optimizer to take gradient step
        optimizer.step()
        
        # -----------------------------------------------
        # Compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
        
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    print("Epoch # {0} : Train Loss : {1} Train accuracy : {2}".format(epoch_index, running_loss, running_acc))
    
    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        # Step 1 : compute the output
        y_pred = classifier(x_in = batch_dict["x_data"].float())
        
        # Step 2 : compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
    
    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    print("Epoch # {0} : Val Loss : {1} Val accuracy : {2}".format(epoch_index, running_loss, running_acc))

Epoch # 0 : Train Loss : 0.47917016366727994 Train accuracy : 83.86948529411765
Epoch # 0 : Val Loss : 0.38174391526442303 Val accuracy : 88.06490384615381
Epoch # 1 : Train Loss : 0.3291042098422457 Train accuracy : 90.14246323529407
Epoch # 1 : Val Loss : 0.3077550755097316 Val accuracy : 90.57692307692305
Epoch # 2 : Train Loss : 0.27418795929235557 Train accuracy : 91.65900735294125
Epoch # 2 : Val Loss : 0.2743681410184274 Val accuracy : 91.08173076923077
Epoch # 3 : Train Loss : 0.2432389734617246 Train accuracy : 92.38408905228758
Epoch # 3 : Val Loss : 0.2523406413885264 Val accuracy : 91.89903846153847
Epoch # 4 : Train Loss : 0.22264201715101598 Train accuracy : 92.89470996732037
Epoch # 4 : Val Loss : 0.23959222069153424 Val accuracy : 91.77884615384615
Epoch # 5 : Train Loss : 0.20753908693011291 Train accuracy : 93.28022875816983
Epoch # 5 : Val Loss : 0.22971307222659773 Val accuracy : 92.04326923076924
Epoch # 6 : Train Loss : 0.1957074375031819 Train accuracy : 93.56872

### 모델 export

In [None]:
# import pickle

# # 훈련이 끝난 후 train_state를 pkl 파일로 저장
# with open('restaurants_model.pkl', 'wb') as model_file:
#     pickle.dump(train_state, model_file)
    
# print("Training state has been saved to 'restaurants_model.pkl'.")
    
# with open('vectorizer.pkl', 'wb') as vectorizer_file:
#     pickle.dump(vectorizer, vectorizer_file)

# print("Vectorizer has been saved successfully.")

Training state has been saved to 'restaurants_model.pkl'.
Vectorizer has been saved successfully.


In [None]:
# import pickle

# # 저장된 pkl 파일에서 train_state 불러오기
# with open('restaurants_model.pkl', 'rb') as model_file:
#     train_state = pickle.load(model_file)

# print("Loaded Training State from 'restaurants_model.pkl'.")

# # 로드된 train_state에서 필요한 값 추출
# train_loss = train_state.get('train_loss', 'Not available')
# train_acc = train_state.get('train_acc', 'Not available')
# val_loss = train_state.get('val_loss', 'Not available')
# val_acc = train_state.get('val_acc', 'Not available')

# print(f"Train Loss: {train_loss}, Train Accuracy: {train_acc}")
# print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")

Loaded Training State from 'restaurants_model.pkl'.
Train Loss: [0.47917016366727994, 0.3291042098422457, 0.27418795929235557, 0.2432389734617246, 0.22264201715101598, 0.20753908693011291, 0.1957074375031819, 0.18617883115227699, 0.17817861489415954, 0.171292120262104], Train Accuracy: [83.86948529411765, 90.14246323529407, 91.65900735294125, 92.38408905228758, 92.89470996732037, 93.28022875816983, 93.56872957516336, 93.86233660130716, 94.08445669934643, 94.27849264705887]
Validation Loss: [0.38174391526442303, 0.3077550755097316, 0.2743681410184274, 0.2523406413885264, 0.23959222069153424, 0.22971307222659773, 0.22254042533727783, 0.21705695642874792, 0.21305890587659976, 0.21087025495675896], Validation Accuracy: [88.06490384615381, 90.57692307692305, 91.08173076923077, 91.89903846153847, 91.77884615384615, 92.04326923076924, 92.34374999999997, 92.37980769230768, 92.43990384615383, 92.33173076923076]


In [None]:
# import pickle
# import torch

# # 훈련이 끝난 후 train_state를 pkl 파일로 저장
# with open('restaurants_model.pkl', 'wb') as model_file:
#     pickle.dump(train_state, model_file)
# print("Training state has been saved to 'restaurants_model.pkl'.")

# # 벡터라이저를 pkl 파일로 저장
# with open('vectorizer.pkl', 'wb') as vectorizer_file:
#     pickle.dump(vectorizer, vectorizer_file)
# print("Vectorizer has been saved successfully.")

# # 훈련 루프 끝난 후, 모델 상태와 훈련 상태 저장
# train_state['epoch_index'] = epoch_index + 1  # 마지막 에포크 번호 기록
# train_state['train_loss'].append(running_loss)  # 훈련 손실 추가
# train_state['train_acc'].append(running_acc)   # 훈련 정확도 추가

# # 모델 상태와 훈련 상태 딕셔너리로 묶어서 저장
# model_state = {
#     'model_state_dict': classifier.state_dict(),  # 모델 가중치
#     'train_state': train_state,  # 훈련 상태 (손실, 정확도 등)
#     'num_features': len(vectorizer.review_vocab),  # 벡터라이저에서 num_features 추가
#     'vectorizer': vectorizer  # 벡터라이저 추가
# }

# # 모델 가중치 및 훈련 상태 파일로 저장
# torch.save(model_state, args.model_state_file)  # 여기서 args.model_state_file에 모델과 훈련 상태를 저장
# print("훈련 완료! 모델 가중치와 훈련 상태 저장 완료!")


Training state has been saved to 'restaurants_model.pkl'.
Vectorizer has been saved successfully.
훈련 완료! 모델 가중치와 훈련 상태 저장 완료!


## Evaluating on Test dataset

In [57]:
dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                    batch_size=args.batch_size,
                                    device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [58]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.217
Test Accuracy: 91.96


## Inference and classifying new datapoints

In [59]:
def predict_rating(review, classifier, vectorizer,decision_threshold=0.5):
    """Predict the rating of a review
    Args:
        review (str): the text of the review
        classifier (ReviewClassifier): the trained model
        vectorizer (ReviewVectorizer): the corresponding vectorizer
        decision_threshold (float): The numerical boundary which
                                    separates the rating classes
    """
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result = classifier(vectorized_review.view(1, -1))
    probability_value = F.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0
    return vectorizer.rating_vocab.lookup_index(index)
test_review = "this is a pretty awesome book"
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review, prediction))

this is a pretty awesome book -> Positive


## Inspecting model weights

In [60]:
# Sort weights
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()
# Top 20 words
print("Influential words in Positive Reviews:")
print("­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Positive Reviews:
­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­
delicious
amazing
great
fantastic
excellent
awesome
vegas
perfect
love
pleasantly
yum
yummy
wonderful
best
ngreat
favorite
reasonable
loved
solid
definitely


In [61]:
# Top 20 negative words
print("Influential words in Negative Reviews:")
print("­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Negative Reviews:
­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­
worst
mediocre
bland
horrible
rude
terrible
awful
meh
overpriced
tasteless
disgusting
disappointing
poor
ok
not
dirty
poorly
elsewhere
disappointment
unfriendly


## Summary
In this part, we learned some foundational concepts of supervised neural network training. We covered:

- The simplest of neural network models, the perceptron
- In the context of a toy example, the training loop, batch sizes, and epochs
- What generalization means, and good practices to measure generalization performance using training/test/validation splits
- Early stopping and other criteria to determine the termination or convergence of the training algorithm
- What hyperparameters are and a few examples of them, such as the batch size, the learning rate, and so on
- How to classify Yelp restaurant reviews in English using the perceptron model implemented in PyTorch, and how to interpret the model by examining its weights

In [62]:
# test_data = final_reviews[final_reviews['split'] == 'test']

# top_20_positive_indices = indices[:20]  # 긍정적인 리뷰에서 영향력 있는 상위 20개 단어 인덱스
# top_20_negative_indices = indices[-20:]  # 부정적인 리뷰에서 영향력 있는 하위 20개 단어 인덱스

# top_20_positive_words = [vectorizer.review_vocab.lookup_index(idx) for idx in top_20_positive_indices]

# restaurant_scores = {}

# for idx, row in test_data.iterrows():  # test 데이터에서 각 리뷰 순회
#     review = row['review']  # 리뷰 텍스트
#     rating = row['rating']  # 레이블 (Positive / Negative)

#     score = 0
#     review_tokens = review.split()  # 공백을 기준으로 리뷰 단어 분리
#     for word in top_20_positive_words:
#         score += review_tokens.count(word)  # 상위 20개 단어가 등장할 때마다 점수 추가

#     if rating == 'Positive':  # 긍정적인 리뷰에 대해서만 계산
#         restaurant_scores[idx] = score

# sorted_reviews = sorted(restaurant_scores.items(), key=lambda x: x[1], reverse=True)

# print("Reviews with the most influential positive words:")
# for rank, (review_idx, score) in enumerate(sorted_reviews, start=1):
#     print(f"Rank {rank}: Review {review_idx} with score {score}")

In [63]:
import random
import string

# 랜덤한 단어를 생성하는 함수
def generate_random_word(length=5):
    return ''.join(random.choices(string.ascii_lowercase, k=length)).capitalize()

# 레스토랑 이름을 랜덤하게 생성하는 함수
def generate_restaurant_name():
    pattern = random.choice([
        "맛집 {num}",              # 예: 맛집 12345
        "The {word} House",       # 예: The Sushi House
        "{word} Bistro",          # 예: Sushi Bistro
        "King of {word}",         # 예: King of Pizza
        "{word} Garden",          # 예: Sushi Garden
        "{word} Palace",          # 예: Pizza Palace
    ])

    if '{num}' in pattern:
        num = random.randint(10000, 99999)  # 5자리 숫자
        return pattern.format(num=num)
    elif '{word}' in pattern:
        word = generate_random_word()  # 랜덤 단어
        return pattern.format(word=word)

# 1. test 데이터셋만 필터링
test_data = final_reviews[final_reviews['split'] == 'test'].copy()  # .copy()로 슬라이싱된 데이터를 복사

# 2. 랜덤한 레스토랑 이름 생성
# 각 인덱스에 대해 랜덤한 레스토랑 이름을 할당
test_data['restaurant_name'] = [generate_restaurant_name() for _ in range(len(test_data))]

# 3. 모델에서 영향력 있는 상위 20개 단어 추출
top_20_positive_indices = indices[:20]  # 긍정적인 리뷰에서 영향력 있는 상위 20개 단어 인덱스
top_20_negative_indices = indices[-20:]  # 부정적인 리뷰에서 영향력 있는 하위 20개 단어 인덱스

top_20_positive_words = [vectorizer.review_vocab.lookup_index(idx) for idx in top_20_positive_indices]

# 4. test 데이터에서 상위 20개 단어가 얼마나 자주 등장하는지 세기
restaurant_scores = {}

for idx, row in test_data.iterrows():  # test 데이터에서 각 리뷰 순회
    review = row['review']  # 리뷰 텍스트
    rating = row['rating']  # 레이블 (Positive / Negative)
    restaurant_name = row['restaurant_name']  # 새로 생성된 레스토랑 이름 사용
    restaurant_index = idx  # 원래의 인덱스를 레스토랑 인덱스로 사용

    # 리뷰에서 상위 20개 긍정적인 단어가 얼마나 자주 등장하는지 확인
    score = 0
    review_tokens = review.split()  # 공백을 기준으로 리뷰 단어 분리
    for word in top_20_positive_words:
        score += review_tokens.count(word)  # 상위 20개 단어가 등장할 때마다 점수 추가

    # 레스토랑 또는 리뷰 인덱스를 기준으로 점수 저장
    if rating == 'Positive':  # 긍정적인 리뷰에 대해서만 계산
        restaurant_scores[(restaurant_index, restaurant_name)] = score  # 인덱스와 레스토랑 이름을 키로 사용

# 5. 점수를 기준으로 리뷰를 정렬
sorted_reviews = sorted(restaurant_scores.items(), key=lambda x: x[1], reverse=True)

# 6. 순위대로 출력
print("Reviews with the most influential positive words:")
for rank, ((restaurant_index, restaurant_name), score) in enumerate(sorted_reviews, start=1):
    print(f"Rank {rank}: Restaurant {restaurant_index} '{restaurant_name}' with score {score}")


Reviews with the most influential positive words:
Rank 1: Restaurant 52619 'Faann Garden' with score 10
Rank 2: Restaurant 53838 'Rvcxu Bistro' with score 10
Rank 3: Restaurant 54005 'The Yflhq House' with score 10
Rank 4: Restaurant 52796 'Pdszw Bistro' with score 8
Rank 5: Restaurant 54893 '맛집 55170' with score 8
Rank 6: Restaurant 51858 'Frxye Palace' with score 7
Rank 7: Restaurant 52059 'Xzgbo Garden' with score 7
Rank 8: Restaurant 52073 'Rarsl Garden' with score 7
Rank 9: Restaurant 53588 'Ibiwv Palace' with score 7
Rank 10: Restaurant 54199 'Anrwk Bistro' with score 7
Rank 11: Restaurant 54494 'The Mubow House' with score 7
Rank 12: Restaurant 54812 'King of Xospb' with score 7
Rank 13: Restaurant 55207 'Stvvc Palace' with score 7
Rank 14: Restaurant 55524 'King of Strqr' with score 7
Rank 15: Restaurant 55527 'King of Rjgox' with score 6
Rank 16: Restaurant 55846 'Pzhjt Palace' with score 6
Rank 17: Restaurant 51815 '맛집 84957' with score 5
Rank 18: Restaurant 52008 'Mtncy Bist

In [64]:
final_reviews

Unnamed: 0,rating,review,split
0,Negative,terrible place to work for i just heard a stor...,train
1,Negative,"hours , minutes total time for an extremely s...",train
2,Negative,my less than stellar review is for service . w...,train
3,Negative,i m granting one star because there s no way t...,train
4,Negative,the food here is mediocre at best . i went aft...,train
...,...,...,...
55995,Positive,"great food . wonderful , friendly service . i ...",test
55996,Positive,charlotte should be the new standard for moder...,test
55997,Positive,get the encore sandwich ! ! make sure to get i...,test
55998,Positive,i m a pretty big ice cream gelato fan . pretty...,test


In [70]:
torch.save(classifier.state_dict(), 'review_classifier.pth')

In [71]:
import json

with open("vectorizer.json", "w") as fp:
    json.dump(vectorizer.to_serializable(), fp)