In [67]:
# default
import string
import json
from collections import Counter
from functools import partial


# manipulation
import pandas as pd
import numpy as np

# torch
import torch
from torch.utils.data import (
    Dataset,
    DataLoader
)
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data.backward_compatibility import worker_init_fn
from torchmetrics import Accuracy
# utilities
from tqdm import tqdm

# Vocabulary 

1. Create the dictionary two dictionary that have bijection.
    1. token to idx
    2. idx to token

In [4]:
class Vocabulary:
    def __init__(self,token_to_idx=None,add_unk=True,unk_token="<UNK>") -> None:
        
        # creating the one dict for token to idx
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        
        # another dict for idx to token
        self._idx_to_token = {idx:token 
                              for token,idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def add_token(self,token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        
        return index
    
    def add_tokens(self,tokens):
        return [self.add_token(token) 
                for token in tokens]
        
    def lookup_token(self,token):
        """
        Return the index of the token
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token,self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self,index):
        if index not in self._idx_to_token:
            raise KeyError(f"the index {index} is not in the Vocabulary.")
        return self._idx_to_token(index)
    
    def __str__(self) -> str:
        return f"<Vocabulary(size={len(self)})>"
    
    def __len__(self):
        return len(self._token_to_idx)     
    
    @classmethod
    def from_serializable(cls,contents):
        return cls(**contents)  
    
    def to_serializable(self):
        return {"token_to_idx":self._token_to_idx,
                "add_unk":self._add_unk,
                "unk_token":self._unk_token}

In [5]:
data = pd.read_csv("../data/reviews_with_splits_lite.csv")

In [6]:
class Vectorizer:
    def __init__(self,review_vocab,rating_vocab) -> None:
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab
    
    def vectorize(self,review):
        one_hot = np.zeros(shape=len(self.review_vocab),dtype=np.float32)
        
        for token in review:
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        
        return one_hot
    
    @classmethod 
    def from_dataframe(cls,review_df,cutoff=25):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        
        #? creating the rating vocab
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)
        
        #? word the word in whole rating dataframe
        review_vocab.word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    review_vocab.word_counts[word] += 1
        
        #? creating the token greater than 25
        for word,count in sorted(review_vocab.word_counts.items(), key=lambda x: (-x[1], x[0])):
            if count >= cutoff:
                review_vocab.add_token(word)
                
        return cls(review_vocab,rating_vocab)
    
    @classmethod
    def from_serializable(cls,contents):
        review_vocab = Vocabulary.from_serializable(contents["review_vocab"])
        rating_vocab = Vocabulary.from_serializable(contents["rating_vocab"])
        
        return cls(review_vocab=review_vocab,rating_vocab=rating_vocab)
    
    def to_serializable(self):
        return {"review_vocab":self.review_vocab.to_serializable(),
                "rating_vocab":self.rating_vocab.to_serializable()}   

In [7]:
class ReviewDataset(Dataset):
    
    def __init__(self,review_df,vectorizer) -> None:
        super().__init__()
        
        self.review_df = review_df
        self._vectorizer = vectorizer
        
        self.train_df = self.review_df[self.review_df.split == "train"]
        self.train_size = len(self.train_df)
        
        self.val_df = self.review_df[self.review_df.split == "val"]
        self.val_size = len(self.val_df)
        
        self.test_df = self.review_df[self.review_df.split == "test"]
        self.test_size = len(self.test_df)
        
        self.lookup_dict = {"train":(self.train_df,self.train_size),
                            "val":(self.val_df,self.val_size),
                            "test":(self.test_df,self.test_size)}
        
        self.set_split("train")
    
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        
        review_vector = self._vectorizer.vectorize(row.review)
        rating_vector = self._vectorizer.rating_vocab.lookup_token(row.rating)
        
        return {"x_data":review_vector,
                "y_data":rating_vector} 
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls,review_csv):
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split == "train"]
        return cls(review_df,Vectorizer.from_dataframe(train_review_df))
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls,review_csv,vectorizer_filepath):
        review_df = pd.read_csv(review_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(review_df,vectorizer)
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return Vectorizer.from_serializable(json.loads(fp))
        
    def save_vectorizer(self,vectorizer_filepath):
        with open(vectorizer_filepath,"w") as fp:
            json.dump(self._vectorizer.to_serializable(json.load(fp)))
            
    def get_vectorizer(self):
        return self._vectorizer
    
    def set_split(self,split="train"):
        self._target_split = split
        self._target_df,self._target_size = self.lookup_dict[split]
    
    def __len__(self):
        return self._target_size
    
    def get_num_batches(self,batch_size):
        return len(self) // batch_size

In [8]:
def generate_batches(dataset,batch_size,shuffle=True,
                     drop_last=True,device="cpu"):
    dataloader = DataLoader(dataset=dataset,batch_size=batch_size,
                            drop_last=drop_last,shuffle=shuffle,
                            worker_init_fn=worker_init_fn,
                            )
    
    for data_dict in dataloader:
        out_data_dict = {}
        for name in data_dict.keys():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict
        

In [9]:
dataset  = ReviewDataset.load_dataset_and_make_vectorizer("../data/reviews_with_splits_lite.csv")

In [10]:
sample = iter(dataset)

In [11]:
row = next(sample)
row

{'x_data': array([1., 0., 0., ..., 0., 0., 0.], dtype=float32), 'y_data': 0}

In [12]:
len(dataset.get_vectorizer().review_vocab)

7497

In [13]:
dataset.get_vectorizer().review_vocab.lookup_token("story")

993

In [14]:
dataset.get_vectorizer().review_vocab.lookup_token("biological")

0

In [15]:
dataset.get_vectorizer().review_vocab.word_counts["biological"]

2

In [16]:
dataset.get_vectorizer().review_vocab.word_counts["place"]

21439

# Using the datapipes

In [17]:
from torchdata import datapipes as dp

In [18]:
file_open = dp.iter.FileOpener(["../data/reviews_with_splits_lite.csv"])

In [19]:
parse_csv = file_open.parse_csv(skip_lines=1)

In [20]:
dataloader = DataLoader(parse_csv,batch_size=1)

In [21]:
next(iter(dataloader))

[('negative',),
 ('terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',),
 ('train',)]

In [22]:
def filter_fn(select,row):
    return row[2] == select

In [23]:
train_filter  = parse_csv.filter(partial(filter_fn,"train"))

In [24]:
next(iter(DataLoader(train_filter,batch_size=1)))

[('negative',),
 ('terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',),
 ('train',)]

In [25]:
def get_review(row):
    return row[1]

In [26]:
train_review = train_filter.map(get_review)

In [27]:
next(iter(DataLoader(train_review,batch_size=1)))

['terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it']

In [28]:
def get_review_token(review):
    # print("getting token",review,type(review))
    return [token 
            for token in review.split(" ")
            if token not in string.punctuation]

In [29]:
train_token = train_review.map(get_review_token)

In [30]:
import torchtext

In [31]:
review_vocab = torchtext.vocab.build_vocab_from_iterator(train_token,min_freq=25,specials=["<unk>"])
review_vocab.set_default_index(0)

In [32]:
review_vocab["<unk>"]

0

In [33]:
review_vocab["sgg"]

0

In [34]:
review_vocab["story"]

993

In [35]:
review_vocab["place"]

35

In [36]:
len(review_vocab)

7497

In [37]:
review_vocab.lookup_token(10)

'in'

In [38]:
review_vocab.lookup_indices(["story"])

[993]

In [39]:
def get_rating(row):
    return [row[0]]
train_rating = train_filter.map(get_rating)
next(iter(train_rating))

['negative']

In [40]:
rating_vocab = torchtext.vocab.build_vocab_from_iterator(train_rating)

In [41]:
rating_vocab.get_itos()

['negative', 'positive']

In [42]:
help(review_vocab.lookup_indices)

Help on method lookup_indices in module torchtext.vocab.vocab:

lookup_indices(tokens: List[str]) -> List[int] method of torchtext.vocab.vocab.Vocab instance
    Args:
        tokens: the tokens used to lookup their corresponding `indices`.
    
    Returns:
        The 'indices` associated with `tokens`.



In [43]:
review_vocab.lookup_indices(["biological"])

[0]

In [44]:
len(review_vocab)

7497

In [45]:
np.zeros(len(review_vocab))[review_vocab.lookup_indices(next(iter(train_token)))] = 1

In [46]:
def create_dataset(review_vocab,rating_vocab,row):
    review_vector = np.zeros(len(review_vocab))
    review_vector[review_vocab.lookup_indices((get_review_token(row[1])))] = 1
    
    rating_vector = rating_vocab.lookup_indices([row[0]])[-1]
    
    return {"x_data":review_vector,
            "y_data":rating_vector}

In [47]:
iter_dataset = train_filter.map(partial(create_dataset,review_vocab,rating_vocab))

In [48]:
sample_iter = next(iter(dataset))

In [49]:
rating_vocab.lookup_indices(["negative"])

[0]

In [50]:
all(sample_iter["x_data"] == row["x_data"])

True

In [51]:
next(iter(parse_csv))

['negative',
 'terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',
 'train']

In [52]:
def filter_fn(split,row):
    return row[2] == split

def review_token_fn(row):
    return [token
            for token in row[1].split(" ")
            if token not in string.punctuation]

def rating_token_fn(row):
    return [row[0]]

In [53]:
def get_spilt(csv,split="train"):
    stream = dp.iter.FileOpener([csv])
    row = stream.parse_csv(skip_lines=1)
    return row.filter(partial(filter_fn,split))  

In [54]:
def create_vocab(csv,unk_tkn="<unk>"):
    split = get_spilt(csv,"train")
    
    review_token = split.map(review_token_fn)    
    review_vocab = torchtext.vocab.build_vocab_from_iterator(review_token,
                                                             specials=[unk_tkn],min_freq=25)
    review_vocab.set_default_index(review_vocab[unk_tkn])

    rating_token = split.map(rating_token_fn)
    rating_vocab = torchtext.vocab.build_vocab_from_iterator(rating_token)
    
    return review_vocab ,rating_vocab  

In [55]:
review_vocab ,rating_vocab  = create_vocab("../data/reviews_with_splits_lite.csv")

In [56]:
def create_dataset(review_vocab,rating_vocab,row):
    review_vector = np.zeros(len(review_vocab))
    review_vector[review_vocab.lookup_indices((get_review_token(row[1])))] = 1
    
    rating_vector = rating_vocab.lookup_indices([row[0]])[-1]
    
    return {"x_data":review_vector,
            "y_data":rating_vector}

In [57]:
def build_dataset(csv,split,review_vocab,rating_vocab):
    split_iter = get_spilt(csv,split)
    if split == "train":
        split_iter = split_iter.shuffle()
    return split_iter.map(partial(create_dataset,review_vocab,rating_vocab))
    

In [58]:
CSV_PATH = "../data/reviews_with_splits_lite.csv"
train_dataset = build_dataset(CSV_PATH,"train",review_vocab,rating_vocab)
val_dataset = build_dataset(CSV_PATH,"val",review_vocab,rating_vocab)
test_dataset = build_dataset(CSV_PATH,"test",review_vocab,rating_vocab)

In [73]:
type(train_dataset)

torch.utils.data.datapipes.iter.callable.MapperIterDataPipe

# Classifier

In [59]:
class ReviewClassifier(nn.Module):
    def __init__(self,num_feature) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_feature,
                             out_features=1)
        
    def forward(self,input,apply_sigmoid=False):
        y_out = self.fc1(input).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

# Training

## Setting
Hyperparameter and program option are stored.

In [60]:
def set_seed_everywhere(seed,cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

In [61]:
from argparse import Namespace
from pathlib import Path

args = Namespace(
    # data and path information
    frequency_cutoff = 25,
    model_state_file = "model.pth",
    review_csv = "../data/reviews_with_splits_lite.csv",
    save_dir = "../models/yelp/",
    # Model hyperparameters
    # Training Hyperparameters
    batch_size = 128,
    early_stopping_criteria = 5,
    learning_rate = 0.001,
    num_epochs = 100,
    seed = 42,
    # Runtime options
    cuda=True,
    expand_filepaths_to_save_dir = True,
    reload_from_file = False
)

if args.expand_filepaths_to_save_dir:
    if not Path(args.save_dir).exists():
        Path(args.save_dir).mkdir(parents=True,exist_ok=True)
    args.model_state_file = str(Path(args.save_dir)/args.model_state_file)
    
    print("Expand file paths: ")
    print(f"{args.model_state_file}")
    
args.cuda = torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")
set_seed_everywhere(args.seed,args.cuda)

Expand file paths: 
..\models\yelp\model.pth


## Helper function

In [62]:
def make_train_state(args):
    return {"stop_early":False,
            "early_stopping_step":0,
            "early_stopping_best_val":1e8,
            "learning_rate":args.learning_rate,
            "epoch_index":0,
            "train_loss":[],
            "train_acc":[],
            "val_loss":[],
            "val_acc":[],
            "test_loss":[],
            "test_acc":[],
            "model_filename":args.model_state_file}

In [70]:
def update_train_state(args,model,train_state):
    
    # save one model at least
    if train_state["epoch_index"] == 0:
        torch.save(model.state_dict(),train_state["model_filename"])
        train_state["stop_early"] = False
        
    # save the model if performed
    elif train_state["epoch_index"] >= 0:
        loss_tm1 , loss_t = train_state["train_loss"][-2:]
        
        # if model get worsen
        if loss_t >= train_state["early_stopping_best_val"]:
            # update the early stopping step
            train_state["early_stopping_step"] += 1
        
        # loss decreased that is model is learning
        else:
            # save the model
            if loss_t < train_state["early_stopping_best_val"]:
                torch.save(model.state_dict(),train_state["model_filename"])
                
            # reset early stopping step
            train_state["early_stopping_step"] = 0
        
        #stop early
        train_state["stop_early"] = train_state["early_stopping_step"] >= args.early_stopping_criteria
    
    return train_state

## Training Loop

In [71]:
classifier = ReviewClassifier(num_feature=len(review_vocab)).to(args.device)

loss_fn = nn.BCEWithLogitsLoss()
acc_fn = Accuracy(task="binary",num_classes=2)
optimizer = optim.Adam(params=classifier.parameters(),
                       lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode="min",
                                                 factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)

In [72]:
for epoch_index in tqdm(range(args.num_epochs)):
    train_state["epoch_index"] = epoch_index
    
    batch_generator = generate_batches(dataset=train_dataset,batch_size=args.batch_size,
                                       device=args.device)
    
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        # zero grad
        optimizer.zero_grad()
        
        # forward pass
        y_pred = classifier(batch_dict["x_data"].float())
        
        # compute the loss
        loss = loss_fn(y_pred,batch_dict["y_data"].float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_idx+1)
        
        # back loss
        loss.backward()
        
        # use optimizer to take gradient step
        optimizer.step()
        
        # compute the accuracy
        acc_t = acc_fn(y_pred,batch_dict["y_data"]).item()
        running_acc += (acc_t - running_acc) / (batch_idx + 1)
        
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    
    
    # iter over val dataset
    batch_generator = generate_batches(dataset=val_dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)
    
    running_acc = 0.0
    running_loss = 0.0
    classifier.eval()
    
    for batch_idx,batch_dict in enumerate(batch_generator):
        with torch.inference_mode():
            y_pred = classifier(batch_dict["x_data"].float())
            
            loss = loss_fn(y_pred,batch_dict["y_data"].float())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_idx +1)
            
            # compute teh accuracy
            acc_t = acc_fn(y_pred,batch_dict["y_data"].float()).item()
            running_acc += (acc_t - running_acc) /(batch_idx + 1)
            
    train_state["val_loss"].append(running_loss)
    train_state["val_acc"].append(running_acc)
    
    train_state = update_train_state(args=args,model=classifier,train_state=train_state)
    scheduler.step(train_state["val_loss"][-1])
            
    if train_state["stop_early"]:
        break 
                          

  5%|▌         | 5/100 [02:00<38:14, 24.15s/it]


KeyboardInterrupt: 