In [1]:
# default
import string
import json
from collections import Counter
from functools import partial


# manipulation
import pandas as pd
import numpy as np

# torch
from torch.utils.data import (
    Dataset,
    DataLoader
)
from torch.utils.data.backward_compatibility import worker_init_fn

# Vocabulary 

1. Create the dictionary two dictionary that have bijection.
    1. token to idx
    2. idx to token

In [2]:
class Vocabulary:
    def __init__(self,token_to_idx=None,add_unk=True,unk_token="<UNK>") -> None:
        
        # creating the one dict for token to idx
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        
        # another dict for idx to token
        self._idx_to_token = {idx:token 
                              for token,idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def add_token(self,token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        
        return index
    
    def add_tokens(self,tokens):
        return [self.add_token(token) 
                for token in tokens]
        
    def lookup_token(self,token):
        """
        Return the index of the token
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token,self.unk_index)
        else:
            return self._token_to_idx[token]
        
    def lookup_index(self,index):
        if index not in self._idx_to_token:
            raise KeyError(f"the index {index} is not in the Vocabulary.")
        return self._idx_to_token(index)
    
    def __str__(self) -> str:
        return f"<Vocabulary(size={len(self)})>"
    
    def __len__(self):
        return len(self._token_to_idx)     
    
    @classmethod
    def from_serializable(cls,contents):
        return cls(**contents)  
    
    def to_serializable(self):
        return {"token_to_idx":self._token_to_idx,
                "add_unk":self._add_unk,
                "unk_token":self._unk_token}

In [3]:
data = pd.read_csv("../data/reviews_with_splits_lite.csv")

In [4]:
class Vectorizer:
    def __init__(self,review_vocab,rating_vocab) -> None:
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab
    
    def vectorize(self,review):
        one_hot = np.zeros(shape=len(self.review_vocab),dtype=np.float32)
        
        for token in review:
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        
        return one_hot
    
    @classmethod 
    def from_dataframe(cls,review_df,cutoff=25):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        
        #? creating the rating vocab
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)
        
        #? word the word in whole rating dataframe
        review_vocab.word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    review_vocab.word_counts[word] += 1
        
        #? creating the token greater than 25
        for word,count in sorted(review_vocab.word_counts.items(), key=lambda x: (-x[1], x[0])):
            if count >= cutoff:
                review_vocab.add_token(word)
                
        return cls(review_vocab,rating_vocab)
    
    @classmethod
    def from_serializable(cls,contents):
        review_vocab = Vocabulary.from_serializable(contents["review_vocab"])
        rating_vocab = Vocabulary.from_serializable(contents["rating_vocab"])
        
        return cls(review_vocab=review_vocab,rating_vocab=rating_vocab)
    
    def to_serializable(self):
        return {"review_vocab":self.review_vocab.to_serializable(),
                "rating_vocab":self.rating_vocab.to_serializable()}   

In [5]:
class ReviewDataset(Dataset):
    
    def __init__(self,review_df,vectorizer) -> None:
        super().__init__()
        
        self.review_df = review_df
        self._vectorizer = vectorizer
        
        self.train_df = self.review_df[self.review_df.split == "train"]
        self.train_size = len(self.train_df)
        
        self.val_df = self.review_df[self.review_df.split == "val"]
        self.val_size = len(self.val_df)
        
        self.test_df = self.review_df[self.review_df.split == "test"]
        self.test_size = len(self.test_df)
        
        self.lookup_dict = {"train":(self.train_df,self.train_size),
                            "val":(self.val_df,self.val_size),
                            "test":(self.test_df,self.test_size)}
        
        self.set_split("train")
    
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        
        review_vector = self._vectorizer.vectorize(row.review)
        rating_vector = self._vectorizer.rating_vocab.lookup_token(row.rating)
        
        return {"x_data":review_vector,
                "y_data":rating_vector} 
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls,review_csv):
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split == "train"]
        return cls(review_df,Vectorizer.from_dataframe(train_review_df))
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls,review_csv,vectorizer_filepath):
        review_df = pd.read_csv(review_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(review_df,vectorizer)
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return Vectorizer.from_serializable(json.loads(fp))
        
    def save_vectorizer(self,vectorizer_filepath):
        with open(vectorizer_filepath,"w") as fp:
            json.dump(self._vectorizer.to_serializable(json.load(fp)))
            
    def get_vectorizer(self):
        return self._vectorizer
    
    def set_split(self,split="train"):
        self._target_split = split
        self._target_df,self._target_size = self.lookup_dict[split]
    
    def __len__(self):
        return self._target_size
    
    def get_num_batches(self,batch_size):
        return len(self) // batch_size

In [6]:
def generate_batches(dataset,batch_size,shuffle=True,
                     drop_last=True,device="cpu"):
    dataloader = DataLoader(dataset=dataset,batch_size=batch_size,
                            drop_last=drop_last,shuffle=shuffle,
                            worker_init_fn=worker_init_fn)
    
    for data_dict in dataloader:
        out_data_dict = {}
        for name in data_dict.keys():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict
        

In [7]:
dataset  = ReviewDataset.load_dataset_and_make_vectorizer("../data/reviews_with_splits_lite.csv")

In [8]:
sample = iter(dataset)

In [9]:
row = next(sample)
row

{'x_data': array([1., 0., 0., ..., 0., 0., 0.], dtype=float32), 'y_data': 0}

In [10]:
len(dataset.get_vectorizer().review_vocab)

7497

In [11]:
dataset.get_vectorizer().review_vocab.lookup_token("story")

993

In [12]:
dataset.get_vectorizer().review_vocab.lookup_token("biological")

0

In [13]:
dataset.get_vectorizer().review_vocab.word_counts["biological"]

2

In [14]:
dataset.get_vectorizer().review_vocab.word_counts["place"]

21439

# Using the datapipes

In [15]:
from torchdata import datapipes as dp

In [16]:
file_open = dp.iter.FileOpener(["../data/reviews_with_splits_lite.csv"])

In [17]:
parse_csv = file_open.parse_csv(skip_lines=1)

In [18]:
dataloader = DataLoader(parse_csv,batch_size=1)

In [19]:
next(iter(dataloader))

[('negative',),
 ('terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',),
 ('train',)]

In [20]:
def filter_fn(select,row):
    return row[2] == select

In [21]:
train_filter  = parse_csv.filter(partial(filter_fn,"train"))

In [22]:
next(iter(DataLoader(train_filter,batch_size=1)))

[('negative',),
 ('terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',),
 ('train',)]

In [23]:
def get_review(row):
    return row[1]

In [24]:
train_review = train_filter.map(get_review)

In [25]:
next(iter(DataLoader(train_review,batch_size=1)))

['terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it']

In [26]:
def get_review_token(review):
    # print("getting token",review,type(review))
    return [token 
            for token in review.split(" ")
            if token not in string.punctuation]

In [27]:
train_token = train_review.map(get_review_token)

In [28]:
import torchtext

In [29]:
review_vocab = torchtext.vocab.build_vocab_from_iterator(train_token,min_freq=25,specials=["<unk>"])
review_vocab.set_default_index(0)

In [30]:
review_vocab["<unk>"]

0

In [31]:
review_vocab["sgg"]

0

In [32]:
review_vocab["story"]

993

In [33]:
review_vocab["place"]

35

In [34]:
len(review_vocab)

7497

In [35]:
review_vocab.lookup_token(10)

'in'

In [36]:
review_vocab.lookup_indices(["story"])

[993]

In [37]:
def get_rating(row):
    return [row[0]]
train_rating = train_filter.map(get_rating)
next(iter(train_rating))

['negative']

In [38]:
rating_vocab = torchtext.vocab.build_vocab_from_iterator(train_rating)

In [39]:
rating_vocab.get_itos()

['negative', 'positive']

In [40]:
help(review_vocab.lookup_indices)

Help on method lookup_indices in module torchtext.vocab.vocab:

lookup_indices(tokens: List[str]) -> List[int] method of torchtext.vocab.vocab.Vocab instance
    Args:
        tokens: the tokens used to lookup their corresponding `indices`.
    
    Returns:
        The 'indices` associated with `tokens`.



In [41]:
review_vocab.lookup_indices(["biological"])

[0]

In [42]:
len(review_vocab)

7497

In [43]:
np.zeros(len(review_vocab))[review_vocab.lookup_indices(next(iter(train_token)))] = 1

In [44]:
def create_dataset(review_vocab,rating_vocab,row):
    review_vector = np.zeros(len(review_vocab))
    review_vector[review_vocab.lookup_indices((get_review_token(row[1])))] = 1
    
    rating_vector = rating_vocab.lookup_indices([row[0]])[-1]
    
    return {"x_data":review_vector,
            "y_data":rating_vector}

In [45]:
iter_dataset = train_filter.map(partial(create_dataset,review_vocab,rating_vocab))

In [46]:
sample_iter = next(iter(dataset))

In [47]:
rating_vocab.lookup_indices(["negative"])

[0]

In [48]:
all(sample_iter["x_data"] == row["x_data"])

True

In [49]:
next(iter(parse_csv))

['negative',
 'terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',
 'train']

In [50]:
def filter_fn(split,row):
    return row[2] == split

def review_token_fn(row):
    return [token
            for token in row[1].split(" ")
            if token not in string.punctuation]

def rating_token_fn(row):
    return [row[0]]

In [51]:
def get_spilt(csv,split="train"):
    stream = dp.iter.FileOpener([csv])
    row = stream.parse_csv(skip_lines=1)
    return row.filter(partial(filter_fn,split))
    

In [52]:
def create_vocab(csv,unk_tkn="<unk>"):
    split = get_spilt(csv,"train")
    
    review_token = split.map(review_token_fn)    
    review_vocab = torchtext.vocab.build_vocab_from_iterator(review_token,
                                                             specials=[unk_tkn],min_freq=25)
    review_vocab.set_default_index(review_vocab[unk_tkn])

    rating_token = split.map(rating_token_fn)
    rating_vocab = torchtext.vocab.build_vocab_from_iterator(rating_token)
    
    return review_vocab ,rating_vocab  

In [53]:
review_vocab ,rating_vocab  = create_vocab("../data/reviews_with_splits_lite.csv")

In [54]:
def create_dataset(review_vocab,rating_vocab,row):
    review_vector = np.zeros(len(review_vocab))
    review_vector[review_vocab.lookup_indices((get_review_token(row[1])))] = 1
    
    rating_vector = rating_vocab.lookup_indices([row[0]])[-1]
    
    return {"x_data":review_vector,
            "y_data":rating_vector}

In [55]:
def build_dataset(csv,split,review_vocab,rating_vocab):
    split_iter = get_spilt(csv,split)
    if split == "train":
        split_iter = split_iter.shuffle()
    return split_iter.map(partial(create_dataset,review_vocab,rating_vocab))
    

In [56]:
CSV_PATH = "../data/reviews_with_splits_lite.csv"
train_dataset = build_dataset(CSV_PATH,"train",review_vocab,rating_vocab)
val_dataset = build_dataset(CSV_PATH,"val",review_vocab,rating_vocab)
test_dataset = build_dataset(CSV_PATH,"test",review_vocab,rating_vocab)