### IMPORTING THE MODULES

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

#nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.tokenize import RegexpTokenizer  # regular-expression tokenizer
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup libraray
from bs4 import BeautifulSoup 

import re # regex
import time  # wall-clock timing of training epochs

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report

#preprocessing scikit
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
 
#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

#gensim w2v
#word2vec
from gensim.models import Word2Vec

### LOADING THE DATASET

In [2]:
#rev_frame=pd.read_csv(r'../input/Reviews.csv')
# Load only the four text columns used downstream; each is read with dtype str.
_text_columns = ['item_name', 'description', 'establishment_type', 'CAT_Name']
df = pd.read_csv(
    'Taxml_Integers_Verified.csv',
    index_col=None,
    header=0,
    dtype=dict.fromkeys(_text_columns, str),
    usecols=_text_columns,
)
#df_duplicates = df_integers[df_integers.duplicated(subset=['item_name','description','establishment_type'],keep=False)]


In [8]:
# Extra stop words appended to NLTK's English list.
new_stopwords = ['grocery']
# Stop-word list consulted by preprocess(): NLTK English words plus the extras.
stpwrd = [*nltk.corpus.stopwords.words('english'), *new_stopwords]
# Lemmatizer instance shared by preprocess().
le = WordNetLemmatizer()
def preprocess(sentence):
    r"""Normalise one raw text value into a space-joined string of lemmas.

    The value is stringified and lower-cased, the literal ``{html}`` marker
    is removed, anything matching ``<.*?>`` is stripped, ``http\S+`` URLs
    and digit runs are deleted, and the remainder is split into ``\w+``
    tokens.  Tokens of one or two characters and tokens listed in the
    module-level ``stpwrd`` are discarded; the survivors are lemmatized with
    the module-level ``le`` and joined with single spaces.

    Args:
        sentence: Any value; it is converted with ``str()`` first, so a
            missing value such as NaN is processed as the text ``"nan"``.

    Returns:
        str: The cleaned text (possibly empty).
    """
    text = str(sentence).lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)      # markup tags
    text = re.sub(r'http\S+', '', text)    # URLs
    text = re.sub(r'[0-9]+', '', text)     # digit runs
    # re.findall(r'\w+') yields the same tokens as
    # RegexpTokenizer(r'\w+').tokenize(), without depending on a class the
    # notebook never imported and without building a tokenizer per call.
    tokens = re.findall(r'\w+', text)
    kept = [w for w in tokens if len(w) > 2 and w not in stpwrd]
    # NOTE: `le` must still be the WordNetLemmatizer here; a later cell
    # rebinds `le` to a LabelEncoder, after which this function would fail.
    return " ".join(le.lemmatize(w) for w in kept)
def cat_name_sorted(text):
    """Return the comma-separated entries of *text* in ascending order.

    The string is split on ``','``, the pieces are sorted with the default
    string ordering, and they are re-joined with ``','``.
    """
    labels = text.split(',')
    labels.sort()
    return ','.join(labels)
def cat_name_without_temp(text):
    """Remove the temperature tags from a comma-separated label string.

    The tags ``TEMP_HEATED``, ``TEMP_UNHEATED`` and ``TEMP_COLD`` are
    deleted.  Occurrences preceded by a comma are removed together with that
    comma first, then any remaining bare occurrences are removed.  When a
    tag was the first entry, the separator it leaves behind at the start of
    the string is stripped as well, so ``"TEMP_COLD,ZED"`` becomes ``"ZED"``
    rather than ``",ZED"``.

    Args:
        text: Comma-separated label string.

    Returns:
        str: *text* without the temperature tags; unchanged (including any
        leading comma it already had) when it contains none of them.
    """
    temp_tags = ('TEMP_HEATED', 'TEMP_UNHEATED', 'TEMP_COLD')
    cleaned = text
    # Pass 1: tags that follow another entry take their separator with them.
    for tag in temp_tags:
        cleaned = cleaned.replace(',' + tag, '')
    # Pass 2: tags that had no preceding comma.
    for tag in temp_tags:
        cleaned = cleaned.replace(tag, '')
    if cleaned != text:
        # A removed leading tag leaves its trailing comma at the front.
        cleaned = cleaned.lstrip(',')
    return cleaned



In [9]:
# Work on a reproducible 2000-row sample of the loaded data.
df1 = df.sample(n=2000, random_state=42)
#df1=df.sample(frac=1, random_state=42)

# Drop rows without a label before normalising it: cat_name_sorted() calls
# str.split, which raises AttributeError on a missing (NaN) CAT_Name.
df1 = df1.dropna(subset=['CAT_Name'])
df1['CAT_Name'] = df1['CAT_Name'].progress_apply(cat_name_sorted)
df1['CAT_Name'] = df1['CAT_Name'].progress_apply(cat_name_without_temp)

# Blank / whitespace-only cells become NaN; labels emptied by the tag removal
# above are then dropped.
df1 = df1.replace(r'^\s*$', np.nan, regex=True)
df1 = df1.dropna(subset=['CAT_Name'])

# keep=False discards every copy of a fully duplicated row, not just the extras.
df1 = df1.drop_duplicates(
    subset=['item_name', 'description', 'establishment_type', 'CAT_Name'],
    ignore_index=True,
    keep=False,
)

# Concatenate the non-null text fields into one string, then clean it.
df1['input_str'] = df1[['item_name', 'description', 'establishment_type']].apply(
    lambda row: ' '.join(row[row.notnull()]), axis=1
)
df1['cleanText'] = df1['input_str'].map(preprocess)

# Collapse rows that are identical after cleaning; ignore_index=True already
# renumbers the index, so no separate reset_index is needed.
df1 = df1.drop_duplicates(subset=['CAT_Name', 'cleanText'], ignore_index=True, keep='first')

Unnamed: 0,item_name,description,establishment_type,CAT_Name,input_str
0,Tokuyo Tororo Kombu,1.1 oz. 33 grams. 1.2 ounces.,GROCERY,CAT_PREPACKAGED_FOOD,tokuyo tororo kombu oz grams ounces grocery
1,Rainbow Ice,\N,GROCERY,CAT_CANDY,rainbow ice grocery
2,Master of Mixes Simple Syrup 375 ml,An essential ingredient in many of the most cl...,GROCERY,CAT_PREPACKAGED_FOOD,master mies simple syrup ml essential ingredie...
3,Bagel with Butter Breakfast,,SPECIALITY_STORE,CAT_PREPARED_FOOD,bagel butter breakfast speciality_store
4,"4 Copas Tequila Anejo, 750 ml (40% ABV)",\N,GROCERY,CAT_LIQUOR,copas tequila anejo ml abv grocery
...,...,...,...,...,...
9535,Peach Bellini®,These all-natural gummies are filled with juic...,SPECIALITY_STORE,CAT_CONFECTIONARY,peach bellini allnatural gummies filled juicy ...
9536,E&J BRANDY GRAND BLUE VSOP,750 ML,GROCERY,CAT_LIQUOR,ej brandy grand blue vsop ml grocery
9537,"Coors Light, 18pk-12oz can beer (4.2% ABV)\n",,GROCERY,CAT_BEER,coors light pkoz beer abv grocery
9538,Patron Anejo Tequila 375mL(40.0%ABV),"A distinctly barrel-aged spirit, Patrón Añejo ...",GROCERY,CAT_LIQUOR,patron anejo tequila ml abv distinctly barrela...


In [None]:
# Number of rows sharing each row's CAT_Name.
df1['cat_count'] = df1.groupby('CAT_Name')['CAT_Name'].transform('count')

# df2: labels seen fewer than twice; df3: labels seen more than twice.
# NOTE(review): rows whose label occurs exactly twice fall into neither frame -
# confirm whether that is intended.
_is_rare = df1['cat_count'] < 2
_is_frequent = df1['cat_count'] > 2
df2 = df1.loc[_is_rare].reset_index()
df3 = df1.loc[_is_frequent].reset_index()

# Model inputs and target are taken from the frequent-label frame.
_feature_columns = ['item_name', 'description', 'establishment_type', 'cleanText']
X = df3[_feature_columns]
Y = df3['CAT_Name']

In [None]:
# Split the cleaned texts and their labels into train and held-out sets.
# The inputs are X['cleanText'] and Y from the previous cell: df1 has no
# `label` column, and the following cells tokenize train_X / test_X as text
# and read the labels from train_y / test_y (as pandas Series, via `.values`).
train_X, test_X, train_y, test_y = train_test_split(
    X['cleanText'],
    Y,
    test_size=0.15,
    random_state=42,
)

#### Let us now see if any of the columns has any null values.

In [None]:
from keras.preprocessing.text import Tokenizer

## Tokenize the sentences
# NOTE: `max_features` (vocabulary cap) and `maxlen` (padded sequence length)
# are not defined in this cell; they must be set before it runs.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X)+list(test_X))
# Texts -> integer id sequences -> fixed-length padded arrays.
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=maxlen)

from sklearn.preprocessing import LabelEncoder
# NOTE: this rebinds `le`, which preprocess() uses as its WordNetLemmatizer;
# run all text cleaning before this cell.
le = LabelEncoder()
# Fit on the labels of both splits: LabelEncoder.transform raises ValueError
# for a label it has not seen, which happens when a class occurs only in the
# held-out split.
le.fit(np.concatenate([train_y.values, test_y.values]))
train_y = le.transform(train_y.values)
test_y = le.transform(test_y.values)

In [None]:
def load_glove(word_index,
               embedding_file='../input/glove840b300dtxt/glove.840B.300d.txt',
               max_words=None):
    """Build an embedding matrix for *word_index* from a GloVe text file.

    Each line of the file is split on single spaces: the first field is the
    word, the remaining fields are its coefficients (at most the first 300
    are kept).  Rows of the result are initialised from a normal
    distribution and overwritten with the file's vector for every word that
    is found, trying the word as given and then ``word.capitalize()``.

    Args:
        word_index: Mapping of word -> integer row index.
        embedding_file: Path of the embeddings text file.
        max_words: Upper bound on the number of rows; defaults to the
            module-level ``max_features``.

    Returns:
        numpy.ndarray of shape ``(min(max_words, len(word_index) + 1),
        vector_width)``.

    Raises:
        ValueError: If the file contains no vectors.
    """
    if max_words is None:
        max_words = max_features

    embeddings_index = {}
    # `with` closes the handle; the original left the file open.
    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            word, *coefs = line.rstrip('\n').split(" ")
            if not coefs:
                continue  # blank or malformed line: nothing to store
            embeddings_index[word] = np.asarray(coefs, dtype='float32')[:300]

    if not embeddings_index:
        raise ValueError('no embedding vectors found in %r' % (embedding_file,))

    # Hard-coded initialisation statistics (presumably precomputed for this
    # GloVe file - confirm before using another embedding set).
    emb_mean, emb_std = -0.005838499, 0.48782197
    # Read the width from one vector instead of stacking every vector into a
    # second full-size array just to inspect its shape.
    embed_size = len(next(iter(embeddings_index.values())))

    nb_words = min(max_words, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue  # index falls outside the matrix
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Embedding matrix for the vocabulary collected by the fitted tokenizer.
embedding_matrix = load_glove(word_index=tokenizer.word_index)

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional-LSTM text classifier over frozen pre-trained embeddings.

    The network embeds token ids, runs a bidirectional LSTM, concatenates
    the mean and max of the LSTM outputs over dimension 1, and passes the
    result through a 64-unit ReLU layer with dropout to a linear output
    layer that produces one logit per class.

    Args:
        embedding_weights: 2-D array or tensor of shape (vocab, width) used
            as the frozen embedding table.  Defaults to the module-level
            ``embedding_matrix``.
        n_classes: Number of output logits.  Defaults to
            ``len(le.classes_)`` of the module-level label encoder.
    """

    def __init__(self, embedding_weights=None, n_classes=None):
        super().__init__()
        # Fall back to the notebook globals so that BiLSTM() keeps working.
        if embedding_weights is None:
            embedding_weights = embedding_matrix
        if n_classes is None:
            n_classes = len(le.classes_)

        self.hidden_size = 64
        drp = 0.1
        weights = torch.as_tensor(embedding_weights, dtype=torch.float32)
        # Take the embedding width from the table itself; `embed_size` is not
        # defined at module scope.
        embed_size = weights.shape[1]
        # freeze=True keeps the pre-trained vectors out of gradient updates.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.lstm = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, batch_first=True)
        # 2 directions * hidden_size per pooled vector, two pooled vectors.
        self.linear = nn.Linear(self.hidden_size*4 , 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drp)
        self.out = nn.Linear(64, n_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences *x*."""
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        # Pool over dimension 1 (the sequence axis, since batch_first=True).
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [None]:
n_epochs = 6
# Use the GPU when one is present; otherwise run on the CPU instead of
# failing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTM()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Only parameters that require gradients are optimised (the embedding is frozen).
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
model.to(device)

# Load train and test onto the selected device
x_train = torch.tensor(train_X, dtype=torch.long, device=device)
y_train = torch.tensor(train_y, dtype=torch.long, device=device)
x_cv = torch.tensor(test_X, dtype=torch.long, device=device)
y_cv = torch.tensor(test_y, dtype=torch.long, device=device)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
# NOTE: `batch_size` is not defined in this cell; it must be set beforehand.
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

train_loss = []
valid_loss = []

for epoch in range(n_epochs):
    start_time = time.time()
    # Set model to train configuration
    model.train()
    # With reduction='sum' each batch loss is a sum over its samples, so this
    # is the mean of per-batch summed losses, not a per-sample mean.
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # Set model to validation configuration - no training happens here
    model.eval()
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv),len(le.classes_)))

    # no_grad() avoids building autograd graphs during evaluation.
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch)
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            # keep/store predictions; softmax over the class dimension
            val_preds[i * batch_size:(i+1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()

    # Check Accuracy
    val_accuracy = np.mean(val_preds.argmax(axis=1) == test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))
