In [1]:
import sys
import re
import os
from pathlib import Path
from collections import namedtuple
import numpy as np

### Taking data set from Seeval 2016 - task 5 subset 1 : http://alt.qcri.org/semeval2016/task5/

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict
import re
import ast
pd.set_option('display.max_colwidth' , -1)

### English - Restaurant domain training data

In [3]:
eng_multi_aspects = pd.read_csv('../data/English_restaurants.csv')
eng_multi_aspects['aspects'] = eng_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
eng_multi_aspects['polarities'] = eng_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
eng_multi_aspects.head(2)

Unnamed: 0,aspects,polarities,text
0,[RESTAURANT#GENERAL],[negative],"Judging from previous posts this used to be a good place, but not any longer."
1,[SERVICE#GENERAL],[negative],"We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude."


In [4]:
eng_multi_aspects.aspects.apply(pd.Series).merge(eng_multi_aspects , right_index = True , left_index = True)\
.drop(['aspects' , 'polarities'] ,axis = 1).melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna().value.value_counts(normalize = True)

FOOD#QUALITY                0.338652
SERVICE#GENERAL             0.179099
RESTAURANT#GENERAL          0.168329
AMBIENCE#GENERAL            0.101715
FOOD#STYLE_OPTIONS          0.054647
RESTAURANT#MISCELLANEOUS    0.039091
FOOD#PRICES                 0.035899
RESTAURANT#PRICES           0.031911
DRINKS#QUALITY              0.018748
DRINKS#STYLE_OPTIONS        0.012764
LOCATION#GENERAL            0.011169
DRINKS#PRICES               0.007978
Name: value, dtype: float64

### Dutch  - Restaurant domain training data

In [4]:
du_multi_aspects = pd.read_csv('../data/Dutch_restaurants.csv')
du_multi_aspects['aspects'] = du_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
du_multi_aspects['polarities'] = du_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
du_multi_aspects.head(2)

Unnamed: 0,aspects,polarities,text
0,[SERVICE#GENERAL],[negative],Lange wachttijd.
1,[FOOD#STYLE_OPTIONS],[negative],"Zelfde dessert, 2 dagen na mekaar."


### Spanish Restaurant domain training data

In [5]:
spanish_multi_aspects = pd.read_csv('../data/Spanish_restaurants.csv')
spanish_multi_aspects['aspects'] = spanish_multi_aspects['aspects'].apply(lambda x: ast.literal_eval(x))
spanish_multi_aspects['polarities'] = spanish_multi_aspects['polarities'].apply(lambda x: ast.literal_eval(x))
spanish_multi_aspects.head(2)

Unnamed: 0,aspects,polarities,text
0,[RESTAURANT#GENERAL],[positive],Nos sentimos muy a gusto.
1,"[SERVICE#GENERAL, AMBIENCE#GENERAL, FOOD#QUALITY]","[positive, positive, positive]","Buen servicio, ambiente Acogedor y tranquilo, comida bien."


In [6]:
def extract_aspects(lst_aspects):
    res = []
    for x in lst_aspects:
        entity = x.split('#')[0] ; attribute = x.split('#')[1] ; res.append(entity)       
    return res
eng_multi_aspects['aspects2'] = eng_multi_aspects['aspects'].apply(lambda x: extract_aspects(x))
eng_multi_aspects['aspects2'] = eng_multi_aspects['aspects2'].apply(lambda x : list(set(x)))

du_multi_aspects['aspects2'] = du_multi_aspects['aspects'].apply(lambda x: extract_aspects(x))
du_multi_aspects['aspects2'] = du_multi_aspects['aspects2'].apply(lambda x : list(set(x)))

spanish_multi_aspects['aspects2'] = spanish_multi_aspects['aspects'].apply(lambda x: extract_aspects(x))
spanish_multi_aspects['aspects2'] = spanish_multi_aspects['aspects2'].apply(lambda x : list(set(x)))

In [426]:
eng_multi_aspects[eng_multi_aspects['aspects2'].apply(lambda x: ' '.join(x)).str.contains('AMBIENCE')].head()

Unnamed: 0,text,aspects2,polarities2
21,"Everything is always cooked to perfection, the service is excellent, the decor cool and understated.","[SERVICE, AMBIENCE, FOOD]","[positive, positive, positive]"
28,"The decor is night tho...but they REALLY need to clean that vent in the ceiling...its quite un-appetizing, and kills your effort to make this place look sleek and modern.",[AMBIENCE],[negative]
36,Ambiance- relaxed and stylish.,[AMBIENCE],[positive]
58,Quite simply it's like stepping out of Manhattan and into a haven of tranquility.,[AMBIENCE],[positive]
70,"The ambience is pretty and nice for conversation, so a casual lunch here would probably be best.","[RESTAURANT, AMBIENCE]","[positive, positive]"


In [427]:
spanish_multi_aspects[spanish_multi_aspects['aspects2'].apply(lambda x: ' '.join(x)).str.contains('AMBIENCE')].head()

Unnamed: 0,text,aspects2,polarities2,aspects_pred
1,"Buen servicio, ambiente Acogedor y tranquilo, comida bien.","[SERVICE, AMBIENCE, FOOD]","[positive, positive, positive]","[AMBIENCE, SERVICE]"
10,"la calidad del producto, el servicio, el entorno todo fue excelente","[SERVICE, AMBIENCE, FOOD]","[positive, positive, positive]","[FOOD, SERVICE]"
15,"Fabuloso, muy atentos la comida excelente y un ambiente estupendo.","[SERVICE, RESTAURANT, AMBIENCE, FOOD]","[positive, positive, positive, positive]","[AMBIENCE, FOOD]"
18,"El restaurante es precioso, moderno y acogedor.",[AMBIENCE],[positive],[AMBIENCE]
26,"La verdad es que todo muy bien; el servicio, la comida y la apariencia, todo correcto.","[SERVICE, RESTAURANT, AMBIENCE, FOOD]","[positive, positive, positive, positive]","[FOOD, SERVICE]"


In [7]:
eng_multi_aspects.aspects2.apply(pd.Series).merge(eng_multi_aspects , right_index = True , left_index = True)\
.drop(['aspects' , 'polarities', 'aspects2'] ,axis = 1).melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna().value.value_counts(normalize = True).reset_index()[['index']]\
.to_csv('../data/apsect_names.txt' , header = None , index = None , mode = 'w')

### Extract aspect embeddings

In [9]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

DATA_PATH = Path("../data/")
CACHE_PATH = Path("cache/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 
SPACE_NORMALIZER = re.compile("\s+")
Batch = namedtuple('Batch', 'srcs tokens lengths')

In [10]:
from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

#### Following steps from https://medium.com/the-artificial-impostor/multilingual-similarity-search-using-pretrained-bidirectional-lstm-encoder-e34fac5958b0 for tokenization , BPE Fast and Embedding extractions 

In [11]:
encoder = SentenceEncoder(
    str(MODEL_PATH / "bilstm.93langs.2018-12-26.pt"),
    max_sentences=None,
    max_tokens=10000,
    cpu=False)
bpe_codes = str(MODEL_PATH / "93langs.fcodes")

for lang in ("en" ,"nl", 'es'):  ##"zh" for chinese , nl  for dutch and es for spanish
    Token(  #../data/apsect_names.txt
        str(DATA_PATH / f"apsect_names.txt"), ##english_resturant.txt
        str(CACHE_PATH / f"apsect_names.txt"),
        lang=lang,
        romanize=False,
        lower_case=True, gzip=False,
        verbose=True)
    BPEfastApply(
        str(CACHE_PATH / f"apsect_names.txt"),
        str(CACHE_PATH / f"apsect_names.bpe"),
        bpe_codes,
        verbose=True, over_write=True)
    EncodeFile(
        encoder,
        str(CACHE_PATH / f"apsect_names.bpe"),
        str(CACHE_PATH / f"apsect_names.enc"),
        verbose=True, over_write=True)    

 - Tokenizer: apsect_names.txt exists already
 - Tokenizer: apsect_names.txt exists already
 - Tokenizer: apsect_names.txt exists already


In [12]:
data_aspect, index_aspect = IndexCreate(
     str(CACHE_PATH / f"apsect_names.enc"), 'FlatL2', verbose=True, save_index=False)

 - embedding: cache/apsect_names.enc 6 examples of dim 1024
 - creating FAISS index


In [13]:
data_aspect.shape

(6, 1024)

## Extract Setence Embeddings

In [11]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

DATA_PATH = Path("../data/tatoeba/v1/")
CACHE_PATH = Path("cache/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 
SPACE_NORMALIZER = re.compile("\s+")
Batch = namedtuple('Batch', 'srcs tokens lengths')

data_en, index_en = IndexCreate(
    str(CACHE_PATH / "en_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_du, index_du = IndexCreate(
    str(CACHE_PATH / "nl_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_spanish, index_spanish = IndexCreate(
    str(CACHE_PATH / "es_resturant.enc"), 'FlatL2', verbose=True, save_index=False)

 - embedding: cache/en_resturant.enc 1708 examples of dim 1024
 - creating FAISS index
 - embedding: cache/nl_resturant.enc 1317 examples of dim 1024
 - creating FAISS index
 - embedding: cache/es_resturant.enc 1626 examples of dim 1024
 - creating FAISS index


##### Because dataset of semeval is not exact translation of each other , some of the above results are not good. 

### Creating multi label classification task using LASER sentence embedding. 
We can have 6 aspect categories , present for each review. We will train a simple 1 layer Neural Network model using 1024 dimensional sentence embedding as input and 6 categories as output.  
Train the model on 1700 English sentences and Validate on 1300 Dutch sentences . We are getting around 85 % accuracy and 57% f1 score(Macro) 

In [12]:
eng_multi_aspects.head(2)

Unnamed: 0,aspects,polarities,text,aspects2
0,[RESTAURANT#GENERAL],[negative],"Judging from previous posts this used to be a good place, but not any longer.",[RESTAURANT]
1,[SERVICE#GENERAL],[negative],"We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.",[SERVICE]


In [13]:
eng_multi_aspects.drop(columns=[ 'aspects'] , inplace=True)
du_multi_aspects.drop(columns=[ 'aspects'] , inplace=True)
spanish_multi_aspects.drop(columns=['aspects'] , inplace=True)

In [16]:

eng_multi_aspects['len_aspects'] = eng_multi_aspects['aspects2'].apply(lambda x: len(x))
eng_multi_aspects['polarities2'] = eng_multi_aspects.apply(lambda x: x['polarities'][0 : int(x['len_aspects'])]  , axis = 1)

du_multi_aspects['len_aspects'] = du_multi_aspects['aspects2'].apply(lambda x: len(x))
du_multi_aspects['polarities2'] = du_multi_aspects.apply(lambda x: x['polarities'][0 : int(x['len_aspects'])]  , axis = 1)

spanish_multi_aspects['len_aspects'] = spanish_multi_aspects['aspects2'].apply(lambda x: len(x))
spanish_multi_aspects['polarities2'] = spanish_multi_aspects.apply(lambda x: x['polarities'][0 : int(x['len_aspects'])]  , axis = 1)

eng_multi_aspects.drop(columns=[  'polarities' , 'len_aspects' ] , inplace=True)
du_multi_aspects.drop(columns=[ 'polarities' , 'len_aspects'] , inplace=True)
spanish_multi_aspects.drop(columns=[  'polarities' , 'len_aspects'] , inplace=True)

eng_multi_aspects.head(2)

Unnamed: 0,text,aspects2,polarities2
0,"Judging from previous posts this used to be a good place, but not any longer.",[RESTAURANT],[negative]
1,"We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.",[SERVICE],[negative]


In [17]:
from sklearn.model_selection import train_test_split
train_aspects , val_aspects, train_df , val_df = train_test_split(eng_multi_aspects, data_en , test_size = 0.2 , random_state = 42)

In [21]:
train_aspects.head(2)

Unnamed: 0,text,aspects2,polarities2
590,"The food was very good, a great deal, and the place its self was great.","[AMBIENCE, FOOD]","[positive, positive]"
1551,Terrible would be a compliment!,[RESTAURANT],[negative]


In [23]:
val_aspects.head()

Unnamed: 0,text,aspects2,polarities2
567,The lobster sandwich is $24 and although it was good it was not nearly enough to warrant that price.,[FOOD],[positive]
1325,"I go out to eat and like my courses, servers are patient and never rush courses or force another drink.",[SERVICE],[positive]


In [24]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(train_aspects.aspects2)
val_eng = mlb.transform(val_aspects.aspects2)
y_du  = mlb.transform(du_multi_aspects.aspects2)
y_spainish  = mlb.transform(spanish_multi_aspects.aspects2)

In [25]:
train_aspects.reset_index(inplace=True , drop= True)
train_fn = pd.merge(train_aspects , pd.DataFrame(tr_eng , columns=mlb.classes_) , left_index=True , right_index=True)

val_aspects.reset_index(inplace=True , drop= True)
val_fn = pd.merge(val_aspects , pd.DataFrame(val_eng , columns=mlb.classes_) , left_index=True , right_index=True)

#train_fn.drop(columns=['aspects2' ] , inplace=True)

# train_fn.to_csv('resturant_train_eng.csv' , index = False)
# val_fn.to_csv('resturant_val_eng.csv' , index = False)

In [27]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler().fit(train_df)
train_std = std_scale.transform(train_df) 
val_std = std_scale.transform(val_df)
dutch_std = std_scale.transform(data_du)
spanish_std = std_scale.transform(data_spanish)

In [28]:
train_std.shape , val_std.shape , dutch_std.shape , spanish_std.shape

((1366, 1024), (342, 1024), (1317, 1024), (1626, 1024))

In [29]:
import torch 
import torch
import torch.nn as nn

x_train,y_train,x_valid,y_valid , x_test , y_test  , x_test_sp , y_test_sp = map(torch.FloatTensor, (train_std,tr_eng,  val_std ,\
                                                                            val_eng, dutch_std,y_du, \
                                                                           spanish_std ,y_spainish ))
n,c = x_train.shape
y_train = y_train.type(torch.FloatTensor)
y_valid = y_valid.type(torch.FloatTensor)
y_test = y_test.type(torch.FloatTensor)
y_test_sp = y_test_sp.type(torch.FloatTensor)

print(y_train.shape , y_valid.shape , y_test.shape)
print(x_train.shape , x_valid.shape , y_test.shape)
batch_size = 64

torch.Size([1366, 6]) torch.Size([342, 6]) torch.Size([1317, 6])
torch.Size([1366, 1024]) torch.Size([342, 1024]) torch.Size([1317, 6])


In [30]:
# model = torch.nn.Sequential(
#     torch.nn.Linear(1024, 512),
#     torch.nn.Dropout(0.25),  # drop 10% of the neuron
#     torch.nn.ReLU(),
#     torch.nn.Linear(512, 384),
#     torch.nn.Dropout(0.25),  # drop 10% of the neuron
#     torch.nn.ReLU(),
#     torch.nn.Linear(384, 6),
# )

# print(model)

class Model(nn.Module):
    def __init__(self , p):
        super().__init__()
        self.hidden = nn.Linear(1024, 512)
        self.hidden2 = nn.Linear(512 , 256)
        self.hidden3 =  nn.Linear(256 , 128)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(128, 6)

    def forward(self, x):
        x = self.activation(self.dropout(self.hidden(x)))
        x = self.activation(self.dropout(self.hidden2(x)))
        x = self.activation(self.dropout(self.hidden3(x)))
        x = self.fc(x)
        return x

In [31]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size , shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid )
valid_dl = DataLoader(valid_ds , batch_size= batch_size)

test_ds = TensorDataset(x_test , y_test)
test_dl = DataLoader(test_ds , batch_size=batch_size)

test_ds2 = TensorDataset(x_test_sp , y_test_sp)
test_dl2 = DataLoader(test_ds2 , batch_size=batch_size)

In [32]:
class WrappedDataLoader():
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func
        
    def __len__(self): return len(self.dl)
    
    def __iter__(self):
        batches = iter(self.dl)
        for b in batches: yield(self.func(*b))

In [33]:
dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def preprocess(x,y): return x.to(dev),y.to(dev)

train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)
test_dl = WrappedDataLoader(test_dl , preprocess)
test_dl2 = WrappedDataLoader(test_dl2 , preprocess)

In [36]:
train_aspects.columns

Index(['text', 'aspects2', 'polarities2'], dtype='object')

In [37]:
df_data_ratio = train_aspects.aspects2.apply(pd.Series).merge(train_aspects , right_index = True , left_index = True)\
.drop([ 'aspects2' , 'polarities2'] ,axis = 1).melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna().value.value_counts(normalize = True).reset_index()

In [38]:
df_data_ratio

Unnamed: 0,index,value
0,FOOD,0.361111
1,RESTAURANT,0.270531
2,SERVICE,0.198671
3,AMBIENCE,0.117754
4,DRINKS,0.038043
5,LOCATION,0.013889


In [39]:

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.mean() #/ (len(correct))
    return acc


def fbeta_score(y_true, y_pred, beta, threshold, eps=1e-9):
    beta2 = beta**2

    y_pred = torch.ge(torch.sigmoid(y_pred).float(), threshold).float()
    y_true = y_true.float()

    true_positive = (y_pred * y_true).sum(dim=0)
    precision = true_positive.div(y_pred.sum(dim=0).add(eps))
    recall = true_positive.div(y_true.sum(dim=0).add(eps))
    
    return torch.mean(
        (precision*recall).
        div(precision.mul(beta2) + recall + eps).
        mul(1 + beta2)) , torch.mean(precision) , torch.mean(recall)


def f1_score(y_pred,y_true, threshold=0.5):
    f1 , precision , recall = fbeta_score(y_true, y_pred, 1, threshold) #; print('f1 score' , f1)
    return f1 , precision , recall

In [40]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0  
    epoch_f1 = 0 ; epoch_precision = 0 ; epoch_recall = 0
    model.train()
    ct = 0
    for x, y in iterator:
        optimizer.zero_grad()
        predictions = model(x)
        loss = criterion(predictions, y)
        acc = binary_accuracy(predictions, y)
        f1 , precision , recall = f1_score(predictions , y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1.item() 
        epoch_precision += precision.item()  
        epoch_recall += recall.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_precision/len(iterator), epoch_recall/len(iterator)

In [41]:
def validate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0 
    epoch_f1 = 0; epoch_precision = 0 ; epoch_recall = 0
    model.eval()
    with torch.no_grad():
        for x ,y  in iterator:

            predictions = model(x)#.squeeze(1)
            loss = criterion(predictions,y)
            acc = binary_accuracy(predictions, y) ; f1 , precision , recall = f1_score(predictions , y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1.item()   ; epoch_precision += precision.item()  ; epoch_recall += recall.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_precision/len(iterator), epoch_recall/len(iterator)

In [42]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

        
weight_list =  [1/df_data_ratio[df_data_ratio['index']==c]['value'].values[0]  for c in mlb.classes_]
weights = torch.tensor( weight_list)
weights =weights.to(dev)

from torch import optim


### Apply grid search on LR , Weight Decay , Dropout parameters , save the parameters with best f1-score on validatation data .

In [43]:
import random 
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [245]:
best_valid_f1 = -float('inf') ; best_valid_loss = float('inf')
loss_func = nn.BCEWithLogitsLoss(weight=weights) 
loss_func = loss_func.to(dev)
for drp in [0.2, 0.3,0.4,0.5,0.6]:
    for wd in [0.1 , 0.05 , 0.01 , 0.005 , 0.001]:
        for learning_rate in [1e-2 , 5e-3 , 1e-3]:
            model = Model(drp); model.apply(init_weights)
            model = model.to(dev)
            optimizer = optim.Adam(model.parameters() , lr = learning_rate, weight_decay=wd) #[a+'_pred' for a in aspects]
            model = model.to(dev)
            epochs = 10
            for epoch in range(1, epochs + 1):
                train_loss , train_acc , train_f1 , train_precision , train_recall = train_model(model, train_dl, optimizer, loss_func)
                valid_loss , valid_acc , valid_f1 , valid_precision , valid_recall  = validate_model(model, valid_dl, loss_func)
                if (valid_loss < best_valid_loss) & (valid_f1 > best_valid_f1)  & (abs(train_f1- valid_f1) <= 0.05):
                    best_valid_f1 = valid_f1 ; best_valid_loss = valid_loss
                    print('train data' , train_acc , train_f1 , train_precision , train_recall)
                    print('valid data' , valid_acc ,  valid_f1 , valid_precision , valid_recall)


                    print("Parameters: " ,'Dropout: ' ,  drp , 'weight decay: ' ,wd ,' learning rate : ' ,learning_rate )
                    if os.path.isfile('utils/multi_label_problem.pt'):
                        os.remove('utils/multi_label_problem.pt') ; print('chk')
                                           
                    torch.save(model.state_dict(), 'utils/multi_label_problem.pt')

NameError: name 'train' is not defined

In [45]:

loss_func = nn.BCEWithLogitsLoss(weight=weights) 
loss_func = loss_func.to(dev)
model = Model(0.5)
model.load_state_dict(torch.load('utils/multi_label_problem.pt'))
model = model.to(dev)
validate_model(model, valid_dl, loss_func)

(1.7336950500806172,
 0.9144570827484131,
 0.5815974275271097,
 0.6596053044001261,
 0.5424553056557974)

In [46]:
val_preds = []
val_label = []
with torch.no_grad():
    for x ,y  in valid_dl:
        predictions = model(x)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        val_preds.append(preds)
        val_label.append(y.data.cpu().numpy())

# from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score
# print("F1 score",f1_score(np.vstack(val_label)  , np.vstack(val_preds) , average='macro' ))
# print("Precision score",precision_score( np.vstack(val_label)  , np.vstack(val_preds) , average='macro' ))
# print("Recall score",recall_score(np.vstack(val_label)  , np.vstack(val_preds) , average='macro' ))
# print("Accuracy score" , np.mean( np.vstack(val_preds) == np.vstack(val_label)))

In [53]:
mlb.classes_

array(['AMBIENCE', 'DRINKS', 'FOOD', 'LOCATION', 'RESTAURANT', 'SERVICE'],
      dtype=object)

In [59]:
val_aspects['aspects_pred'] = pd.Series(mlb.inverse_transform(np.vstack(val_preds)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
test_preds = []
true_label = []
with torch.no_grad():
    for x ,y  in test_dl:
        predictions = model(x)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)
        true_label.append(y.data.cpu().numpy())

In [62]:
du_multi_aspects['aspects_pred']  = pd.Series(mlb.inverse_transform(np.vstack(test_preds)))

In [63]:
du_multi_aspects.head()

Unnamed: 0,text,aspects2,polarities2,aspects_pred
0,Lange wachttijd.,[SERVICE],[negative],"(RESTAURANT, SERVICE)"
1,"Zelfde dessert, 2 dagen na mekaar.",[FOOD],[negative],"(FOOD,)"
2,Ontbijtbuffet was tip top in orde.,[FOOD],[positive],"(FOOD,)"
3,Niet goedkoop!,[RESTAURANT],[negative],"(RESTAURANT,)"
4,Maar eens in het kasteelrestaurant aangekomen werd het een feest.,[RESTAURANT],[positive],"(RESTAURANT,)"


In [48]:
aspects = mlb.classes_.tolist()
"""
Merging prediction value with original test data and observe the metrics on overall level
"""
dutch_pred = pd.DataFrame(np.vstack(test_preds) ,index=du_multi_aspects.index , columns= [a+'_pred' for a in aspects])
dutch_pred2 = pd.merge(du_multi_aspects, dutch_pred , left_index=True ,right_index = True)

from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score

print("F1 score",f1_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(y_du == dutch_pred2[[a+'_pred' for a in aspects]].as_matrix()))
print("Precision score",precision_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Recall score",recall_score(y_du , dutch_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))


F1 score 0.5464933540468624
Accuracy score 0.8764869653252341
Precision score 0.6988147708882922
Recall score 0.48241459832236133


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]


In [64]:
test_preds = []
true_label = []
with torch.no_grad():
    for x ,y  in test_dl2:
        predictions = model(x)#.squeeze(1)
        rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = rounded_preds.data.cpu().numpy()
        test_preds.append(preds)
        true_label.append(y.data.cpu().numpy())
        

In [65]:
spanish_multi_aspects['aspects_pred']  = pd.Series(mlb.inverse_transform(np.vstack(test_preds)))

In [49]:

        
aspects = mlb.classes_.tolist()
"""
Merging prediction value with original test data and observe the metrics on overall level
"""
spanish_pred = pd.DataFrame(np.vstack(test_preds) ,index=spanish_multi_aspects.index , columns= [a+'_pred' for a in aspects])
spanish_pred2 = pd.merge(spanish_multi_aspects, spanish_pred , left_index=True ,right_index = True)

from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score

print("F1 score",f1_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Accuracy score" , np.mean(y_spainish == spanish_pred2[[a+'_pred' for a in aspects]].as_matrix()))
print("Precision score",precision_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))
print("Recall score",recall_score(y_spainish , spanish_pred2[[a+'_pred' for a in aspects]].as_matrix() , average='macro' ))

F1 score 0.5536945586868443
Accuracy score 0.8819188191881919
Precision score 0.7265248425337454
Recall score 0.47608903361016713




In [107]:
all_pred = dutch_pred2[dutch_pred2.columns[-6:]].as_matrix()
all_true = y_du
true_pos = (all_pred * all_true).sum(axis = 0)
precision = true_pos/all_pred.sum(axis = 0)
recall = true_pos/all_true.sum(axis = 0)
accuracy = all_pred.sum(axis = 0)/ all_true.sum(axis =0)

  """Entry point for launching an IPython kernel.


In [108]:
precision , recall

(array([0.68085106, 0.8490566 , 0.76570048, 0.20930233, 0.66349206,
        0.86646884]),
 array([0.35955056, 0.48913043, 0.57952468, 0.33333333, 0.59206799,
        0.6952381 ]))

In [109]:
(2*precision*recall / (precision + recall))

array([0.47058824, 0.62068966, 0.65972945, 0.25714286, 0.6257485 ,
       0.77146631])

In [77]:
spanish_multi_aspects.head(2)

Unnamed: 0,text,aspects2,polarities2,aspects_pred
0,Nos sentimos muy a gusto.,[RESTAURANT],[positive],[RESTAURANT]
1,"Buen servicio, ambiente Acogedor y tranquilo, comida bien.","[SERVICE, AMBIENCE, FOOD]","[positive, positive, positive]","[AMBIENCE, SERVICE]"


In [76]:
du_multi_aspects.head(2)

Unnamed: 0,text,aspects2,polarities2,aspects_pred
0,Lange wachttijd.,[SERVICE],[negative],"[RESTAURANT, SERVICE]"
1,"Zelfde dessert, 2 dagen na mekaar.",[FOOD],[negative],[FOOD]


In [75]:
val_aspects.head(2)

Unnamed: 0,text,aspects2,polarities2,aspects_pred
0,The lobster sandwich is $24 and although it was good it was not nearly enough to warrant that price.,[FOOD],[positive],[FOOD]
1,"I go out to eat and like my courses, servers are patient and never rush courses or force another drink.",[SERVICE],[positive],[SERVICE]


In [74]:
val_aspects['aspects_pred'] = val_aspects['aspects_pred'].apply(lambda x: list(x))
du_multi_aspects['aspects_pred'] = du_multi_aspects['aspects_pred'].apply(lambda x: list(x))
spanish_multi_aspects['aspects_pred'] = spanish_multi_aspects['aspects_pred'].apply(lambda x: list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [106]:
#train_df = val_aspects.copy()
def ungrp(train_df):
    asp_df = train_df.aspects2.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects2' , 'polarities2' , 'aspects_pred'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    polarity_df = train_df.polarities2.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects2' , 'polarities2' , 'aspects_pred'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    train_df_ungrp = pd.merge(asp_df , polarity_df['value'] , left_index = True , right_index = True ,suffixes=('_aspects' , '_polarities'))
    train_df_ungrp.rename(columns={'value_aspects' : 'aspects' , 'value_polarities':'polarities'} , inplace=True)

    train_df_ungrp2 = pd.merge(train_df_ungrp , train_df[[ 'text' ,'aspects_pred']] , on ='text')
    return train_df_ungrp2
#val_aspects_ungrp = train_df_ungrp2.copy()


In [108]:
val_aspects_ungrp  = ungrp(val_aspects)

In [111]:
du_aspects_ungrp = ungrp(du_multi_aspects)
sp_aspects_ungrp= ungrp(spanish_multi_aspects)

In [114]:
du_aspects_ungrp.shape , sp_aspects_ungrp.shape

((1629, 4), (2321, 4))

In [116]:
def check_ind(x):
    asp = x['aspects_pred']
    if x['aspects'] in asp:
        return 1
    else:
        return 0

In [121]:
val_aspects_ungrp['ind'] = val_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)
du_aspects_ungrp['ind'] = du_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)
sp_aspects_ungrp['ind'] = sp_aspects_ungrp.apply(lambda x: check_ind(x) , axis = 1)

In [122]:
val_aspects_ungrp.ind.mean() , du_aspects_ungrp.ind.mean() , sp_aspects_ungrp.ind.mean()

(0.7194244604316546, 0.6224677716390423, 0.6432572167169324)

In [128]:
def ungrp2(train_df):
    asp_df = train_df.aspects2.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects2' , 'polarities2'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    polarity_df = train_df.polarities2.apply(pd.Series).merge(train_df , right_index = True , left_index = True)\
    .drop(['aspects2' , 'polarities2'] ,axis = 1)\
    .melt(id_vars = ['text']).drop(['variable'] , axis = 1).dropna()

    train_df_ungrp = pd.merge(asp_df , polarity_df['value'] , left_index = True , right_index = True ,suffixes=('_aspects' , '_polarities'))
    train_df_ungrp.rename(columns={'value_aspects' : 'aspects' , 'value_polarities':'polarities'} , inplace=True)

    return train_df_ungrp
train_aspects_ungrp = ungrp2(train_aspects)

In [129]:
train_aspects_ungrp.head()

Unnamed: 0,text,aspects,polarities
0,"The food was very good, a great deal, and the place its self was great.",AMBIENCE,positive
1,Terrible would be a compliment!,RESTAURANT,negative
2,The place is a lot of fun.,AMBIENCE,positive
3,"However, I think Jeckll and Hydes t is one of those places that is fun to do once.",RESTAURANT,positive
4,The service was friendly and the atmosphere was casual.,SERVICE,positive


In [423]:
val_aspects_ungrp[val_aspects_ungrp['text']=='Everything was wonderful; food, drinks, staff, mileau.']

Unnamed: 0,text,aspects,polarities,aspects_pred,ind
20,"Everything was wonderful; food, drinks, staff, mileau.",SERVICE,positive,"[DRINKS, FOOD]",0
21,"Everything was wonderful; food, drinks, staff, mileau.",DRINKS,positive,"[DRINKS, FOOD]",1
22,"Everything was wonderful; food, drinks, staff, mileau.",RESTAURANT,positive,"[DRINKS, FOOD]",0
23,"Everything was wonderful; food, drinks, staff, mileau.",FOOD,positive,"[DRINKS, FOOD]",1
24,"Everything was wonderful; food, drinks, staff, mileau.",AMBIENCE,positive,"[DRINKS, FOOD]",0


## Sentiment Classification model

In [424]:
val_aspects_ungrp2 = val_aspects_ungrp[val_aspects_ungrp['ind']==1]
du_aspects_ungrp2 = du_aspects_ungrp[du_aspects_ungrp['ind']==1]
sp_aspects_ungrp2 = sp_aspects_ungrp[sp_aspects_ungrp['ind']==1]

In [425]:
val_aspects_ungrp2[['text' , 'aspects' , 'polarities']]

Unnamed: 0,text,aspects,polarities
0,The lobster sandwich is $24 and although it was good it was not nearly enough to warrant that price.,FOOD,positive
1,"I go out to eat and like my courses, servers are patient and never rush courses or force another drink.",SERVICE,positive
2,"Great place, great value.",RESTAURANT,positive
3,Make sure you try this place as often as you can.,RESTAURANT,positive
5,I've had my fair share of modern Japanese and this spot delivers.,FOOD,positive
6,Thank You Emilio.,RESTAURANT,positive
7,"Excellent atmosphere, delicious dishes good and friendly service.",SERVICE,positive
8,"Excellent atmosphere, delicious dishes good and friendly service.",AMBIENCE,positive
9,"Excellent atmosphere, delicious dishes good and friendly service.",FOOD,positive
10,"lobster was good, nothing spectacular.",FOOD,neutral


In [156]:
#"en" ,"nl", 'es'
train_aspects_ungrp[['text']].to_csv('../data/tatoeba/v2/en_resturant.csv' , header = None , index = None , mode = 'w')
val_aspects_ungrp2[['text']].to_csv('../data/tatoeba/v2/en_val.csv' , header = None , index = None , mode = 'w')

du_aspects_ungrp2[['text']].to_csv('../data/tatoeba/v2/nl_resturant.csv' , header = None , index = None , mode = 'w')
sp_aspects_ungrp2[['text']].to_csv('../data/tatoeba/v2/es_resturant.csv' , header = None , index = None , mode = 'w')

In [157]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

DATA_PATH = Path("../data/tatoeba/v2/")
CACHE_PATH = Path("cache2/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 
SPACE_NORMALIZER = re.compile("\s+")
Batch = namedtuple('Batch', 'srcs tokens lengths')

In [158]:
from indexing import IndexLoad, IndexTextOpen, IndexTextQuery, IndexSearchKNN, IndexCreate, IndexSearchMultiple
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [159]:
encoder = SentenceEncoder(
    str(MODEL_PATH / "bilstm.93langs.2018-12-26.pt"),
    max_sentences=None,
    max_tokens=10000,
    cpu=False)
bpe_codes = str(MODEL_PATH / "93langs.fcodes")

for lang in ("en" ,"nl", 'es'):  ##"zh" for chinese , nl  for dutch and es for spanish
    Token(
        str(DATA_PATH / f"{lang}_resturant.csv"), ##english_resturant.txt
        str(CACHE_PATH / f"{lang}_resturant.csv"),
        lang=lang,
        romanize=False,
        lower_case=True, gzip=False,
        verbose=True)
    BPEfastApply(
        str(CACHE_PATH / f"{lang}_resturant.csv"),
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        bpe_codes,
        verbose=True, over_write=True)
    EncodeFile(
        encoder,
        str(CACHE_PATH / f"{lang}_resturant.bpe"),
        str(CACHE_PATH / f"{lang}_resturant.enc"),
        verbose=True, over_write=True)    
    
    
Token(
    str(DATA_PATH / f"en_val.csv"), ##english_resturant.txt
    str(CACHE_PATH / f"en_val.csv"),
    lang=lang,
    romanize=False,
    lower_case=True, gzip=False,
    verbose=True)
BPEfastApply(
    str(CACHE_PATH / f"en_val.csv"),
    str(CACHE_PATH / f"en_val.bpe"),
    bpe_codes,
    verbose=True, over_write=True)
EncodeFile(
    encoder,
    str(CACHE_PATH / f"en_val.bpe"),
    str(CACHE_PATH / f"en_val.enc"),
    verbose=True, over_write=True)    

 - Tokenizer: en_resturant.csv in language en  
 - fast BPE: processing en_resturant.csv
 - Encoder: en_resturant.bpe to en_resturant.enc
 - Encoder: 1656 sentences in 0s
 - Tokenizer: nl_resturant.csv in language nl  
 - fast BPE: processing nl_resturant.csv
 - Encoder: nl_resturant.bpe to nl_resturant.enc
 - Encoder: 1014 sentences in 0s
 - Tokenizer: es_resturant.csv in language es  
 - fast BPE: processing es_resturant.csv
 - Encoder: es_resturant.bpe to es_resturant.enc
 - Encoder: 1493 sentences in 0s
 - Tokenizer: en_val.csv in language es  
 - fast BPE: processing en_val.csv
 - Encoder: en_val.bpe to en_val.enc
 - Encoder: 300 sentences in 0s


In [160]:
train_en, index_tr_en = IndexCreate(
    str(CACHE_PATH / "en_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
val_en, index_val_en = IndexCreate(
    str(CACHE_PATH / "en_val.enc"), 'FlatL2', verbose=True, save_index=False)

data_du, index_du = IndexCreate(
    str(CACHE_PATH / "nl_resturant.enc"), 'FlatL2', verbose=True, save_index=False)
data_spanish, index_spanish = IndexCreate(
    str(CACHE_PATH / "es_resturant.enc"), 'FlatL2', verbose=True, save_index=False)

 - embedding: cache2/en_resturant.enc 1656 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/en_val.enc 300 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/nl_resturant.enc 1014 examples of dim 1024
 - creating FAISS index
 - embedding: cache2/es_resturant.enc 1493 examples of dim 1024
 - creating FAISS index


## Create Word embeddings for aspect words

In [142]:
import  pickle
word2vec = pickle.load(open("/data/swati.tiwari/Kaggle/yelp/capability_absa/src/utils/word2vec_google.pkl", 'rb'))


In [146]:
word_embeddings ={}

In [151]:
word_embeddings['FOOD']  = word2vec.get_vector('FOOD')
word_embeddings['RESTAURANT']  = word2vec.get_vector('RESTAURANT')
word_embeddings['SERVICE']  = word2vec.get_vector('SERVICE')
word_embeddings['AMBIENCE']  = word2vec.get_vector('AMBIENCE')
word_embeddings['DRINKS']  = word2vec.get_vector('DRINKS')
word_embeddings['LOCATION']  = word2vec.get_vector('LOCATION')

In [185]:
val_aspects_ungrp2.reset_index(inplace=True , drop= True)
train_aspects_ungrp.reset_index(inplace=True , drop= True)
sp_aspects_ungrp2.reset_index(inplace=True , drop = True)
du_aspects_ungrp2.reset_index(inplace=True , drop = True)

In [190]:
ct_val_en= np.empty((0 , 1324))
for index , row in val_aspects_ungrp2.iterrows():
    #print(index)
    #print(len(val_en[index]))
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((val_en[index] , w2v ) , axis =0 ))
    ct_val_en = np.append(ct_val_en ,[res] , axis = 0 )
     


In [199]:
ct_tr_en= np.empty((0 , 1324))
for index , row in train_aspects_ungrp.iterrows():
    #print(index)
    #print(len(val_en[index]))
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((train_en[index] , w2v ) , axis =0 ))
    ct_tr_en = np.append(ct_tr_en ,[res] , axis = 0 )

print(ct_tr_en.shape)

(1656, 1324)


In [200]:
print(train_aspects_ungrp.shape)

(1656, 3)


In [201]:
ct_du= np.empty((0 , 1324))
for index , row in du_aspects_ungrp2.iterrows():
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((data_du[index] , w2v ) , axis =0 ))
    ct_du = np.append(ct_du ,[res] , axis = 0 )

print(ct_du.shape) ; print(du_aspects_ungrp2.shape)

(1014, 1324)
(1014, 5)


In [202]:
ct_spanish= np.empty((0 , 1324))
for index , row in sp_aspects_ungrp2.iterrows():
    w2v = word_embeddings[row['aspects']]
    res = (np.concatenate((data_spanish[index] , w2v ) , axis =0 ))
    ct_spanish = np.append(ct_spanish ,[res] , axis = 0 )

print(ct_spanish.shape) ; print(sp_aspects_ungrp2.shape)

(1493, 1324)
(1493, 5)


In [204]:
train_aspects_ungrp.polarities.value_counts()

positive    1093
negative    483 
neutral     80  
Name: polarities, dtype: int64

In [227]:
sp_aspects_ungrp2[sp_aspects_ungrp2['polarities']=='conflict']

Unnamed: 0,text,aspects,polarities,aspects_pred,ind
1463,"Es un lugar entrañable de Barcelona que hay que conocer, ambiente inigualable (pero ruidoso, no es para una cena íntima) y cosmopolita.",AMBIENCE,conflict,[AMBIENCE],1


In [238]:
sp_aspects_ungrp2.at[1463 , 'polarities'] = 'neutral'

In [239]:
sp_aspects_ungrp2.iloc[1463]

text            Es un lugar entrañable de Barcelona que hay que conocer, ambiente inigualable (pero ruidoso, no es para una cena íntima) y cosmopolita.
aspects         AMBIENCE                                                                                                                               
polarities      neutral                                                                                                                                
aspects_pred    [AMBIENCE]                                                                                                                             
ind             1                                                                                                                                      
Name: 1463, dtype: object

In [277]:
def change_target(x):
    if x=='positive':
        return 2
    elif x =='negative':
        return 1
    else:
        return 0 

In [278]:
train_aspects_ungrp.head(2)

Unnamed: 0,text,aspects,polarities
0,"The food was very good, a great deal, and the place its self was great.",AMBIENCE,positive
1,Terrible would be a compliment!,RESTAURANT,negative


In [279]:
train_aspects_ungrp['polarities'] = train_aspects_ungrp['polarities'].apply(lambda x: change_target(x))

In [316]:
train_aspects_ungrp.head()

Unnamed: 0,text,aspects,polarities
0,"The food was very good, a great deal, and the place its self was great.",AMBIENCE,2
1,Terrible would be a compliment!,RESTAURANT,1
2,The place is a lot of fun.,AMBIENCE,2
3,"However, I think Jeckll and Hydes t is one of those places that is fun to do once.",RESTAURANT,2
4,The service was friendly and the atmosphere was casual.,SERVICE,2


In [281]:
val_aspects_ungrp2['polarities'] = val_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))
sp_aspects_ungrp2['polarities'] = sp_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))
du_aspects_ungrp2['polarities'] = du_aspects_ungrp2['polarities'].apply(lambda x: change_target(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [282]:
from sklearn.preprocessing import MultiLabelBinarizer
#mlb  = MultiLabelBinarizer()
# tr_eng =  pd.get_dummies(train_aspects_ungrp.polarities).as_matrix() #mlb.fit_transform(train_aspects_ungrp.polarities)
# val_eng = pd.get_dummies(val_aspects_ungrp2.polarities).as_matrix()  #mlb.transform(val_aspects_ungrp2.polarities)
# y_du  = pd.get_dummies(du_aspects_ungrp2.polarities).as_matrix()
# y_spainish  = pd.get_dummies(sp_aspects_ungrp2.polarities).as_matrix()

tr_eng =  train_aspects_ungrp['polarities'].values
val_eng =val_aspects_ungrp2['polarities'].values  #mlb.transform(val_aspects_ungrp2.polarities)
y_du  = du_aspects_ungrp2['polarities'].values
y_spainish  = sp_aspects_ungrp2['polarities'].values
tr_eng.shape , val_eng.shape , y_du.shape , y_spainish.shape

((1656,), (300,), (1014,), (1493,))

In [243]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler().fit(ct_tr_en)
train_std = std_scale.transform(ct_tr_en) 
val_std = std_scale.transform(ct_val_en)
dutch_std = std_scale.transform(ct_du)
spanish_std = std_scale.transform(ct_spanish)

In [283]:
import torch 
import torch
import torch.nn as nn

x_train,y_train,x_valid,y_valid , x_test , y_test  , x_test_sp , y_test_sp = map(torch.FloatTensor, (train_std,tr_eng,  val_std ,\
                                                                            val_eng, dutch_std,y_du, \
                                                                           spanish_std ,y_spainish ))
n,c = x_train.shape
y_train = y_train.type(torch.LongTensor)
y_valid = y_valid.type(torch.LongTensor)
y_test = y_test.type(torch.LongTensor)
y_test_sp = y_test_sp.type(torch.LongTensor)

print(y_train.shape , y_valid.shape , y_test.shape)
print(x_train.shape , x_valid.shape , x_test.shape)
batch_size = 64

torch.Size([1656]) torch.Size([300]) torch.Size([1014])
torch.Size([1656, 1324]) torch.Size([300, 1324]) torch.Size([1014, 1324])


In [284]:
class Model(nn.Module):
    def __init__(self , p):
        super().__init__()
        self.hidden = nn.Linear(1324, 64)
#         self.hidden2 = nn.Linear(512 , 256)
#         self.hidden3 =  nn.Linear(256 , 128)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.fc = nn.Linear(64, 3)

    def forward(self, x):
        x = self.activation(self.dropout(self.hidden(x)))
        x = self.fc(x)
        return x

In [285]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size , shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid )
valid_dl = DataLoader(valid_ds , batch_size= batch_size)

test_ds = TensorDataset(x_test , y_test)
test_dl = DataLoader(test_ds , batch_size=batch_size)

test_ds2 = TensorDataset(x_test_sp , y_test_sp)
test_dl2 = DataLoader(test_ds2 , batch_size=batch_size)

In [286]:
class WrappedDataLoader():
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func
        
    def __len__(self): return len(self.dl)
    
    def __iter__(self):
        batches = iter(self.dl)
        for b in batches: yield(self.func(*b))

In [287]:

dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def preprocess(x,y): return x.to(dev),y.to(dev)

train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)
test_dl = WrappedDataLoader(test_dl , preprocess)
test_dl2 = WrappedDataLoader(test_dl2 , preprocess)

In [348]:

# def binary_accuracy(preds, y):
#     rounded_preds = torch.round(torch.sigmoid(preds))
#     correct = (rounded_preds == y).float() #convert into float for division 
#     acc = correct.mean() #/ (len(correct))
#     return acc


def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
#     print('y',y[0])
#     print('pred' , max_preds[0])
    correct = max_preds.squeeze(1).eq(y)   
    return correct.sum() / torch.FloatTensor([y.shape[0]])

def f1_scorepy(preds , y):
    max_preds = preds.argmax(dim = 1, keepdim = True)
    res = f1_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro')  
    prec = precision_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro')  
    rec = recall_score(y.data.cpu().numpy() , max_preds.data.cpu().numpy(), average='macro') 
    return  res , prec , rec


In [349]:
from sklearn.metrics import f1_score , recall_score , precision_score

In [350]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0  
    epoch_f1 = 0    ; epoch_pr = 0 ; epoch_rec = 0
    model.train()
    ct = 0
    for x, y in iterator:
        optimizer.zero_grad()
        predictions = model(x)
        loss = criterion(predictions, y)
        acc = categorical_accuracy(predictions, y)
        f1 , pr , recall  = f1_scorepy(predictions , y) #; print(f1)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1  ; epoch_pr += pr  ; epoch_rec += recall  
    return epoch_loss / len(iterator), epoch_acc / len(iterator) , epoch_f1/len(iterator), epoch_pr/len(iterator), epoch_rec/len(iterator)

In [351]:
def validate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0 
    epoch_f1 = 0 ; epoch_pr = 0 ; epoch_rec = 0
    model.eval()
    with torch.no_grad():
        for x ,y  in iterator:

            predictions = model(x)#.squeeze(1)
            loss = criterion(predictions,y)
            acc = categorical_accuracy(predictions, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            f1 , pr , recall  = f1_scorepy(predictions , y)
            epoch_f1 += f1  ; epoch_pr += pr  ; epoch_rec += recall  
        
    return epoch_loss / len(iterator), epoch_acc /len(iterator) , epoch_f1/len(iterator), epoch_pr/len(iterator), epoch_rec/len(iterator)

In [352]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

from torch import optim

In [353]:
best_valid_f1 = -float('inf')
loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(dev)
drp = 0.5
model = Model(drp);
model.apply(init_weights)
model = model.to(dev)
optimizer = optim.Adam(model.parameters() , lr = 0.005, weight_decay=0.001) #[a+'_pred' for a in aspects]
model = model.to(dev)

  This is separate from the ipykernel package so we can avoid doing imports until


In [354]:
N_EPOCHS = 7
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss , train_acc , train_f1 , train_precision , train_recall = train_model(model, train_dl, optimizer, loss_func)
    valid_loss , valid_acc , valid_f1 , valid_precision , valid_recall  = validate_model(model, valid_dl, loss_func)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print('train data' , train_acc , train_f1 , train_precision , train_recall)
        print('valid data' , valid_acc ,  valid_f1 , valid_precision , valid_recall)


        if os.path.isfile('utils/sentiment_classification_problem.pt'):
            os.remove('utils/sentiment_classification_problem.pt') ; print('chk')

        torch.save(model.state_dict(), 'utils/sentiment_classification_problem.pt')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


train data 0.6452609896659851 0.4449486465859487 0.4674156641290686 0.4475313698629599
valid data 0.8167613625526429 0.5541609755675811 0.5991496140341976 0.5494501898849725
chk
train data 0.7827953306528238 0.592721298328579 0.5949976690375594 0.6027124357614713
valid data 0.8244318127632141 0.5562836322510664 0.5578914222006788 0.5602720210328906
chk
train data 0.8270947795647842 0.6582880421863826 0.6795165289306773 0.6625301815217065
valid data 0.8338068127632141 0.5967592887699207 0.6742490696438066 0.5832234139842836
chk


In [414]:
test_preds = np.array([])
true_label = np.array([])
with torch.no_grad():
    for x ,y  in test_dl2:
        predictions = model(x)#.squeeze(1)
        max_preds = predictions.argmax(dim = 1, keepdim = True) 
        #rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = max_preds.data.cpu().numpy()
        test_preds =np.append(test_preds , preds)
       
        true_label = np.append( true_label ,y.data.cpu().numpy())

In [415]:
sp_aspects_ungrp2['polarities_pred']  = test_preds
sp_aspects_ungrp2.polarities_pred = sp_aspects_ungrp2.polarities_pred.astype(int)

du1  = sp_aspects_ungrp2.groupby('text').polarities.apply(lambda x: ' '.join(map (str , x))).reset_index()
du2  = sp_aspects_ungrp2.groupby('text').polarities_pred.apply(lambda x: ' '.join(map (str , x))).reset_index()

sp_sentiment = pd.merge(du1 , du2 , on = ['text'])
sp_sentiment['polarities'] =sp_sentiment['polarities'].apply(lambda x: x.split(' '))
sp_sentiment['polarities_pred'] =sp_sentiment['polarities_pred'].apply(lambda x: x.split(' '))

from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(sp_sentiment.polarities)
val_eng = mlb.transform(sp_sentiment.polarities_pred)

print("F1 score",f1_score( tr_eng , val_eng  , average='macro' ))
print("Precision score",precision_score(tr_eng , val_eng  , average='macro' ))
print("Recall score",recall_score(tr_eng , val_eng  , average='macro' ))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


F1 score 0.5663493727657031
Precision score 0.6413590155163188
Recall score 0.5686457428256274


In [419]:
test_preds = np.array([])
true_label = np.array([])
with torch.no_grad():
    for x ,y  in valid_dl:
        predictions = model(x)#.squeeze(1)
        max_preds = predictions.argmax(dim = 1, keepdim = True) 
        #rounded_preds = torch.round(torch.sigmoid(predictions))  #torch.round
        preds = max_preds.data.cpu().numpy()
        test_preds =np.append(test_preds , preds)
       
        true_label = np.append( true_label ,y.data.cpu().numpy())

In [420]:
test_preds.shape

(300,)

In [421]:
val_aspects_ungrp2['polarities_pred']  = test_preds
val_aspects_ungrp2.polarities_pred = val_aspects_ungrp2.polarities_pred.astype(int)

du1  = val_aspects_ungrp2.groupby('text').polarities.apply(lambda x: ' '.join(map (str , x))).reset_index()
du2  = val_aspects_ungrp2.groupby('text').polarities_pred.apply(lambda x: ' '.join(map (str , x))).reset_index()

sp_sentiment = pd.merge(du1 , du2 , on = ['text'])
sp_sentiment['polarities'] =sp_sentiment['polarities'].apply(lambda x: x.split(' '))
sp_sentiment['polarities_pred'] =sp_sentiment['polarities_pred'].apply(lambda x: x.split(' '))

from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()
tr_eng = mlb.fit_transform(sp_sentiment.polarities)
val_eng = mlb.transform(sp_sentiment.polarities_pred)

print("F1 score",f1_score( tr_eng , val_eng  , average='macro' ))
print("Precision score",precision_score(tr_eng , val_eng  , average='macro' ))
print("Recall score",recall_score(tr_eng , val_eng  , average='macro' ))

F1 score 0.6386131309173065
Precision score 0.6697743354369861
Recall score 0.6225255652500234


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [372]:
from sklearn.metrics import f1_score , confusion_matrix , accuracy_score , precision_score , recall_score , roc_auc_score
print("F1 score",f1_score(true_label  , test_preds , average='macro' ))
print("Precision score",precision_score( true_label  , test_preds  , average='macro' ))
print("Recall score",recall_score(true_label  , test_preds  , average='macro' ))
print("Accuracy score" ,accuracy_score(  true_label  , test_preds ))

F1 score 0.5598282394554758
Precision score 0.6457888894523612
Recall score 0.5673783752499005
Accuracy score 0.7731755424063116
